From 8f964b9b48572ac71f27ba64207816dfd3a6dc36 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 22:40:57 -0800 Subject: small fixes from local testing for XML ingest --- python/sandcrawler/persist.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/persist.py') diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 81cf664..c225d5a 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -20,7 +20,7 @@ grobid """ import os -from typing import Optional +from typing import Optional, AnyStr import xml.etree.ElementTree from sandcrawler.workers import SandcrawlerWorker @@ -518,12 +518,17 @@ class GenericPersistDocWorker(SandcrawlerWorker): self.s3_folder = kwargs.get('s3_folder', "unknown") self.doc_key = "unknown" - def process(self, record: dict, key: Optional[str] = None) -> None: + def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None: if record.get('status') != 'success' or not record.get(self.doc_key): return - assert key is not None and len(key) == 40 and isinstance(key, str) + assert raw_key is not None + if isinstance(raw_key, bytes): + key = raw_key.decode('utf-8') + elif isinstance(raw_key, str): + key = raw_key + assert len(key) == 40 if 'sha1hex' in record: assert key == record['sha1hex'] -- cgit v1.2.3