diff options
-rw-r--r-- | python/sandcrawler/ingest.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 11 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 2 |
3 files changed, 9 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 363485e..2e227bf 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -336,7 +336,7 @@ class IngestFileWorker(SandcrawlerWorker): ) def want(self, request: dict) -> bool: - if not request.get('ingest_type') in ('file', 'pdf'): + if not request.get('ingest_type') in ('file', 'pdf', 'xml'): return False return True diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 81cf664..c225d5a 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -20,7 +20,7 @@ grobid """ import os -from typing import Optional +from typing import Optional, AnyStr import xml.etree.ElementTree from sandcrawler.workers import SandcrawlerWorker @@ -518,12 +518,17 @@ class GenericPersistDocWorker(SandcrawlerWorker): self.s3_folder = kwargs.get('s3_folder', "unknown") self.doc_key = "unknown" - def process(self, record: dict, key: Optional[str] = None) -> None: + def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None: if record.get('status') != 'success' or not record.get(self.doc_key): return - assert key is not None and len(key) == 40 and isinstance(key, str) + assert raw_key is not None + if isinstance(raw_key, bytes): + key = raw_key.decode('utf-8') + elif isinstance(raw_key, str): + key = raw_key + assert len(key) == 40 if 'sha1hex' in record: assert key == record['sha1hex'] diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 24dbdd0..3681d7f 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -163,7 +163,6 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None: consume_topic=consume_topic, group="persist-xml-doc", push_batches=False, - raw_records=True, batch_size=25, ) pusher.run() @@ -182,7 +181,6 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None: consume_topic=consume_topic, group="persist-html-teixml", push_batches=False, - raw_records=True, batch_size=25, ) pusher.run() |