diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 22:40:57 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 22:42:05 -0800 |
commit | 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (patch) | |
tree | 68a0146e71a0eef4e43f7b7d686b806eb210cda5 | |
parent | 653fac9632c6ae9dd036ad844454cf419cd5320b (diff) | |
download | sandcrawler-8f964b9b48572ac71f27ba64207816dfd3a6dc36.tar.gz sandcrawler-8f964b9b48572ac71f27ba64207816dfd3a6dc36.zip |
small fixes from local testing for XML ingest
-rw-r--r-- | python/sandcrawler/ingest.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 11 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 2 |
3 files changed, 9 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 363485e..2e227bf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -336,7 +336,7 @@ class IngestFileWorker(SandcrawlerWorker):
         )
 
     def want(self, request: dict) -> bool:
-        if not request.get('ingest_type') in ('file', 'pdf'):
+        if not request.get('ingest_type') in ('file', 'pdf', 'xml'):
             return False
         return True
 
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 81cf664..c225d5a 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,7 +20,7 @@ grobid
 """
 
 import os
-from typing import Optional
+from typing import Optional, AnyStr
 import xml.etree.ElementTree
 
 from sandcrawler.workers import SandcrawlerWorker
@@ -518,12 +518,17 @@ class GenericPersistDocWorker(SandcrawlerWorker):
         self.s3_folder = kwargs.get('s3_folder', "unknown")
         self.doc_key = "unknown"
 
-    def process(self, record: dict, key: Optional[str] = None) -> None:
+    def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None:
 
         if record.get('status') != 'success' or not record.get(self.doc_key):
             return
 
-        assert key is not None and len(key) == 40 and isinstance(key, str)
+        assert raw_key is not None
+        if isinstance(raw_key, bytes):
+            key = raw_key.decode('utf-8')
+        elif isinstance(raw_key, str):
+            key = raw_key
+        assert len(key) == 40
         if 'sha1hex' in record:
             assert key == record['sha1hex']
 
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 24dbdd0..3681d7f 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -163,7 +163,6 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None:
         consume_topic=consume_topic,
         group="persist-xml-doc",
         push_batches=False,
-        raw_records=True,
         batch_size=25,
     )
     pusher.run()
@@ -182,7 +181,6 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None:
         consume_topic=consume_topic,
         group="persist-html-teixml",
         push_batches=False,
-        raw_records=True,
         batch_size=25,
     )
     pusher.run()