From 8f964b9b48572ac71f27ba64207816dfd3a6dc36 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 22:40:57 -0800 Subject: small fixes from local testing for XML ingest --- python/sandcrawler/ingest.py | 2 +- python/sandcrawler/persist.py | 11 ++++++++--- python/sandcrawler_worker.py | 2 -- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 363485e..2e227bf 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -336,7 +336,7 @@ class IngestFileWorker(SandcrawlerWorker): ) def want(self, request: dict) -> bool: - if not request.get('ingest_type') in ('file', 'pdf'): + if not request.get('ingest_type') in ('file', 'pdf', 'xml'): return False return True diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 81cf664..c225d5a 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -20,7 +20,7 @@ grobid """ import os -from typing import Optional +from typing import Optional, AnyStr import xml.etree.ElementTree from sandcrawler.workers import SandcrawlerWorker @@ -518,12 +518,17 @@ class GenericPersistDocWorker(SandcrawlerWorker): self.s3_folder = kwargs.get('s3_folder', "unknown") self.doc_key = "unknown" - def process(self, record: dict, key: Optional[str] = None) -> None: + def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None: if record.get('status') != 'success' or not record.get(self.doc_key): return - assert key is not None and len(key) == 40 and isinstance(key, str) + assert raw_key is not None + if isinstance(raw_key, bytes): + key = raw_key.decode('utf-8') + elif isinstance(raw_key, str): + key = raw_key + assert len(key) == 40 if 'sha1hex' in record: assert key == record['sha1hex'] diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 24dbdd0..3681d7f 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -163,7 +163,6 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None: consume_topic=consume_topic, group="persist-xml-doc", push_batches=False, - raw_records=True, batch_size=25, ) pusher.run() @@ -182,7 +181,6 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None: consume_topic=consume_topic, group="persist-html-teixml", push_batches=False, - raw_records=True, batch_size=25, ) pusher.run() -- cgit v1.2.3