aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-11-03 22:40:57 -0800
committer: Bryan Newbold <bnewbold@archive.org>, 2020-11-03 22:42:05 -0800
commit: 8f964b9b48572ac71f27ba64207816dfd3a6dc36 (patch)
tree: 68a0146e71a0eef4e43f7b7d686b806eb210cda5 /python/sandcrawler/persist.py
parent: 653fac9632c6ae9dd036ad844454cf419cd5320b (diff)
download: sandcrawler-8f964b9b48572ac71f27ba64207816dfd3a6dc36.tar.gz
download: sandcrawler-8f964b9b48572ac71f27ba64207816dfd3a6dc36.zip
small fixes from local testing for XML ingest
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 11
1 files changed, 8 insertions, 3 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 81cf664..c225d5a 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,7 +20,7 @@ grobid
"""
import os
-from typing import Optional
+from typing import Optional, AnyStr
import xml.etree.ElementTree
from sandcrawler.workers import SandcrawlerWorker
@@ -518,12 +518,17 @@ class GenericPersistDocWorker(SandcrawlerWorker):
self.s3_folder = kwargs.get('s3_folder', "unknown")
self.doc_key = "unknown"
- def process(self, record: dict, key: Optional[str] = None) -> None:
+ def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None:
if record.get('status') != 'success' or not record.get(self.doc_key):
return
- assert key is not None and len(key) == 40 and isinstance(key, str)
+ assert raw_key is not None
+ if isinstance(raw_key, bytes):
+ key = raw_key.decode('utf-8')
+ elif isinstance(raw_key, str):
+ key = raw_key
+ assert len(key) == 40
if 'sha1hex' in record:
assert key == record['sha1hex']