aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-04 18:10:33 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-04 18:10:33 -0800
commit 8f0ff9996b70e7b7e2e5c5de01933394857da966 (patch)
tree a315db5946c985fa4cebfb503777615b0170c6c6
parent de71aa92d4c7c9d14dfccc0188032d4e7b10090f (diff)
download sandcrawler-8f0ff9996b70e7b7e2e5c5de01933394857da966.tar.gz
sandcrawler-8f0ff9996b70e7b7e2e5c5de01933394857da966.zip
persist: fix worker API/typing hacks (raw_key, key, key_str)
-rw-r--r-- python/sandcrawler/persist.py 18
1 file changed, 9 insertions, 9 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 033bc91..fbd2bdb 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -532,24 +532,24 @@ class GenericPersistDocWorker(SandcrawlerWorker):
self.s3_folder = kwargs.get('s3_folder', "unknown")
self.doc_key = "unknown"
- def process(self, record: dict, raw_key: Optional[AnyStr] = None) -> None:
+ def process(self, record: dict, key: Optional[AnyStr] = None) -> None:
if record.get('status') != 'success' or not record.get(self.doc_key):
return
- assert raw_key is not None
- if isinstance(raw_key, bytes):
- key = raw_key.decode('utf-8')
- elif isinstance(raw_key, str):
- key = raw_key
- assert len(key) == 40
+ assert key is not None
+ if isinstance(key, bytes):
+ key_str = key.decode('utf-8')
+ elif isinstance(key, str):
+ key_str = key
+ assert len(key_str) == 40
if 'sha1hex' in record:
- assert key == record['sha1hex']
+ assert key_str == record['sha1hex']
resp = self.s3.put_blob(
folder=self.s3_folder,
blob=record[self.doc_key].encode('utf-8'),
- sha1hex=key,
+ sha1hex=key_str,
extension=self.s3_extension,
)
self.counts['s3-put'] += 1