about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2019-05-10 17:15:22 +0000
committer: Bryan Newbold <bnewbold@archive.org>, 2019-05-10 17:15:22 +0000
commit: 5b0aad1d8d100832c4a73dd006d8196aa995b4f0 (patch)
tree: 02818c4410c3dbfb9090315587fd59141e4df4ad
parent: 9d518593633fac490b47f67544787454dc69f1bf (diff)
download: sandcrawler-5b0aad1d8d100832c4a73dd006d8196aa995b4f0.tar.gz
download: sandcrawler-5b0aad1d8d100832c4a73dd006d8196aa995b4f0.zip
deliver_dumpgrobid_to_s3: allow heritrix-style SHA-1 format
-rwxr-xr-x python/deliver_dumpgrobid_to_s3.py | 12
1 files changed, 12 insertions, 0 deletions
diff --git a/python/deliver_dumpgrobid_to_s3.py b/python/deliver_dumpgrobid_to_s3.py
index 6a9aea6..ac0949c 100755
--- a/python/deliver_dumpgrobid_to_s3.py
+++ b/python/deliver_dumpgrobid_to_s3.py
@@ -38,6 +38,16 @@ import raven
sentry_client = raven.Client()
+def b32_hex(s):
+ """copy/pasta from elsewhere"""
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ return s
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
class DeliverDumpGrobidS3():
def __init__(self, s3_bucket, **kwargs):
@@ -58,6 +68,8 @@ class DeliverDumpGrobidS3():
self.count['skip-line'] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
+ if len(sha1_hex) != 40:
+ sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
tei_xml = grobid.get('tei_xml')