# Helper introduced in python/deliver_dumpgrobid_to_s3.py by commit 5b0aad1d
# ("deliver_dumpgrobid_to_s3: allow heritrix-style SHA-1 format").
def b32_hex(s):
    """Normalize a SHA-1 digest string to lowercase hexadecimal.

    Accepts the heritrix-style form: an optional ``sha1:`` prefix followed by
    a 32-character base32 digest, which is converted to its 40-character hex
    form. Only the first whitespace-separated token of *s* is considered, and
    matching is case-insensitive.

    The call site in ``DeliverDumpGrobidS3`` invokes this only when the key
    is not already 40 characters long, then asserts that the result is 40
    characters; anything this function cannot convert is therefore returned
    as-is so that the caller's length check rejects it.

    Args:
        s: Digest string, e.g. ``"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"``.

    Returns:
        The lowercase hex digest when the token is valid 32-character
        base32. Otherwise the lowercased token (with any ``sha1:`` prefix
        removed) is returned unchanged. Empty or whitespace-only input
        yields ``""``.
    """
    tokens = s.split()
    if not tokens:
        # Nothing to convert; return an empty string instead of raising
        # IndexError so the caller's length validation handles it.
        return ""
    digest = tokens[0].lower()
    if digest.startswith("sha1:"):
        digest = digest[5:]
    if len(digest) != 32:
        # Not the base32 length of a SHA-1 digest; pass through untouched.
        return digest
    try:
        # The base32 alphabet is upper-case, so undo the normalisation above.
        raw = base64.b32decode(digest.upper())
    except ValueError:
        # binascii.Error (a ValueError subclass) signals a 32-character
        # token that is not base32, such as an MD5 hex digest.
        return digest
    return raw.hex()