diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-05-10 17:15:22 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-05-10 17:15:22 +0000 |
commit | 5b0aad1d8d100832c4a73dd006d8196aa995b4f0 (patch) | |
tree | 02818c4410c3dbfb9090315587fd59141e4df4ad | |
parent | 9d518593633fac490b47f67544787454dc69f1bf (diff) | |
download | sandcrawler-5b0aad1d8d100832c4a73dd006d8196aa995b4f0.tar.gz sandcrawler-5b0aad1d8d100832c4a73dd006d8196aa995b4f0.zip |
deliver_dumpgrobid_to_s3: allow heritrix-style SHA-1 format
-rwxr-xr-x | python/deliver_dumpgrobid_to_s3.py | 12 |
1 files changed, 12 insertions, 0 deletions
```diff
diff --git a/python/deliver_dumpgrobid_to_s3.py b/python/deliver_dumpgrobid_to_s3.py
index 6a9aea6..ac0949c 100755
--- a/python/deliver_dumpgrobid_to_s3.py
+++ b/python/deliver_dumpgrobid_to_s3.py
@@ -38,6 +38,16 @@ import raven
 sentry_client = raven.Client()
 
 
+def b32_hex(s):
+    """copy/pasta from elsewhere"""
+    s = s.strip().split()[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        return s
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
 class DeliverDumpGrobidS3():
 
     def __init__(self, s3_bucket, **kwargs):
@@ -58,6 +68,8 @@ class DeliverDumpGrobidS3():
                 self.count['skip-line'] += 1
                 continue
             sha1_hex, grobid_json = line[0], line[1]
+            if len(sha1_hex) != 40:
+                sha1_hex = b32_hex(sha1_hex)
             assert len(sha1_hex) == 40
             grobid = json.loads(grobid_json)
             tei_xml = grobid.get('tei_xml')
```