diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-20 20:09:11 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-20 20:09:11 -0700 |
commit | 4166872ab53596cf7edb22cae8b283272c4a3758 (patch) | |
tree | 8474545ac8cd3fba8374a71f18bc7b25d10772b5 | |
parent | 3e67adab7d41db41d12930d66daf76cb2f72f61b (diff) | |
download | sandcrawler-4166872ab53596cf7edb22cae8b283272c4a3758.tar.gz sandcrawler-4166872ab53596cf7edb22cae8b283272c4a3758.zip |
unpaywall blobs.fatcat.wiki backfill notes
-rwxr-xr-x | postgrest/backfill/backfill_grobid_unpaywall.py | 59 |
1 files changed, 59 insertions, 0 deletions
#!/usr/bin/env python3
"""
This is a "one-time" transform helper script for GROBID backfill into
sandcrawler minio and postgresql.

This variant of backfill_grobid.py pushes into the unpaywall bucket of
sandcrawler-minio and doesn't push anything to sandcrawler table in general.

Input is read from stdin, one JSON document per line. Each non-empty document
must carry a 'pdf_hash' key (SHA-1 as hex or base32, optionally prefixed with
"sha1:") and a 'tei_xml' key (the TEI-XML text to upload).
"""

import base64
import collections
import io
import json
import os
import sys


MINIO_HOST = 'localhost:9000'
DEFAULT_BUCKET = 'unpaywall'
# Print a progress line every this many input lines.
PROGRESS_INTERVAL = 10000


def b32_hex(s):
    """
    Normalize a SHA-1 digest string to lowercase hex.

    Only the first whitespace-delimited token of `s` is considered. It is
    lowercased and a leading "sha1:" prefix, if present, is removed. A token
    of exactly 32 characters is treated as base32 and re-encoded as hex; any
    other token is returned unchanged (already lowercased).

    Raises IndexError if `s` is empty or whitespace-only, and binascii.Error
    if a 32-character token is not valid base32.
    """
    s = s.strip().split()[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')


def _grobid_key(sha1hex):
    """Build the object key: grobid/{first 2 hex}/{next 2 hex}/{sha1hex}.tei.xml"""
    return "grobid/{}/{}/{}.tei.xml".format(
        sha1hex[0:2],
        sha1hex[2:4],
        sha1hex)


def stdin_to_minio(lines=None, mc=None, bucket=DEFAULT_BUCKET):
    """
    Upload the TEI-XML of every JSON row in `lines` to a minio bucket.

    lines:  iterable of text lines, one JSON document each (default: sys.stdin)
    mc:     object with a minio-compatible put_object() method. When omitted,
            a Minio client for MINIO_HOST is created from the
            MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment variables
            (KeyError if either is unset).
    bucket: destination bucket name (default: "unpaywall")

    Blank lines and rows that decode to an empty/falsy JSON value are skipped.
    A row missing 'pdf_hash' or 'tei_xml' raises KeyError, and malformed JSON
    raises json.JSONDecodeError; neither is caught here, so the run stops at
    the offending line.

    Returns the collections.Counter of per-run statistics that is also
    printed as the progress and "Done" lines.
    """
    if lines is None:
        lines = sys.stdin
    if mc is None:
        # Imported lazily so the rest of this module (e.g. b32_hex) can be
        # used without the minio package installed.
        from minio import Minio
        mc = Minio(MINIO_HOST,
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
            secure=False)

    counts = collections.Counter({'raw_lines': 0, 'skipped': 0, 'minio-success': 0})
    for line in lines:
        counts['raw_lines'] += 1
        line = line.strip()
        # Only non-blank lines are parsed; JSON null / {} / [] also count as
        # "nothing to upload".
        row = json.loads(line) if line else None
        if row:
            sha1hex = b32_hex(row['pdf_hash'])
            grobid_xml = row['tei_xml'].encode('utf-8')
            # put_object wants a readable stream plus its exact byte length.
            mc.put_object(bucket, _grobid_key(sha1hex),
                io.BytesIO(grobid_xml), len(grobid_xml),
                content_type="application/tei+xml",
                metadata=None)
            counts['minio-success'] += 1
        else:
            counts['skipped'] += 1

        if counts['raw_lines'] % PROGRESS_INTERVAL == 0:
            print("Progress: {}...".format(counts))

    print("Done: {}".format(counts))
    return counts


if __name__ == '__main__':
    stdin_to_minio()