aboutsummaryrefslogtreecommitdiffstats
path: root/postgrest/backfill/backfill_grobid_unpaywall.py
blob: 58e9e3cf97ab9e986bf3fc5de8a619f0c98c1e7e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
"""
This is a "one-time" transform helper script for GROBID backfill into
sandcrawler minio and postgresql.

This variant of backfill_grobid.py pushes into the unpaywall bucket of
sandcrawler-minio and doesn't push anything to sandcrawler table in general.
"""

import json, os, sys, collections, io
import base64
import requests
from minio import Minio
import psycopg2
import psycopg2.extras


def b32_hex(s):
    """
    Normalize a hash string to lowercase hexadecimal.

    Only the first whitespace-delimited token of `s` is considered. It is
    lowercased and a leading "sha1:" prefix is dropped. If what remains is
    exactly 32 characters long it is treated as base32 and re-encoded as
    lowercase hex; any other length is returned as-is (already lowercased).
    """
    token = s.strip().split()[0].lower()
    prefix = "sha1:"
    if token.startswith(prefix):
        token = token[len(prefix):]
    if len(token) == 32:
        # base32 alphabet is upper-case, so undo the lowercasing before decoding
        raw_digest = base64.b32decode(token.upper())
        return raw_digest.hex()
    return token

def stdin_to_minio():
    """
    Upload GROBID TEI-XML documents read from stdin into minio.

    Each non-blank stdin line is parsed as JSON; rows that parse to a falsy
    value are skipped. For every remaining row, the 'pdf_hash' field is
    normalized with b32_hex() and the 'tei_xml' field is UTF-8 encoded and
    stored in the "unpaywall" bucket under
    "grobid/<first 2 chars>/<next 2 chars>/<hash>.tei.xml".

    The minio client connects to localhost:9000 without TLS, taking its
    credentials from the MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment
    variables. Counters are printed every 10000 raw lines and once more when
    stdin is exhausted.
    """
    client = Minio('localhost:9000',
        access_key=os.environ['MINIO_ACCESS_KEY'],
        secret_key=os.environ['MINIO_SECRET_KEY'],
        secure=False)
    counts = collections.Counter({'total': 0})

    for raw_line in sys.stdin:
        # Report progress before counting the current line, so the message
        # reflects the lines already handled.
        lines_seen = counts['raw_lines']
        if lines_seen > 0 and lines_seen % 10000 == 0:
            print("Progress: {}...".format(counts))
        counts['raw_lines'] += 1

        stripped = raw_line.strip()
        row = json.loads(stripped) if stripped else None
        if not row:
            continue

        sha1hex = b32_hex(row['pdf_hash'])
        tei_bytes = row['tei_xml'].encode('utf-8')

        # Shard objects by the first two byte-pairs of the hash.
        object_key = "grobid/{}/{}/{}.tei.xml".format(
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex)
        client.put_object(
            "unpaywall",
            object_key,
            io.BytesIO(tei_bytes),
            len(tei_bytes),
            content_type="application/tei+xml",
            metadata=None)
        counts['minio-success'] += 1

    print("Done: {}".format(counts))

# Script entry point: when run directly, stream JSON rows from stdin into minio.
if __name__=='__main__':
    stdin_to_minio()