# python/scripts/deliver_dumpgrobid_to_s3.py

#!/usr/bin/env python3
"""
Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump
(from HBase) to AWS S3.

See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
this script for that specific use-case.

Script takes:
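- input TSV: two tab-separated columns, `sha1_hex` and `json` (a JSON object
  whose `tei_xml` key holds the TEI-XML; presumably sourced from the HBase
  `grobid0:tei_xml` column)
    => usually from dumpgrobid, with SHA-1 key transformed to hex (base32
       SHA-1 keys are also accepted and converted), and filtered down (e.g.,
       by a join on SHA-1) to a specific manifest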
- AWS S3 bucket and prefix

AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Output:
- errors/stats to stderr
- log to stdout (redirect to file), prefixed by sha1

Requires:
- raven (sentry)
- boto3 (AWS S3 client library)
"""

import argparse
import base64
import hashlib
import json
import os
import sys
from collections import Counter

import boto3
import raven

# Module-level Sentry client (yep, a global). It is created at import time so
# that it can be used as a decorator on main() below. Gets DSN from the
# `SENTRY_DSN` environment variable.
sentry_client = raven.Client()


def b32_hex(s):
    """Normalize a SHA-1 identifier string to lowercase form.

    Takes the first whitespace-delimited token of `s`, lowercases it, and drops
    a leading "sha1:" prefix if present. A 32-character remainder is treated as
    base32 and re-encoded as lowercase hex; a remainder of any other length is
    returned unchanged (still lowercased).
    """
    token = s.strip().split()[0].lower()
    prefix = "sha1:"
    if token.startswith(prefix):
        token = token[len(prefix):]
    if len(token) == 32:
        # base32 decoding requires the upper-case alphabet
        raw = base64.b32decode(token.upper())
        return base64.b16encode(raw).decode('utf-8').lower()
    return token


class DeliverDumpGrobidS3:
    """Upload TEI-XML documents from a TSV dump into an AWS S3 bucket.

    Each input line is expected to hold two tab-separated columns: a SHA-1 key
    and a JSON object. The value under the JSON key `tei_xml` is uploaded as
    one S3 object per line, under the key
    `<s3_prefix><first 4 chars of sha1>/<sha1><s3_suffix>`.
    """

    def __init__(self, s3_bucket, **kwargs):
        """Set up counters and the S3 bucket handle.

        Args:
            s3_bucket: name of the S3 bucket to upload into.
            **kwargs: optional settings; unrecognized keys are ignored.
                s3_prefix: key prefix for created objects (default 'grobid/').
                s3_suffix: key suffix for created objects (default '.tei.xml').
                s3_storage_class: S3 storage class (default 'STANDARD').
        """
        # Not used anywhere in this class; kept so the attribute still exists
        # for any external code that reads it.
        self.rstore = None
        self.count = Counter()
        self.s3_bucket = s3_bucket
        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(self.s3_bucket)

    def run(self, dump_file):
        """Process every line of `dump_file`, uploading non-empty TEI-XML.

        Writes one tab-separated log line per handled record to stdout
        (prefixed by the SHA-1), and the final counters to stderr. The
        counters are written even when processing aborts with an exception.

        Args:
            dump_file: iterable of text lines (e.g. an open file).

        Raises:
            ValueError: if a key is not 40 characters long even after the
                base32-to-hex conversion attempt, or if the JSON column
                cannot be parsed (json.JSONDecodeError is a ValueError).
        """
        sys.stderr.write("Starting...\n")
        try:
            for line in dump_file:
                line = line.strip().split('\t')
                if len(line) != 2:
                    self.count['skip-line'] += 1
                    continue
                sha1_hex, grobid_json = line[0], line[1]
                if len(sha1_hex) != 40:
                    sha1_hex = b32_hex(sha1_hex)
                # Explicit check rather than `assert`: asserts are stripped
                # under `python -O`, which would let a malformed key through
                # and upload the document under a bogus S3 key.
                if len(sha1_hex) != 40:
                    raise ValueError(
                        "expected a 40-character SHA-1 hex key, got: {!r}".format(line[0]))
                grobid = json.loads(grobid_json)
                tei_xml = grobid.get('tei_xml')
                if not tei_xml:
                    print("{}\tskip empty".format(sha1_hex))
                    self.count['skip-empty'] += 1
                    continue
                tei_xml = tei_xml.encode('utf-8')
                # upload to AWS S3
                obj = self.bucket.put_object(
                    Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
                    Body=tei_xml,
                    StorageClass=self.s3_storage_class,
                )
                print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
                self.count['success-s3'] += 1
        finally:
            # Always report how far we got, including when a line raised.
            sys.stderr.write("{}\n".format(self.count))


@sentry_client.capture_exceptions
def main():
    """Command-line entry point: parse arguments and run the S3 delivery.

    The dump file is an optional positional argument; when it is omitted the
    dump is read from stdin. Exceptions raised here pass through the
    `sentry_client.capture_exceptions` decorator.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--s3-bucket',
                        required=True,
                        type=str,
                        help='AWS S3 bucket to upload into')
    parser.add_argument('--s3-prefix',
                        type=str,
                        default="grobid/",
                        help='key prefix for items created in bucket')
    parser.add_argument('--s3-suffix',
                        type=str,
                        default=".tei.xml",
                        help='file suffix for created objects')
    parser.add_argument('--s3-storage-class',
                        type=str,
                        default="STANDARD",
                        help='AWS S3 storage class (redundancy) to use')
    # nargs='?' is what makes the positional optional; without it argparse
    # still requires the argument and the stdin default is never used.
    parser.add_argument('dump_file',
                        nargs='?',
                        help="TSV/JSON dump file (default: stdin)",
                        default=sys.stdin,
                        type=argparse.FileType('r'))
    args = parser.parse_args()

    # All parsed options are forwarded as keyword arguments; the extra
    # `dump_file` entry is absorbed by the worker's **kwargs and ignored.
    worker = DeliverDumpGrobidS3(**vars(args))
    worker.run(args.dump_file)


# Run the command-line entry point only when executed as a script, not when
# this module is imported.
if __name__ == '__main__':  # pragma: no cover
    main()