diff options
Diffstat (limited to 'python/scripts/deliver_dumpgrobid_to_s3.py')
-rwxr-xr-x | python/scripts/deliver_dumpgrobid_to_s3.py | 94 |
1 files changed, 45 insertions, 49 deletions
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..27ccf21 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -19,23 +19,20 @@ Output: - log to stdout (redirect to file), prefixed by sha1 Requires: -- raven (sentry) +- sentry-sdk - boto3 (AWS S3 client library) """ -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter import boto3 -import raven - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +import sentry_sdk def b32_hex(s): @@ -45,81 +42,80 @@ def b32_hex(s): s = s[5:] if len(s) != 32: return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -class DeliverDumpGrobidS3(): +class DeliverDumpGrobidS3: def __init__(self, s3_bucket, **kwargs): self.rstore = None self.count = Counter() self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') - self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') - self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') - self.s3 = boto3.resource('s3') + self.s3_prefix = kwargs.get("s3_prefix", "grobid/") + self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml") + self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD") + self.s3 = boto3.resource("s3") self.bucket = self.s3.Bucket(self.s3_bucket) def run(self, dump_file): sys.stderr.write("Starting...\n") for line in dump_file: - line = line.strip().split('\t') + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, grobid_json = line[0], line[1] if len(sha1_hex) != 40: sha1_hex = b32_hex(sha1_hex) assert len(sha1_hex) == 40 grobid = json.loads(grobid_json) - tei_xml = grobid.get('tei_xml') + tei_xml = grobid.get("tei_xml") if not tei_xml: print("{}\tskip empty".format(sha1_hex)) - self.count['skip-empty'] += 1 + self.count["skip-empty"] += 1 continue - tei_xml = tei_xml.encode('utf-8') + tei_xml = tei_xml.encode("utf-8") # upload to AWS S3 obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), Body=tei_xml, StorageClass=self.s3_storage_class, ) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) - self.count['success-s3'] += 1 + self.count["success-s3"] += 1 sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="grobid/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".tei.xml", - help='file suffix for created objects') - parser.add_argument('--s3-storage-class', - type=str, - default="STANDARD", - help='AWS S3 storage class (redundancy) to use') - parser.add_argument('dump_file', - help="TSV/JSON dump file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" + ) + parser.add_argument( + "--s3-prefix", + type=str, + default="grobid/", + help="key prefix for items created in bucket", + ) + parser.add_argument( + "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects" + ) + parser.add_argument( + "--s3-storage-class", + type=str, + default="STANDARD", + help="AWS S3 storage class (redundancy) to use", + ) + parser.add_argument( + "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r") + ) args = parser.parse_args() + sentry_sdk.init() + worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__": # pragma: no cover main() |