Diffstat (limited to 'python/scripts/deliver_dumpgrobid_to_s3.py')
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py | 94
1 file changed, 45 insertions(+), 49 deletions(-)
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,23 +19,20 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -45,81 +42,80 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
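
A note on the raven → sentry-sdk migration, since the replacement is not one-to-one: the removed comment on the old raven.Client() global pointed out that the DSN comes from the SENTRY_DSN environment variable, and the bare sentry_sdk.init() call keeps that behavior, because sentry-sdk also falls back to SENTRY_DSN when no DSN argument is passed. The @sentry_client.capture_exceptions decorator on main() has no direct equivalent and is simply dropped; sentry-sdk's default integrations report unhandled exceptions on their own:

    import sentry_sdk

    sentry_sdk.init()  # no arguments: DSN is read from the SENTRY_DSN environment variable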
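
For context on b32_hex(): dump keys may carry SHA-1 hashes as 32-character base32 strings rather than 40-character hex, and the function normalizes them to lowercase hex, passing anything else through unchanged. A minimal round-trip sketch; the import path assumes the script's own directory, and the "sha1:" prefix handling is the part of the function elided by the hunk above:

    import base64
    import hashlib

    from deliver_dumpgrobid_to_s3 import b32_hex  # assumes cwd is python/scripts/

    digest = hashlib.sha1(b"hello").digest()
    b32 = base64.b32encode(digest).decode()  # 32-character base32 form
    # round-trips to the familiar 40-character lowercase hex digest
    assert b32_hex("sha1:" + b32) == hashlib.sha1(b"hello").hexdigest()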
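
The S3 key layout in run() is prefix, then the first four hex characters of the SHA-1 as a shard directory, then the full SHA-1 plus suffix. A worked example with the default --s3-prefix and --s3-suffix values and a hypothetical input line:

    # one dump line: <sha1_hex> TAB <grobid JSON>
    line = 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d\t{"tei_xml": "<TEI>...</TEI>"}'
    sha1_hex, grobid_json = line.split("\t")
    key = "{}{}/{}{}".format("grobid/", sha1_hex[0:4], sha1_hex, ".tei.xml")
    assert key == "grobid/aaf4/aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d.tei.xml"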