author     Bryan Newbold <bnewbold@archive.org>    2019-02-19 17:23:10 -0800
committer  Bryan Newbold <bnewbold@archive.org>    2019-02-19 17:23:10 -0800
commit     624ee2e7ae97a6d5de82bec9587b6c89a0fc5048 (patch)
tree       fbccc3e911ab37ed2f5f01b8d2f4780daefff3f1 /python
parent     abdf6750dfcdcfb88f15f4bdb7a0e472087647eb (diff)
download   sandcrawler-624ee2e7ae97a6d5de82bec9587b6c89a0fc5048.tar.gz
           sandcrawler-624ee2e7ae97a6d5de82bec9587b6c89a0fc5048.zip
deliver python tweaks
Diffstat (limited to 'python')
-rwxr-xr-x  python/deliver_gwb_to_s3.py | 13
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
index 729c0da..d76bedf 100755
--- a/python/deliver_gwb_to_s3.py
+++ b/python/deliver_gwb_to_s3.py
@@ -2,10 +2,12 @@
"""
Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
-Script should take:
-- input TSV: `(sha1, datetime, terminal_url)` (can be exported from arabesque sqlite3 output)
- => sha1_hex, warc_path, offset, c_size
-- GWB credentials
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1, file:cdx (json)`
+ => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
- AWS S3 bucket and prefix
AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
@@ -17,6 +19,8 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -43,7 +47,6 @@ class DeliverGwbS3():
def __init__(self, s3_bucket, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
- self.mime_filter = ['application/pdf']
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
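
A rough usage sketch for the constructor above, which takes the bucket positionally and the WARC URI prefix as a keyword argument; the bucket name, prefix value, and run() entry point are assumptions for illustration, not taken from this patch.

    # Hypothetical sketch: instantiate the worker with an S3 bucket and a WARC
    # URI prefix, then feed it the TSV manifest on stdin.
    import sys

    worker = DeliverGwbS3(
        'unpaywall-delivery-bucket',                   # assumed s3_bucket value
        warc_uri_prefix='https://archive.org/serve/',  # assumed prefix value
    )
    worker.run(sys.stdin)  # assumed entry point; the real method name may differ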