From 624ee2e7ae97a6d5de82bec9587b6c89a0fc5048 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 19 Feb 2019 17:23:10 -0800
Subject: deliver python tweaks

---
 python/deliver_gwb_to_s3.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'python')

diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
index 729c0da..d76bedf 100755
--- a/python/deliver_gwb_to_s3.py
+++ b/python/deliver_gwb_to_s3.py
@@ -2,10 +2,12 @@
 """
 Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
 
-Script should take:
-- input TSV: `(sha1, datetime, terminal_url)` (can be exported from arabesque sqlite3 output)
-    => sha1_hex, warc_path, offset, c_size
-- GWB credentials
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1, file:cdx (json)`
+    => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
 - AWS S3 bucket and prefix
 
 AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
@@ -17,6 +19,8 @@ Output:
 - log to stdout (redirect to file), prefixed by sha1
 
 Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
 - wayback/GWB libraries
 """
 
@@ -43,7 +47,6 @@ class DeliverGwbS3():
 
     def __init__(self, s3_bucket, **kwargs):
         self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
-        self.mime_filter = ['application/pdf']
         self.rstore = None
         self.count = Counter()
         self.s3_bucket = s3_bucket
--
cgit v1.2.3
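
For readers unfamiliar with the new input format described in the updated docstring, the following is a minimal, hypothetical sketch of how a `sha1<TAB>cdx-json` row (as produced by a dumpfilemeta-style export) might be parsed and delivered to S3 with boto3. The CDX field names (`warc`, `offset`, `c_size`), the bucket/prefix values, and the key layout are illustrative assumptions only; they are not taken from the actual deliver_gwb_to_s3.py implementation, which fetches the blob bytes from GWB via the wayback libraries rather than using a placeholder.

#!/usr/bin/env python3
"""
Illustrative sketch only: parse `sha1<TAB>cdx-json` rows (dumpfilemeta-style)
and upload blobs to S3. CDX field names, bucket/prefix, and key layout are
assumptions for demonstration, not the behavior of deliver_gwb_to_s3.py.
"""

import json
import sys

import boto3  # reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the environment


def parse_line(line):
    """Split one TSV row into (sha1_hex, cdx_dict)."""
    sha1_hex, cdx_json = line.rstrip('\n').split('\t')[:2]
    return sha1_hex, json.loads(cdx_json)


def s3_key(prefix, sha1_hex):
    """Hypothetical key layout: shard by the first four hex characters of the SHA-1."""
    return "{}{}/{}.pdf".format(prefix, sha1_hex[:4], sha1_hex)


def deliver(s3, bucket, key, blob, dry_run=True):
    """Upload one blob; dry_run avoids touching a real bucket when trying the sketch."""
    if dry_run:
        print("would upload {} bytes to s3://{}/{}".format(len(blob), bucket, key))
        return
    s3.put_object(Bucket=bucket, Key=key, Body=blob)


def main():
    bucket = "example-bucket"  # placeholder, not a real bucket
    prefix = "pdf/"            # placeholder prefix
    s3 = boto3.client('s3')
    for line in sys.stdin:
        if not line.strip():
            continue
        sha1_hex, cdx = parse_line(line)
        # Hypothetical CDX fields; the real manifest may use different keys.
        warc_path = cdx.get('warc')
        offset = cdx.get('offset')
        c_size = cdx.get('c_size')
        # Stand-in for bytes that the real script fetches from GWB at
        # (warc_path, offset, c_size) via the wayback/GWB libraries.
        blob = b""
        print("{}\t{}\t{}\t{}".format(sha1_hex, warc_path, offset, c_size))
        deliver(s3, bucket, s3_key(prefix, sha1_hex), blob, dry_run=True)


if __name__ == '__main__':
    main()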