aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-02-20 15:38:20 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-02-20 15:38:20 -0800
commit e402599373abef442f034c878ef586764ab88726 (patch)
tree e47623facdaed4354b9c830ec7aa836062a14cf0
parent 5b3239bcf6ae38004566d8de8f8859fe155d7a49 (diff)
download sandcrawler-e402599373abef442f034c878ef586764ab88726.tar.gz
sandcrawler-e402599373abef442f034c878ef586764ab88726.zip
include file size in S3 uploads
-rwxr-xr-x python/deliver_gwb_to_s3.py 6
1 files changed, 3 insertions, 3 deletions
diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
index 98131e1..8a52382 100755
--- a/python/deliver_gwb_to_s3.py
+++ b/python/deliver_gwb_to_s3.py
@@ -6,13 +6,13 @@ See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
this script for that specific use-case.
Script takes:
-- input TSV: `sha1, file:cdx (json)`
+- input TSV: `sha1_hex, file:cdx (json)`
=> usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
- AWS S3 bucket and prefix
AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-GWB credentials end up under /opt/.petabox/ (!)
+GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/.
20x threads on a single machine can process about 340k files in 3 hours; that's
roughly 6 hours per million per host with 32 threads, or 5k files an hour
@@ -145,7 +145,7 @@ class DeliverGwbS3():
sha1_hex,
self.s3_suffix),
Body=blob)
- print("{}\tsuccess\t{}".format(sha1_hex, obj.key))
+ print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))