aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-05-04 13:00:52 -0700
committerBryan Newbold <bnewbold@archive.org>2021-05-04 13:00:52 -0700
commitaf50cb970644e968ed329f268181d507073b2789 (patch)
tree5a86826f43f10f1a2ffb51741379347f9b1d944b /python/scripts
parentda1fcb9e294e4598a21fc66b0b7e8f102c315dfb (diff)
downloadsandcrawler-af50cb970644e968ed329f268181d507073b2789.tar.gz
sandcrawler-af50cb970644e968ed329f268181d507073b2789.zip
add cdx_collection.py python script (from scratch repo)
Diffstat (limited to 'python/scripts')
-rwxr-xr-xpython/scripts/cdx_collection.py80
1 files changed, 80 insertions, 0 deletions
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..4539a49
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection.py SOME_COLLECTION_NAME
+"""
+
+import os
+import sys
+import shutil
+import tempfile
+import requests
+import subprocess
+import internetarchive as ia
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(
+ ia.search_items(
+ query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) is 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item['identifier']
+ # TODO: error handling
+ try:
+ ret = ia.download(item, files=[item + '.cdx.gz'],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000)
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
+ shell=True)
+
+ # Move and cleanup
+ shutil.move('{}/combined.gz'.format(tempdir),
+ '{}.cdx.gz'.format(collection))
+
+ print("Done!")
+
+if __name__=='__main__':
+ run()