aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/cdx_collection.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/cdx_collection.py')
-rwxr-xr-xpython/scripts/cdx_collection.py82
1 files changed, 82 insertions, 0 deletions
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..0b60da3
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatanates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import internetarchive as ia
+import requests
+
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item["identifier"]
+ # TODO: error handling
+ try:
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
+
+ # Move and cleanup
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ run()