author    | Bryan Newbold <bnewbold@archive.org> | 2021-05-04 13:00:52 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-04 13:00:52 -0700
commit    | af50cb970644e968ed329f268181d507073b2789 (patch)
tree      | 5a86826f43f10f1a2ffb51741379347f9b1d944b
parent    | da1fcb9e294e4598a21fc66b0b7e8f102c315dfb (diff)
download  | sandcrawler-af50cb970644e968ed329f268181d507073b2789.tar.gz
          | sandcrawler-af50cb970644e968ed329f268181d507073b2789.zip
add cdx_collection.py python script (from scratch repo)
-rwxr-xr-x | python/scripts/cdx_collection.py | 80
1 file changed, 80 insertions, 0 deletions
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..4539a49
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+    ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import sys
+import shutil
+import tempfile
+import requests
+import subprocess
+import internetarchive as ia
+
+def run():
+
+    if len(sys.argv) != 2:
+        print("Expected a single argument (collection name)")
+        sys.exit(-1)
+
+    collection = sys.argv[1]
+
+    # Check that the collection name is clean (alphanumeric plus '_', '-', '.')
+    assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+
+    tempdir = tempfile.mkdtemp()
+    print("Looking up collection: {}".format(collection))
+
+    # First fetch the list of items in the collection
+    item_list = list(
+        ia.search_items(
+            query="collection:{} mediatype:web".format(collection)))
+
+    if len(item_list) == 0:
+        print("No items found, bailing")
+        sys.exit(-1)
+
+    print("Found {} potential items".format(len(item_list)))
+    status = True
+    errors = []
+    for item in item_list:
+        item = item['identifier']
+        # TODO: error handling beyond read timeouts
+        try:
+            ret = ia.download(item, files=[item + '.cdx.gz'],
+                              verbose=True,
+                              destdir=tempdir,
+                              no_directory=True,
+                              retries=1000)
+            status = ret and status
+        except requests.exceptions.ReadTimeout as rt:
+            print(str(rt), file=sys.stderr)
+            errors.append(rt)
+            continue
+
+    if errors:
+        print("## Download Errors", file=sys.stderr)
+        for e in errors:
+            print(e, file=sys.stderr)
+
+    # Combine per-item CDX files into a single compressed file
+    print("Merging and re-compressing all CDX files...")
+    #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+    subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
+                   shell=True)
+
+    # Move result to the current directory and clean up
+    shutil.move('{}/combined.gz'.format(tempdir),
+                '{}.cdx.gz'.format(collection))
+
+    print("Done!")
+
+if __name__ == '__main__':
+    run()
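For reference, the script's only non-standard dependency is the 'internetarchive' library, which it uses for both item enumeration and per-file download. A minimal sketch of that same enumerate-then-fetch pattern, with a hypothetical collection name EXAMPLE_COLLECTION standing in for a real one:

    import internetarchive as ia

    # Enumerate web items in a collection (EXAMPLE_COLLECTION is hypothetical)
    for result in ia.search_items(query="collection:EXAMPLE_COLLECTION mediatype:web"):
        identifier = result['identifier']
        # Fetch only the per-item CDX index, flattened into the current directory
        ia.download(identifier, files=[identifier + '.cdx.gz'],
                    destdir='.', no_directory=True, verbose=True)

The full script above wraps this pattern with retries, read-timeout handling, and a final zcat-and-recompress step that merges the per-item CDX files into a single {collection}.cdx.gz.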