#!/usr/bin/env python3 """ Fetches and merges all CDX files for a collection. Calls metadata API to enumerate all items/files, then fetches and concatanates them all. Requires the 'internetarchive' library. Call with a collection name: ./cdx_collection SOME_COLLECTION_NAME """ import os import shutil import subprocess import sys import tempfile import internetarchive as ia import requests def run(): if len(sys.argv) != 2: print("Expected a single argument (collection name)") sys.exit(-1) collection = sys.argv[1] # Check collection name is clean assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum() tempdir = tempfile.mkdtemp() print("Looking up collection: {}".format(collection)) # First fetch list item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) if len(item_list) == 0: print("No items found, bailing") sys.exit(-1) print("Found {} potential items".format(len(item_list))) status = True errors = [] for item in item_list: item = item["identifier"] # TODO: error handling try: ret = ia.download( item, files=[item + ".cdx.gz"], verbose=True, destdir=tempdir, no_directory=True, retries=1000, ) status = ret and status except requests.exceptions.ReadTimeout as rt: print(str(rt), file=sys.stderr) errors.append(rt) continue if errors: print("## Download Errors", file=sys.stderr) for e in errors: print(e, file=sys.stderr) # Combine files print("Merging and re-compressing all CDX files...") # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True) # Move and cleanup shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection)) print("Done!") if __name__ == "__main__": run()