author    | Bryan Newbold <bnewbold@archive.org> | 2021-05-04 13:00:52 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-04 13:00:52 -0700
commit    | af50cb970644e968ed329f268181d507073b2789 (patch)
tree      | 5a86826f43f10f1a2ffb51741379347f9b1d944b
parent    | da1fcb9e294e4598a21fc66b0b7e8f102c315dfb (diff)
download  | sandcrawler-af50cb970644e968ed329f268181d507073b2789.tar.gz
          | sandcrawler-af50cb970644e968ed329f268181d507073b2789.zip
add cdx_collection.py python script (from scratch repo)
-rwxr-xr-x | python/scripts/cdx_collection.py | 80
1 file changed, 80 insertions, 0 deletions
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..4539a49
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+    ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import sys
+import shutil
+import tempfile
+import requests
+import subprocess
+import internetarchive as ia
+
+def run():
+
+    if len(sys.argv) != 2:
+        print("Expected a single argument (collection name)")
+        sys.exit(-1)
+
+    collection = sys.argv[1]
+
+    # Check that the collection name is clean (alphanumeric plus '_', '-', '.')
+    assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+
+    tempdir = tempfile.mkdtemp()
+    print("Looking up collection: {}".format(collection))
+
+    # First fetch the list of items in the collection
+    item_list = list(
+        ia.search_items(
+            query="collection:{} mediatype:web".format(collection)))
+
+    if len(item_list) == 0:
+        print("No items found, bailing")
+        sys.exit(-1)
+
+    print("Found {} potential items".format(len(item_list)))
+    status = True
+    errors = []
+    for item in item_list:
+        item = item['identifier']
+        # TODO: error handling beyond read timeouts
+        try:
+            ret = ia.download(item, files=[item + '.cdx.gz'],
+                              verbose=True,
+                              destdir=tempdir,
+                              no_directory=True,
+                              retries=1000)
+            status = ret and status
+        except requests.exceptions.ReadTimeout as rt:
+            print(str(rt), file=sys.stderr)
+            errors.append(rt)
+            continue
+
+    if errors:
+        print("## Download Errors", file=sys.stderr)
+        for e in errors:
+            print(e, file=sys.stderr)
+
+    # Combine per-item CDX files into a single compressed file
+    print("Merging and re-compressing all CDX files...")
+    #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+    subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
+                   shell=True)
+
+    # Move result to the current directory and clean up
+    shutil.move('{}/combined.gz'.format(tempdir),
+                '{}.cdx.gz'.format(collection))
+
+    print("Done!")
+
+if __name__ == '__main__':
+    run()
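For reference, the script's only non-standard dependency is the 'internetarchive' library, which it uses for both item enumeration and per-file download. A minimal sketch of that same enumerate-then-fetch pattern, with a hypothetical collection name EXAMPLE_COLLECTION standing in for a real one:

    import internetarchive as ia

    # Enumerate web items in a collection (EXAMPLE_COLLECTION is hypothetical)
    for result in ia.search_items(query="collection:EXAMPLE_COLLECTION mediatype:web"):
        identifier = result['identifier']
        # Fetch only the per-item CDX index, flattened into the current directory
        ia.download(identifier, files=[identifier + '.cdx.gz'],
                    destdir='.', no_directory=True, verbose=True)

The full script above wraps this pattern with retries, read-timeout handling, and a final zcat-and-recompress step that merges the per-item CDX files into a single {collection}.cdx.gz.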