diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/cdx_collection.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/scripts/cdx_collection.py')
-rwxr-xr-x | python/scripts/cdx_collection.py | 24 |
1 files changed, 11 insertions, 13 deletions
```diff
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index 5e33def..aa78aec 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -35,9 +35,7 @@ def run():
     print("Looking up collection: {}".format(collection))
 
     # First fetch list
-    item_list = list(
-        ia.search_items(
-            query="collection:{} mediatype:web".format(collection)))
+    item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
 
     if len(item_list) == 0:
         print("No items found, bailing")
@@ -50,11 +48,12 @@ def run():
         item = item['identifier']
         # TODO: error handling
         try:
-            ret = ia.download(item, files=[item + '.cdx.gz'],
-                verbose=True,
-                destdir=tempdir,
-                no_directory=True,
-                retries=1000)
+            ret = ia.download(item,
+                              files=[item + '.cdx.gz'],
+                              verbose=True,
+                              destdir=tempdir,
+                              no_directory=True,
+                              retries=1000)
             status = ret and status
         except requests.exceptions.ReadTimeout as rt:
             print(str(rt), file=sys.stderr)
@@ -69,14 +68,13 @@ def run():
     # Combine files
     print("Merging and re-compressing all CDX files...")
     #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
-    subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
-                   shell=True)
+    subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True)
 
     # Move and cleanup
-    shutil.move('{}/combined.gz'.format(tempdir),
-        '{}.cdx.gz'.format(collection))
+    shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection))
 
     print("Done!")
 
-if __name__=='__main__':
+
+if __name__ == '__main__':
     run()
```