1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
#!/usr/bin/env python3
"""
Fetches and merges all CDX files for a collection.
Calls metadata API to enumerate all items/files, then fetches and concatenates
them all. Requires the 'internetarchive' library.
Call with a collection name:
./cdx_collection SOME_COLLECTION_NAME
"""
import glob
import gzip
import os
import shutil
import subprocess
import sys
import tempfile

import internetarchive as ia
import requests
def _merge_cdx_files(source_dir, combined_path):
    """Decompress every ``*.cdx.gz`` in *source_dir* into one gzip file.

    Inputs are processed in sorted filename order and their decompressed
    contents are written back-to-back into *combined_path*.

    Returns the number of input files merged. When there are none, nothing
    is written and 0 is returned.
    """
    cdx_paths = sorted(glob.glob(os.path.join(source_dir, '*.cdx.gz')))
    if not cdx_paths:
        return 0
    # Level 6 is the gzip command's default; Python's default of 9 is much
    # slower for little size benefit.
    with gzip.open(combined_path, 'wb', compresslevel=6) as combined:
        for cdx_path in cdx_paths:
            with gzip.open(cdx_path, 'rb') as cdx_file:
                shutil.copyfileobj(cdx_file, combined)
    return len(cdx_paths)


def run():
    """Download and merge the CDX files of the collection named in argv.

    Expects exactly one command-line argument, the collection name. Every
    item found by the search "collection:NAME mediatype:web" has its
    ``<identifier>.cdx.gz`` file downloaded into a temporary directory; the
    downloaded files are then decompressed, concatenated and re-compressed
    into ``NAME.cdx.gz`` in the current working directory.

    Exits with status -1 when the argument count is wrong, the collection
    name contains unexpected characters, the search returns no items, or no
    CDX file could be downloaded.
    """
    if len(sys.argv) != 2:
        print("Expected a single argument (collection name)")
        sys.exit(-1)
    collection = sys.argv[1]

    # The name ends up in a search query and in the output filename, so
    # reject anything unexpected. This is an explicit check rather than an
    # assert because asserts are stripped when Python runs with -O.
    if not collection.replace('_', '').replace('-', '').replace('.', '').isalnum():
        print("Collection name may only contain letters, digits, '_', '-' and '.'",
              file=sys.stderr)
        sys.exit(-1)

    print("Looking up collection: {}".format(collection))

    # First fetch list
    item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
    if not item_list:
        print("No items found, bailing")
        sys.exit(-1)

    print("Found {} potential items".format(len(item_list)))

    errors = []
    # The context manager removes the scratch directory on every exit path,
    # including the sys.exit() below.
    with tempfile.TemporaryDirectory() as tempdir:
        for item in item_list:
            identifier = item['identifier']
            # Only read timeouts are tolerated (recorded, then skipped);
            # any other exception aborts the run.
            try:
                ia.download(identifier,
                            files=[identifier + '.cdx.gz'],
                            verbose=True,
                            destdir=tempdir,
                            no_directory=True,
                            retries=1000)
            except requests.exceptions.ReadTimeout as rt:
                print(str(rt), file=sys.stderr)
                errors.append(rt)

        if errors:
            print("## Download Errors", file=sys.stderr)
            for e in errors:
                print(e, file=sys.stderr)

        # Combine files
        print("Merging and re-compressing all CDX files...")
        combined_path = os.path.join(tempdir, 'combined.gz')
        if _merge_cdx_files(tempdir, combined_path) == 0:
            print("No CDX files were downloaded, nothing to merge", file=sys.stderr)
            sys.exit(-1)

        # Build the result in the scratch directory and move it into place
        # last, so a failed merge never leaves a partial output file behind.
        shutil.move(combined_path, '{}.cdx.gz'.format(collection))

    print("Done!")
# Script entry point: do the work only when executed directly, not on import.
if __name__ == "__main__":
    run()
|