aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/cdx_collection.py
blob: e867b21dc9774b58f6f91fc3c00f334af437329e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
"""
Fetches and merges all CDX files for a collection.

Calls metadata API to enumerate all items/files, then fetches and concatenates
them all. Requires the 'internetarchive' library.

Call with a collection name:

    ./cdx_collection.py SOME_COLLECTION_NAME
"""

import os
import sys
import shutil
import tempfile
import requests
import subprocess
import internetarchive as ia

def _merge_cdx_files(src_dir, dest_path):
    """
    Decompress every '*.cdx.gz' file in src_dir and re-compress the
    concatenated stream into a single gzip file at dest_path.

    Returns True on success. Returns False if there were no CDX files to
    merge, or if either the zcat or the gzip process exited non-zero.
    """
    # Mirror the shell glob '*.cdx.gz': skip dotfiles, and sort so the merge
    # order is deterministic.
    cdx_paths = sorted(
        os.path.join(src_dir, name)
        for name in os.listdir(src_dir)
        if name.endswith('.cdx.gz') and not name.startswith('.'))
    if not cdx_paths:
        print("No CDX files were downloaded, nothing to merge",
              file=sys.stderr)
        return False

    # Equivalent of `zcat *.cdx.gz | gzip > dest_path`, but run without a
    # shell so that the exit status of *both* processes can be checked (a
    # shell pipeline only reports the status of its last command).
    # NOTE: 'pigz' could be substituted for 'gzip' for parallel compression.
    with open(dest_path, 'wb') as combined:
        with subprocess.Popen(['zcat'] + cdx_paths,
                              stdout=subprocess.PIPE) as zcat:
            gzip_status = subprocess.run(
                ['gzip'], stdin=zcat.stdout, stdout=combined).returncode

    # Leaving the Popen context closed the pipe and waited for zcat, so its
    # return code is available here.
    if zcat.returncode != 0 or gzip_status != 0:
        print("Merge failed (zcat exit status {}, gzip exit status {})".format(
            zcat.returncode, gzip_status), file=sys.stderr)
        return False
    return True


def run():
    """
    Command-line entry point: download and merge the CDX files of a collection.

    Reads the collection name from sys.argv[1], searches for items matching
    "collection:<name> mediatype:web", downloads '<identifier>.cdx.gz' for
    each item into a temporary directory, merges them, and writes the result
    to '<name>.cdx.gz' in the current working directory.

    Exits with status -1 (via sys.exit) if the arguments are wrong, the
    collection name contains unexpected characters, no items are found, or
    the merge step fails. Read timeouts on individual downloads are reported
    on stderr but do not abort the run.
    """
    if len(sys.argv) != 2:
        print("Expected a single argument (collection name)")
        sys.exit(-1)

    collection = sys.argv[1]

    # Check collection name is clean: only alphanumerics plus '_', '-', '.'.
    # This is an explicit check rather than an assert because asserts are
    # stripped under `python -O`, and the name ends up in both the search
    # query and the output filename.
    if not collection.replace('_', '').replace('-', '').replace('.', '').isalnum():
        print("Invalid collection name: {!r}".format(collection),
              file=sys.stderr)
        sys.exit(-1)

    print("Looking up collection: {}".format(collection))

    # First fetch list
    item_list = list(
        ia.search_items(
            query="collection:{} mediatype:web".format(collection)))

    if not item_list:
        print("No items found, bailing")
        sys.exit(-1)

    print("Found {} potential items".format(len(item_list)))
    errors = []

    # The context manager removes the scratch directory on every exit path,
    # including sys.exit() and unexpected exceptions.
    with tempfile.TemporaryDirectory() as tempdir:
        for item in item_list:
            identifier = item['identifier']
            # TODO: error handling beyond read timeouts. The return value of
            # ia.download() is not inspected here; confirm its semantics for
            # the installed internetarchive version before relying on it.
            try:
                ia.download(identifier, files=[identifier + '.cdx.gz'],
                    verbose=True,
                    destdir=tempdir,
                    no_directory=True,
                    retries=1000)
            except requests.exceptions.ReadTimeout as rt:
                print(str(rt), file=sys.stderr)
                errors.append(rt)

        if errors:
            print("## Download Errors", file=sys.stderr)
            for e in errors:
                print(e, file=sys.stderr)

        # Combine files
        print("Merging and re-compressing all CDX files...")
        combined_path = os.path.join(tempdir, 'combined.gz')
        if not _merge_cdx_files(tempdir, combined_path):
            # Do not publish an empty or truncated archive.
            print("Merging CDX files failed, bailing", file=sys.stderr)
            sys.exit(-1)

        # Move the result out before the temporary directory is removed
        shutil.move(combined_path, '{}.cdx.gz'.format(collection))

    print("Done!")

# Only execute when invoked as a script, not when imported as a module
if __name__ == '__main__':
    run()