python/scripts/archiveorg_fileset.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#!/usr/bin/env python3
"""
Helper script to generate fatcat fileset entities (printed as JSON, one per line) from archive.org items.

Takes either two args (archive.org item name, then release ident), or a stream
of tab-separated such pairs (item name first) on stdin.

TODO:
- should this check the item type?
"""

import json
import sys
from typing import Any

import internetarchive

# Lookup table from the `format` value of an IA API file to the mimetype that
# parse_file() records in the manifest entry. A value of None means that no
# mimetype is recorded for that format.
FORMAT_TO_MIMETYPE = {
    # archives and compressed containers
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",

    # text and structured documents
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",

    # office document mimetypes, not mapped to any format key yet:
    #   application/msword                                                       (.doc)
    #   application/vnd.openxmlformats-officedocument.wordprocessingml.document  (.docx)
    #   application/vnd.ms-excel                                                 (.xls)
    #   application/vnd.openxmlformats-officedocument.spreadsheetml.sheet        (.xlsx)

    # audio and video
    "MP3": "audio/mpeg",  # .mp3
    "MP4": "video/mp4",  # .mp4
    "MPEG": "video/mpeg",  # .mpeg

    # images
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",

    # explicitly known-unknown: accepted, but no mimetype is recorded
    "Unknown": None,
}


def want_file(f: Any, item_name: str) -> bool:
    """
    Filters IA API files: returns True if the file should go in the manifest.

    `f` is read by attribute (`f.source`, `f.name`), so it is a file object
    rather than a plain dict. `item_name` is the archive.org item identifier
    the file belongs to.

    A file is rejected when any of the following holds:

    - its `source` is not 'original'
    - its name is the item name (as given, or lower-cased) followed by one of
      the per-item bookkeeping suffixes listed below
    - its name starts with an underscore
    - the item name starts with 'academictorrents_' and the file is one of
      the torrent/bibliography side files named after the item
    """
    if f.source != 'original':
        return False

    name = f.name

    # Per-item files named "<item><suffix>"; the item-name part may appear
    # either exactly as given or lower-cased.
    item_suffixes = (
        '_meta.sqlite',
        '_archive.torrent',
        '_itemimage.jpg',
        '_meta.xml',
        '_thumb.png',
        '_files.xml',
    )
    skipped_names = {
        base + suffix
        for base in (item_name, item_name.lower())
        for suffix in item_suffixes
    }
    if name in skipped_names:
        return False

    if name.startswith('_'):
        return False

    if item_name.startswith('academictorrents_'):
        # Only the exact-case item name is matched for these side files.
        academictorrents_suffixes = (
            '_academictorrents.torrent',
            '_academictorrents_torrent.txt',
            '.bib',
        )
        if name in {item_name + suffix for suffix in academictorrents_suffixes}:
            return False

    return True


def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file

    `f` is read by attribute (`name`, `size`, `sha1`, `md5`, `format`), so it
    is a file object rather than a plain dict.

    Returns a dict with 'path', 'size' (converted to int), 'sha1' and 'md5'
    keys, plus an 'extra' dict holding the 'mimetype' when the file format
    maps to one.

    Raises AssertionError if the name, sha1 or md5 is missing or empty, and
    KeyError if the file format is not a key of FORMAT_TO_MIMETYPE.
    """
    # One truthiness check rejects both None and empty values, so no separate
    # `is not None` check on the name is needed.
    assert f.name and f.sha1 and f.md5
    mf = {
        'path': f.name,
        'size': int(f.size),
        'sha1': f.sha1,
        'md5': f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        mf['extra'] = {'mimetype': mimetype}
    return mf


def item_to_fileset(item_name: str, release_id: str, session: 'internetarchive.ArchiveSession') -> dict:
    """
    Builds a fileset dict for one archive.org item and prints it as JSON.

    `item_name` is the archive.org item identifier. `release_id` is the
    release ident, optionally carrying a 'release_' prefix, which is
    stripped. `session` is used to fetch the item via `session.get_item()`.

    The item's files are filtered with want_file() and converted with
    parse_file() to form the manifest. The resulting dict (keys 'manifest',
    'urls', 'release_ids') is written to stdout as a single JSON line and
    also returned. A progress line goes to stderr.

    Raises AssertionError if the (prefix-stripped) release ident is not 26
    characters long, or if the item's mediatype is 'collection' or 'web'.
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
    # Strip exactly the prefix. 'release_' is 8 characters long; slicing by a
    # hard-coded 9 would also drop the first character of the ident and make
    # every prefixed ident fail the length check below.
    prefix = 'release_'
    if release_id.startswith(prefix):
        release_id = release_id[len(prefix):]
    assert len(release_id) == 26, f"expected a 26-character release ident, got: {release_id!r}"
    item = session.get_item(item_name)
    mediatype = item.metadata['mediatype']
    assert mediatype not in ['collection', 'web'], f"unsupported mediatype for item {item_name}: {mediatype}"
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        'manifest': manifest,
        'urls': [{
            'rel': 'archive',
            'url': f'https://archive.org/download/{item_name}/',
        }, ],
        'release_ids': [release_id],
        #extra={},
    }
    print(json.dumps(fileset))
    return fileset


def main():
    """
    Command-line entry point.

    With exactly two command-line arguments (item name, then release ident),
    processes that single pair. Otherwise reads stdin line by line, skipping
    blank lines, and processes each tab-separated "item name, release ident"
    pair. One internetarchive session is shared by all lookups.
    """
    session = internetarchive.get_session()

    if len(sys.argv) == 3:
        item_name, release_id = sys.argv[1], sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
        return

    for raw_line in sys.stdin:
        row = raw_line.strip()
        if not row:
            # blank line: nothing to process
            continue
        fields = row.split('\t')
        assert len(fields) == 2
        item_name, release_id = fields[0], fields[1]
        item_to_fileset(item_name, release_id=release_id, session=session)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()