1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
|
#!/usr/bin/env python3
"""
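Helper script to build fatcat fileset JSON from archive.org items.

Takes either two args (archive.org item, then release ident), or a stream of
tab-separated such pairs on stdin. Each fileset is printed to stdout as one
line of JSON.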
TODO:
- should this check the item type?
"""
import json
import sys
from typing import Any
import internetarchive
# Lookup table from the `format` value of an IA API file to the mimetype
# recorded in the manifest. A value of None means no mimetype is recorded
# for that format.
FORMAT_TO_MIMETYPE = {
    # archives and compressed files
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",
    # documents and structured text
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",
    # Office mimetypes that have no entry yet:
    #   application/msword (.doc)
    #   application/vnd.openxmlformats-officedocument.wordprocessingml.document (.docx)
    #   application/vnd.ms-excel (.xls)
    #   application/vnd.openxmlformats-officedocument.spreadsheetml.sheet (.xlsx)
    # audio and video
    "MP3": "audio/mpeg",  # .mp3
    "MP4": "video/mp4",  # .mp4
    "MPEG": "video/mpeg",  # .mpeg
    # images
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",
    # explicit "no mimetype" entry
    "Unknown": None,
}
def want_file(f: Any, item_name: str) -> bool:
    """
    Decide whether an IA API file should be included in the fileset manifest.

    A file is rejected when any of the following holds:

    - its ``source`` is not "original"
    - its name is the item name (as given, or lower-cased) followed by one of
      a fixed list of per-item suffixes such as "_meta.xml" or "_files.xml"
    - its name starts with an underscore
    - the item name starts with "academictorrents_" and the file name is the
      item name followed by one of the academictorrents-specific suffixes

    Args:
        f: IA API file object. It is accessed by attribute, not by key; only
            ``source`` and ``name`` are read.
        item_name: identifier of the item the file belongs to.

    Returns:
        True if the file should be kept, False otherwise.
    """
    if f.source != "original":
        return False

    # Files named after the item itself. The lower-cased item name is matched
    # as well as the name exactly as given.
    for suffix in (
        "_meta.sqlite",
        "_archive.torrent",
        "_itemimage.jpg",
        "_meta.xml",
        "_thumb.png",
        "_files.xml",
    ):
        if f.name in (item_name + suffix, item_name.lower() + suffix):
            return False

    if f.name.startswith("_"):
        return False

    # Extra files to skip for items whose name starts with "academictorrents_".
    # Only the item name exactly as given is matched here.
    if item_name.startswith("academictorrents_"):
        for suffix in ("_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"):
            if f.name == item_name + suffix:
                return False

    return True
def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it into a fatcat fileset manifest file.

    Args:
        f: IA API file object. It is accessed by attribute, not by key; the
            ``name``, ``size``, ``sha1``, ``md5`` and ``format`` attributes
            are read.

    Returns:
        A dict with the keys "path", "size" (converted to int), "sha1" and
        "md5". When the file's format maps to a non-empty mimetype, an
        "extra" key holding ``{"mimetype": ...}`` is added.

    Raises:
        AssertionError: if name, sha1 or md5 is missing or empty.
        KeyError: if the file's format has no entry in FORMAT_TO_MIMETYPE.
    """
    assert f.name and f.sha1 and f.md5, f"IA file is missing name, sha1 or md5: {f.name!r}"
    mf = {
        "path": f.name,
        "size": int(f.size),
        "sha1": f.sha1,
        "md5": f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #   mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    # Until then an unmapped format deliberately fails with KeyError.
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        mf["extra"] = {"mimetype": mimetype}
    return mf
def item_to_fileset(
    item_name: str, release_id: str, session: "internetarchive.ArchiveSession"
) -> dict:
    """
    Build a fatcat fileset for one archive.org item and print it as JSON.

    A progress line is written to stderr. The item is fetched through
    ``session``, and its files are filtered with ``want_file`` and converted
    with ``parse_file``. The resulting fileset is printed to stdout as a
    single JSON line and also returned.

    Args:
        item_name: archive.org item identifier.
        release_id: release ident, with or without a leading "release_"
            prefix. It must be 26 characters long once the prefix is removed.
        session: internetarchive session used to fetch the item.

    Returns:
        The fileset dict, with the keys "manifest", "urls" and "release_ids".

    Raises:
        AssertionError: if the release ident is not 26 characters long, or
            if the item's mediatype is "collection" or "web".
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)

    # Strip exactly the prefix. A hard-coded slice of 9 would also remove the
    # first character of the ident, because "release_" is 8 characters long.
    prefix = "release_"
    if release_id.startswith(prefix):
        release_id = release_id[len(prefix):]
    assert len(release_id) == 26, f"expected a 26-character release ident, got {release_id!r}"

    item = session.get_item(item_name)
    mediatype = item.metadata["mediatype"]
    assert mediatype not in ["collection", "web"], f"unsupported mediatype {mediatype!r}"

    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        "manifest": manifest,
        "urls": [
            {
                "rel": "archive",
                "url": f"https://archive.org/download/{item_name}/",
            },
        ],
        "release_ids": [release_id],
    }
    print(json.dumps(fileset))
    return fileset
def main():
    """
    Command-line entry point.

    With exactly two command-line arguments (item name, then release ident)
    a single item is processed. Otherwise tab-separated item/release pairs
    are read from stdin, one per line; blank lines are skipped.
    """
    ia_session = internetarchive.get_session()

    # Single-item mode: both values come from the command line.
    if len(sys.argv) == 3:
        _, item_name, release_id = sys.argv
        item_to_fileset(item_name, release_id=release_id, session=ia_session)
        return

    # Batch mode: one "item<TAB>release" pair per stdin line.
    for raw_line in sys.stdin:
        stripped = raw_line.strip()
        if not stripped:
            continue
        columns = stripped.split("\t")
        assert len(columns) == 2
        item_name, release_id = columns[0], columns[1]
        item_to_fileset(item_name, release_id=release_id, session=ia_session)


if __name__ == "__main__":
    main()
|