about | summary | refs | log | tree | commit | diff | stats
path: root/python/scripts/archiveorg_fileset.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/archiveorg_fileset.py')
-rwxr-xr-x python/scripts/archiveorg_fileset.py 99
1 files changed, 50 insertions, 49 deletions
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 83c04e3..6328f52 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -16,32 +16,31 @@ from typing import Any
import internetarchive
FORMAT_TO_MIMETYPE = {
- 'BZIP': 'application/x-bzip',
- 'BZIP2': 'application/x-bzip2',
- 'ZIP': 'application/zip',
- 'GZIP': 'application/gzip',
- 'RAR': 'application/vnd.rar',
- 'TAR': 'application/x-tar',
- '7z': 'application/x-7z-compressed',
- 'HTML': 'text/html',
- 'Text': 'text/plain',
- 'PDF': 'application/pdf',
- 'CSV': 'text/csv',
- 'XML': 'application/xml',
- 'JSON': 'application/json',
-
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
#'application/msword (.doc)', # .doc
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
- 'MP3': 'audio/mpeg', # .mp3
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
- 'JPEG': 'image/jpeg',
- 'GIF': 'image/gif',
- 'PNG': 'image/png',
- 'TIFF': 'image/tiff',
- 'Unknown': None,
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
}
@@ -49,22 +48,22 @@ def want_file(f: dict, item_name: str) -> bool:
"""
Filters IA API files
"""
- if f.source != 'original':
+ if f.source != "original":
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
- if f.name.startswith('_'):
+ if f.name.startswith("_"):
return False
- if item_name.startswith('academictorrents_'):
- for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']:
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
if f.name == item_name + suffix:
return False
return True
@@ -77,36 +76,38 @@ def parse_file(f: dict) -> dict:
assert f.name and f.sha1 and f.md5
assert f.name is not None
mf = {
- 'path': f.name,
- 'size': int(f.size),
- 'sha1': f.sha1,
- 'md5': f.md5,
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
}
# TODO: will disable this hard check eventually and replace with:
- #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
mimetype = FORMAT_TO_MIMETYPE[f.format]
if mimetype:
- mf['extra'] = dict(mimetype=mimetype)
+ mf["extra"] = dict(mimetype=mimetype)
return mf
def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
- if release_id.startswith('release_'):
+ if release_id.startswith("release_"):
release_id = release_id[9:]
assert len(release_id) == 26
item = session.get_item(item_name)
- assert item.metadata['mediatype'] not in ['collection', 'web']
+ assert item.metadata["mediatype"] not in ["collection", "web"]
item_files = item.get_files(on_the_fly=False)
manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
fileset = {
- 'manifest': manifest,
- 'urls': [{
- 'rel': 'archive',
- 'url': f'https://archive.org/download/{item_name}/',
- }, ],
- 'release_ids': [release_id],
- #extra={},
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
}
print(json.dumps(fileset))
return fileset
@@ -123,12 +124,12 @@ def main():
line = line.strip()
if not line:
continue
- fields = line.split('\t')
+ fields = line.split("\t")
assert len(fields) == 2
item_name = fields[0]
release_id = fields[1]
item_to_fileset(item_name, release_id=release_id, session=session)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()