diff options
Diffstat (limited to 'python/scripts/archiveorg_fileset.py')
-rwxr-xr-x | python/scripts/archiveorg_fileset.py | 135 |
1 files changed, 135 insertions, 0 deletions
#!/usr/bin/env python3
"""
Helper script to generate fatcat fileset JSON from archive.org items.

Takes either two args (archive.org item and release ident, in that order), or
a stream of tab-separated such pairs on stdin. One JSON fileset object is
printed to stdout for every pair; progress messages go to stderr.

TODO:
- should this check the item type?
"""

import json
import sys
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only needed for annotations here; the runtime import happens in main()
    # so the pure helpers below stay importable without the network client.
    import internetarchive

# Maps the "format" value reported for an archive.org file to a mimetype.
# A value of None means "known format, but no mimetype to record".
FORMAT_TO_MIMETYPE = {
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",
    # Not mapped yet:
    #   application/msword  (.doc)
    #   application/vnd.openxmlformats-officedocument.wordprocessingml.document  (.docx)
    #   application/vnd.ms-excel  (.xls)
    #   application/vnd.openxmlformats-officedocument.spreadsheetml.sheet  (.xlsx)
    "MP3": "audio/mpeg",  # .mp3
    "MP4": "video/mp4",  # .mp4
    "MPEG": "video/mpeg",  # .mpeg
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",
    "Unknown": None,
}

# Files named "<item_name><suffix>" (or with the lower-cased item name) are
# never included in a manifest.
_ITEM_SUFFIXES_TO_SKIP = (
    "_meta.sqlite",
    "_archive.torrent",
    "_itemimage.jpg",
    "_meta.xml",
    "_thumb.png",
    "_files.xml",
)

# Extra "<item_name><suffix>" files skipped for "academictorrents_*" items.
_ACADEMICTORRENTS_SUFFIXES_TO_SKIP = (
    "_academictorrents.torrent",
    "_academictorrents_torrent.txt",
    ".bib",
)

_RELEASE_PREFIX = "release_"


def want_file(f: Any, item_name: str) -> bool:
    """
    Filters IA API files

    `f` is a file object exposing `.source` and `.name` attributes (not a
    dict). Returns True only for "original" files that are neither one of
    the per-item files listed above nor prefixed with an underscore.
    """
    if f.source != "original":
        return False
    if f.name.startswith("_"):
        return False

    skipped_names = {
        prefix + suffix
        for prefix in (item_name, item_name.lower())
        for suffix in _ITEM_SUFFIXES_TO_SKIP
    }
    if item_name.startswith("academictorrents_"):
        # These are only matched against the exact-case item name.
        skipped_names.update(
            item_name + suffix for suffix in _ACADEMICTORRENTS_SUFFIXES_TO_SKIP
        )
    return f.name not in skipped_names


def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file

    `f` must expose non-empty `.name`, `.sha1` and `.md5` attributes, plus
    `.size` and `.format`. Raises KeyError when `.format` is missing from
    FORMAT_TO_MIMETYPE.
    """
    assert f.name and f.sha1 and f.md5
    mf = {
        "path": f.name,
        "size": int(f.size),
        "sha1": f.sha1,
        "md5": f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #   mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        mf["extra"] = {"mimetype": mimetype}
    return mf


def item_to_fileset(
    item_name: str, release_id: str, session: "internetarchive.ArchiveSession"
) -> dict:
    """
    Builds a fileset dict for one archive.org item, prints it as a single
    line of JSON on stdout, and returns it.

    `release_id` may be given either bare (26 characters) or with a
    "release_" prefix, which is stripped. Items whose mediatype is
    "collection" or "web" are rejected with an AssertionError.
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
    if release_id.startswith(_RELEASE_PREFIX):
        # Slice by the real prefix length: a hard-coded offset of 9 also ate
        # the first character of the ident and tripped the length check below.
        release_id = release_id[len(_RELEASE_PREFIX):]
    assert len(release_id) == 26

    item = session.get_item(item_name)
    assert item.metadata["mediatype"] not in ["collection", "web"]
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        "manifest": manifest,
        "urls": [
            {
                "rel": "archive",
                "url": f"https://archive.org/download/{item_name}/",
            },
        ],
        "release_ids": [release_id],
    }
    print(json.dumps(fileset))
    return fileset


def main() -> None:
    """
    Command-line entry point.

    With exactly two arguments (item name, release ident) a single item is
    processed; otherwise tab-separated "item<TAB>release ident" lines are read
    from stdin, skipping blank lines.
    """
    import internetarchive

    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name, release_id = sys.argv[1], sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
        return

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")
        assert len(fields) == 2
        item_name, release_id = fields
        item_to_fileset(item_name, release_id=release_id, session=session)


if __name__ == "__main__":
    main()