#!/usr/bin/env python3
"""
Helper script to turn archive.org items into fatcat fileset JSON.

Takes either two args (archive.org item and release ident, in that order), or
a stream of tab-separated such pairs on stdin (one pair per line). One JSON
fileset object is printed to stdout for every item processed.

TODO:
- should this check the item type?
"""

import json
import sys
from typing import Any

import internetarchive

# Maps the "format" value of an archive.org file to a mimetype. A value of
# None means no mimetype is recorded for files of that format.
FORMAT_TO_MIMETYPE = {
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",
    # Office document mimetypes, currently not mapped to any format name:
    #   application/msword                                                        (.doc)
    #   application/vnd.openxmlformats-officedocument.wordprocessingml.document  (.docx)
    #   application/vnd.ms-excel                                                  (.xls)
    #   application/vnd.openxmlformats-officedocument.spreadsheetml.sheet        (.xlsx)
    "MP3": "audio/mpeg",  # .mp3
    "MP4": "video/mp4",  # .mp4
    "MPEG": "video/mpeg",  # .mpeg
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",
    "Unknown": None,
}

# Per-item bookkeeping files, named "<item_name><suffix>", that are never part
# of a fileset manifest.
_ITEM_METADATA_SUFFIXES = (
    "_meta.sqlite",
    "_archive.torrent",
    "_itemimage.jpg",
    "_meta.xml",
    "_thumb.png",
    "_files.xml",
)

# Extra bookkeeping files skipped for items named "academictorrents_*".
_ACADEMICTORRENTS_SUFFIXES = (
    "_academictorrents.torrent",
    "_academictorrents_torrent.txt",
    ".bib",
)


def want_file(f: Any, item_name: str) -> bool:
    """
    Filters IA API files.

    `f` is a file object as yielded by the item's `get_files()`; only its
    `source` and `name` attributes are read here. Returns True when the file
    should be included in the fileset manifest, False when it should be
    skipped.
    """
    # Only files with source "original" are kept.
    if f.source != "original":
        return False
    if f.name.startswith("_"):
        return False

    # Bookkeeping files are matched against both the item name as given and
    # its lower-cased form.
    for prefix in (item_name, item_name.lower()):
        if any(f.name == prefix + suffix for suffix in _ITEM_METADATA_SUFFIXES):
            return False

    if item_name.startswith("academictorrents_"):
        if any(f.name == item_name + suffix for suffix in _ACADEMICTORRENTS_SUFFIXES):
            return False
    return True


def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file.

    Reads the `name`, `size`, `sha1`, `md5` and `format` attributes of `f`.
    Returns a dict with "path", "size", "sha1" and "md5" keys, plus an
    "extra" dict carrying the mimetype when one is mapped for the format.

    Raises AssertionError if name, sha1 or md5 is missing/empty, and KeyError
    if the file's format is not listed in FORMAT_TO_MIMETYPE.
    """
    assert f.name and f.sha1 and f.md5
    mf = {
        "path": f.name,
        "size": int(f.size),
        "sha1": f.sha1,
        "md5": f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #   mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        mf["extra"] = dict(mimetype=mimetype)
    return mf


def item_to_fileset(
    item_name: str, release_id: str, session: internetarchive.ArchiveSession
) -> dict:
    """
    Builds a fileset dict for one archive.org item, prints it as a single
    line of JSON on stdout, and returns it.

    `release_id` may be given either bare (26 characters) or with a
    "release_" prefix, which is stripped. A progress line is written to
    stderr before the item is fetched.

    Raises AssertionError if the (unprefixed) release ident is not 26
    characters long, or if the item's mediatype is "collection" or "web".
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)

    prefix = "release_"
    if release_id.startswith(prefix):
        # Slice by the real prefix length: a hard-coded 9 would also drop the
        # first character of the ident and trip the length check below.
        release_id = release_id[len(prefix):]
    assert len(release_id) == 26

    item = session.get_item(item_name)
    assert item.metadata["mediatype"] not in ["collection", "web"]
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        "manifest": manifest,
        "urls": [
            {
                "rel": "archive",
                "url": f"https://archive.org/download/{item_name}/",
            },
        ],
        "release_ids": [release_id],
        # extra={},
    }
    print(json.dumps(fileset))
    return fileset


def main() -> None:
    """
    Command-line entry point.

    With exactly two arguments (item name, then release ident) a single item
    is processed. Otherwise tab-separated "item name<TAB>release ident" lines
    are read from stdin; blank lines are skipped.
    """
    session = internetarchive.get_session()

    if len(sys.argv) == 3:
        item_to_fileset(sys.argv[1], release_id=sys.argv[2], session=session)
        return

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")
        assert len(fields) == 2
        item_name, release_id = fields
        item_to_fileset(item_name, release_id=release_id, session=session)


if __name__ == "__main__":
    main()