aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/archiveorg_fileset.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/archiveorg_fileset.py')
-rwxr-xr-xpython/scripts/archiveorg_fileset.py135
1 files changed, 135 insertions, 0 deletions
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..6328f52
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Helper script to
+
+Takes either two args (release ident and archive.org item), or a stream of
+tab-separated such pairs on stdin.
+
+TODO:
+- should this check the item type?
+"""
+
+import json
+import sys
+from typing import Any
+
+import internetarchive
+
+FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+}
+
+
+def want_file(f: dict, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+
+def parse_file(f: dict) -> dict:
+ """
+ Takes an IA API file and turns it in to a fatcat fileset manifest file
+ """
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = {
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
+ }
+ # TODO: will disable this hard check eventually and replace with:
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ mimetype = FORMAT_TO_MIMETYPE[f.format]
+ if mimetype:
+ mf["extra"] = dict(mimetype=mimetype)
+ return mf
+
+
+def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
+ print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+ if release_id.startswith("release_"):
+ release_id = release_id[9:]
+ assert len(release_id) == 26
+ item = session.get_item(item_name)
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+ fileset = {
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
+ }
+ print(json.dumps(fileset))
+ return fileset
+
+
+def main():
+ session = internetarchive.get_session()
+ if len(sys.argv) == 3:
+ item_name = sys.argv[1]
+ release_id = sys.argv[2]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+ else:
+ for line in sys.stdin:
+ line = line.strip()
+ if not line:
+ continue
+ fields = line.split("\t")
+ assert len(fields) == 2
+ item_name = fields[0]
+ release_id = fields[1]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+
+
+if __name__ == "__main__":
+ main()