scripts: example archiveorg-to-fileset importer

author: Bryan Newbold <bnewbold@archive.org> 2021-10-04 12:54:35 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:20 -0700
commit: 452475df7619f3743eac5ad86e2e1fb8ba9972da (patch)
tree: 57752fe5ef4fbe75078f01dcc665d5d01e08ea49 /python/scripts
parent: 8a1906d876e0494e483f8d867aac831f26715b0c (diff)
download: sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.tar.gz
sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.zip
1 files changed, 138 insertions, 0 deletions
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..0e507eb
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
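+Helper script to generate fatcat fileset entities, printed as JSON on stdout,
+from archive.org items.
+
+Takes either two args (archive.org item and release ident, in that order), or
+a stream of tab-separated such pairs on stdin.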
+
+TODO:
+- should this check the item type?
+"""
+
+import sys
+import json
+from typing import Any
+
+import internetarchive
+
+
+# Lookup table from the `format` attribute of an archive.org file to the
+# mimetype that parse_file() records under the manifest entry's 'extra' key.
+# A value of None means no mimetype is recorded for that format.
+# parse_file() indexes this dict directly, so a format that is missing here
+# raises KeyError instead of being silently skipped.
+FORMAT_TO_MIMETYPE = {
+    # archive and compression formats
+    'BZIP': 'application/x-bzip',
+    'BZIP2': 'application/x-bzip2',
+    'ZIP': 'application/zip',
+    'GZIP': 'application/gzip',
+    'RAR': 'application/vnd.rar',
+    'TAR': 'application/x-tar',
+    '7z': 'application/x-7z-compressed',
+
+    # documents
+    'HTML': 'text/html',
+    'Text': 'text/plain',
+    'PDF': 'application/pdf',
+
+    # structured data
+    'CSV': 'text/csv',
+    'XML': 'application/xml',
+    'JSON': 'application/json',
+
+    # NOTE(review): office mimetypes listed without a key; presumably the
+    # matching archive.org format names still need to be confirmed.
+    #'application/msword (.doc)', # .doc
+    #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+    #'application/vnd.ms-excel', # .xls
+    #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+
+    # audio
+    'MP3': 'audio/mpeg', # .mp3
+
+    # video
+    'MP4': 'video/mp4', # .mp4
+    'MPEG': 'video/mpeg', # .mpeg
+
+    # images
+    'JPEG': 'image/jpeg',
+    'GIF': 'image/gif',
+    'PNG': 'image/png',
+    'TIFF': 'image/tiff',
+
+    # explicitly mapped to "no mimetype" so the lookup does not fail
+    'Unknown': None,
+}
+
+def want_file(f: "internetarchive.File", item_name: str) -> bool:
+    """
+    Filters IA API files
+
+    Decides whether a file of an archive.org item should be included in the
+    fileset manifest.
+
+    f: file object from the IA API. Only the `source` and `name` attributes
+        are read, via attribute access, so a plain dict will not work.
+    item_name: identifier of the item the file belongs to
+
+    Returns False when:
+    - the file's source is not 'original'
+    - the name is the item name (as given, or lower-cased) followed by one of
+      the per-item bookkeeping suffixes listed below
+    - the name starts with an underscore
+    - the item name starts with 'academictorrents_' and the name is the item
+      name followed by one of the torrent/bibtex suffixes listed below
+    Returns True otherwise.
+    """
+    item_level_suffixes = (
+        '_meta.sqlite',
+        '_archive.torrent',
+        '_itemimage.jpg',
+        '_meta.xml',
+        '_thumb.png',
+        '_files.xml',
+    )
+    academictorrents_suffixes = (
+        '_academictorrents.torrent',
+        '_academictorrents_torrent.txt',
+        '.bib',
+    )
+
+    if f.source != 'original':
+        return False
+
+    # bookkeeping files are matched against both spellings of the identifier
+    name_bases = (item_name, item_name.lower())
+    if any(
+        f.name == base + suffix
+        for suffix in item_level_suffixes
+        for base in name_bases
+    ):
+        return False
+
+    if f.name.startswith('_'):
+        return False
+
+    # these extra files are only filtered for academictorrents items, and
+    # only against the identifier exactly as given (no lower-casing)
+    if item_name.startswith('academictorrents_'):
+        if any(f.name == item_name + suffix for suffix in academictorrents_suffixes):
+            return False
+
+    return True
+
+def parse_file(f: "internetarchive.File") -> dict:
+    """
+    Takes an IA API file and turns it in to a fatcat fileset manifest file
+
+    f: file object from the IA API. The `name`, `size`, `sha1`, `md5` and
+        `format` attributes are read, via attribute access, so a plain dict
+        will not work.
+
+    Returns a dict with 'path', 'size' (as int), 'sha1' and 'md5' keys. When
+    FORMAT_TO_MIMETYPE has a non-empty mimetype for the file's format, the
+    dict also gets 'extra': {'mimetype': ...}.
+
+    Raises AssertionError if name, sha1 or md5 is missing or empty, and
+    KeyError if the file's format has no entry in FORMAT_TO_MIMETYPE.
+    """
+    assert f.name and f.sha1 and f.md5, \
+        f"IA file is missing name, sha1 or md5: name={f.name!r}"
+    manifest_file = {
+        'path': f.name,
+        'size': int(f.size),
+        'sha1': f.sha1,
+        'md5': f.md5,
+    }
+    # TODO: will disable this hard check eventually and replace with:
+    #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+    try:
+        mimetype = FORMAT_TO_MIMETYPE[f.format]
+    except KeyError:
+        # same exception type as the plain lookup, but say which file hit it
+        raise KeyError(
+            f"no mimetype mapping for IA file format {f.format!r} (file {f.name!r})"
+        ) from None
+    if mimetype:
+        manifest_file['extra'] = dict(mimetype=mimetype)
+    return manifest_file
+
+def item_to_fileset(item_name: str, release_id: str, session: "internetarchive.ArchiveSession"):
+    """
+    Builds a fatcat fileset dict for one archive.org item and prints it as JSON
+
+    item_name: archive.org item identifier
+    release_id: fatcat release ident, with or without a 'release_' prefix. It
+        must be exactly 26 characters once the prefix is removed.
+    session: internetarchive session used to fetch the item
+
+    Writes a progress line to stderr and the fileset, serialized with
+    json.dumps(), to stdout. Returns the fileset dict, which has the keys
+    'manifest' (one parse_file() entry per file accepted by want_file()),
+    'urls' (a single 'archive' link to the item's download directory) and
+    'release_ids'.
+
+    Raises AssertionError if the release ident has the wrong length or the
+    item's mediatype is 'collection' or 'web'.
+    """
+    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+    prefix = 'release_'
+    if release_id.startswith(prefix):
+        # slice by the prefix length rather than a hard-coded offset, so that
+        # exactly the prefix is removed and the 26-character ident is kept
+        release_id = release_id[len(prefix):]
+    assert len(release_id) == 26, f"unexpected release ident length: {release_id!r}"
+    item = session.get_item(item_name)
+    assert item.metadata['mediatype'] not in ['collection', 'web']
+    item_files = item.get_files(on_the_fly=False)
+    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+    fileset = {
+        'manifest': manifest,
+        'urls': [
+            {
+                'rel': 'archive',
+                'url': f'https://archive.org/download/{item_name}/',
+            },
+        ],
+        'release_ids': [release_id],
+        #extra={},
+    }
+    print(json.dumps(fileset))
+    return fileset
+
+def main():
+    """
+    Command-line entry point.
+
+    With exactly two arguments (item name, then release ident) that single
+    pair is processed. With any other argument count, tab-separated pairs in
+    the same order are read from stdin, one per line; blank lines are
+    skipped and a line without exactly two fields fails an assertion.
+    """
+    session = internetarchive.get_session()
+    if len(sys.argv) != 3:
+        for raw_line in sys.stdin:
+            stripped = raw_line.strip()
+            if stripped:
+                columns = stripped.split('\t')
+                assert len(columns) == 2
+                item_name, release_id = columns
+                item_to_fileset(item_name, release_id=release_id, session=session)
+        return
+    _, item_name, release_id = sys.argv
+    item_to_fileset(item_name, release_id=release_id, session=session)
+
+if __name__ == '__main__':
+    main()
author	Bryan Newbold <bnewbold@archive.org>	2021-10-04 12:54:35 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-15 18:15:20 -0700
commit	452475df7619f3743eac5ad86e2e1fb8ba9972da (patch)
tree	57752fe5ef4fbe75078f01dcc665d5d01e08ea49 /python/scripts
parent	8a1906d876e0494e483f8d867aac831f26715b0c (diff)
download	sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.tar.gz sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.zip