1 file changed, 50 insertions, 49 deletions
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 83c04e3..6328f52 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -16,32 +16,31 @@ from typing import Any
 import internetarchive
 
 FORMAT_TO_MIMETYPE = {
-    'BZIP': 'application/x-bzip',
-    'BZIP2': 'application/x-bzip2',
-    'ZIP': 'application/zip',
-    'GZIP': 'application/gzip',
-    'RAR': 'application/vnd.rar',
-    'TAR': 'application/x-tar',
-    '7z': 'application/x-7z-compressed',
-    'HTML': 'text/html',
-    'Text': 'text/plain',
-    'PDF': 'application/pdf',
-    'CSV': 'text/csv',
-    'XML': 'application/xml',
-    'JSON': 'application/json',
-
+    "BZIP": "application/x-bzip",
+    "BZIP2": "application/x-bzip2",
+    "ZIP": "application/zip",
+    "GZIP": "application/gzip",
+    "RAR": "application/vnd.rar",
+    "TAR": "application/x-tar",
+    "7z": "application/x-7z-compressed",
+    "HTML": "text/html",
+    "Text": "text/plain",
+    "PDF": "application/pdf",
+    "CSV": "text/csv",
+    "XML": "application/xml",
+    "JSON": "application/json",
     #'application/msword (.doc)', # .doc
     #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
     #'application/vnd.ms-excel', # .xls
     #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
-    'MP3': 'audio/mpeg',  # .mp3
-    'MP4': 'video/mp4',  # .mp4
-    'MPEG': 'video/mpeg',  # .mpeg
-    'JPEG': 'image/jpeg',
-    'GIF': 'image/gif',
-    'PNG': 'image/png',
-    'TIFF': 'image/tiff',
-    'Unknown': None,
+    "MP3": "audio/mpeg",  # .mp3
+    "MP4": "video/mp4",  # .mp4
+    "MPEG": "video/mpeg",  # .mpeg
+    "JPEG": "image/jpeg",
+    "GIF": "image/gif",
+    "PNG": "image/png",
+    "TIFF": "image/tiff",
+    "Unknown": None,
 }
 
 
@@ -49,22 +48,22 @@ def want_file(f: dict, item_name: str) -> bool:
     """
     Filters IA API files
     """
-    if f.source != 'original':
+    if f.source != "original":
         return False
     for suffix in [
-            '_meta.sqlite',
-            '_archive.torrent',
-            '_itemimage.jpg',
-            '_meta.xml',
-            '_thumb.png',
-            '_files.xml',
+        "_meta.sqlite",
+        "_archive.torrent",
+        "_itemimage.jpg",
+        "_meta.xml",
+        "_thumb.png",
+        "_files.xml",
     ]:
         if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
             return False
-    if f.name.startswith('_'):
+    if f.name.startswith("_"):
         return False
-    if item_name.startswith('academictorrents_'):
-        for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']:
+    if item_name.startswith("academictorrents_"):
+        for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
             if f.name == item_name + suffix:
                 return False
     return True
@@ -77,36 +76,38 @@ def parse_file(f: dict) -> dict:
     assert f.name and f.sha1 and f.md5
     assert f.name is not None
     mf = {
-        'path': f.name,
-        'size': int(f.size),
-        'sha1': f.sha1,
-        'md5': f.md5,
+        "path": f.name,
+        "size": int(f.size),
+        "sha1": f.sha1,
+        "md5": f.md5,
     }
     # TODO: will disable this hard check eventually and replace with:
-    #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+    # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
     mimetype = FORMAT_TO_MIMETYPE[f.format]
     if mimetype:
-        mf['extra'] = dict(mimetype=mimetype)
+        mf["extra"] = dict(mimetype=mimetype)
     return mf
 
 
 def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
     print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
-    if release_id.startswith('release_'):
+    if release_id.startswith("release_"):
         release_id = release_id[9:]
     assert len(release_id) == 26
     item = session.get_item(item_name)
-    assert item.metadata['mediatype'] not in ['collection', 'web']
+    assert item.metadata["mediatype"] not in ["collection", "web"]
     item_files = item.get_files(on_the_fly=False)
     manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
     fileset = {
-        'manifest': manifest,
-        'urls': [{
-            'rel': 'archive',
-            'url': f'https://archive.org/download/{item_name}/',
-        }, ],
-        'release_ids': [release_id],
-        #extra={},
+        "manifest": manifest,
+        "urls": [
+            {
+                "rel": "archive",
+                "url": f"https://archive.org/download/{item_name}/",
+            },
+        ],
+        "release_ids": [release_id],
+        # extra={},
     }
     print(json.dumps(fileset))
     return fileset
@@ -123,12 +124,12 @@ def main():
             line = line.strip()
             if not line:
                 continue
-            fields = line.split('\t')
+            fields = line.split("\t")
             assert len(fields) == 2
             item_name = fields[0]
             release_id = fields[1]
             item_to_fileset(item_name, release_id=release_id, session=session)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()