diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 12:54:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:20 -0700 |
commit | 452475df7619f3743eac5ad86e2e1fb8ba9972da (patch) | |
tree | 57752fe5ef4fbe75078f01dcc665d5d01e08ea49 /python/scripts | |
parent | 8a1906d876e0494e483f8d867aac831f26715b0c (diff) | |
download | sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.tar.gz sandcrawler-452475df7619f3743eac5ad86e2e1fb8ba9972da.zip |
scripts: example archiveorg-to-fileset importer
Diffstat (limited to 'python/scripts')
-rwxr-xr-x | python/scripts/archiveorg_fileset.py | 138 |
1 files changed, 138 insertions, 0 deletions
#!/usr/bin/env python3
# Source: python/scripts/archiveorg_fileset.py (new file, mode 100755)
"""
Helper script to build fatcat fileset objects (printed as JSON, one per line)
from archive.org items.

Takes either two args (archive.org item and release ident, in that order), or
a stream of tab-separated such pairs on stdin.

TODO:
- should this check the item type?
"""

import sys
import json
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only needed for annotations; the runtime import happens in main() so the
    # pure helpers below can be imported without the archive.org client.
    import internetarchive


# Maps the archive.org file "format" label to a mimetype. 'Unknown' maps to
# None, meaning "no mimetype recorded". Labels missing from this table make
# parse_file() fail with KeyError (deliberate hard check, see TODO there).
FORMAT_TO_MIMETYPE = {
    'BZIP': 'application/x-bzip',
    'BZIP2': 'application/x-bzip2',
    'ZIP': 'application/zip',
    'GZIP': 'application/gzip',
    'RAR': 'application/vnd.rar',
    'TAR': 'application/x-tar',
    '7z': 'application/x-7z-compressed',

    'HTML': 'text/html',
    'Text': 'text/plain',
    'PDF': 'application/pdf',

    'CSV': 'text/csv',
    'XML': 'application/xml',
    'JSON': 'application/json',

    #'application/msword (.doc)', # .doc
    #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
    #'application/vnd.ms-excel', # .xls
    #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx

    'MP3': 'audio/mpeg',  # .mp3

    'MP4': 'video/mp4',  # .mp4
    'MPEG': 'video/mpeg',  # .mpeg

    'JPEG': 'image/jpeg',
    'GIF': 'image/gif',
    'PNG': 'image/png',
    'TIFF': 'image/tiff',

    'Unknown': None,
}

# Per-item bookkeeping files, named "<item_name><suffix>", that are skipped.
_ITEM_BOOKKEEPING_SUFFIXES = (
    '_meta.sqlite',
    '_archive.torrent',
    '_itemimage.jpg',
    '_meta.xml',
    '_thumb.png',
    '_files.xml',
)

# Extra files skipped only for items whose name starts with 'academictorrents_'.
_ACADEMICTORRENTS_SUFFIXES = (
    '_academictorrents.torrent',
    '_academictorrents_torrent.txt',
    '.bib',
)

_RELEASE_PREFIX = 'release_'


def want_file(f: Any, item_name: str) -> bool:
    """
    Filters IA API files

    `f` is a file object exposing `source` and `name` attributes. Returns True
    only for files with source 'original' that are not per-item bookkeeping
    files and whose name does not start with an underscore.
    """
    if f.source != 'original':
        return False
    # Bookkeeping files are matched against both the item name as given and
    # its lower-cased form.
    for prefix in (item_name, item_name.lower()):
        if any(f.name == prefix + suffix for suffix in _ITEM_BOOKKEEPING_SUFFIXES):
            return False
    if f.name.startswith('_'):
        return False
    if item_name.startswith('academictorrents_'):
        if any(f.name == item_name + suffix for suffix in _ACADEMICTORRENTS_SUFFIXES):
            return False
    return True


def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file

    `f` must expose `name`, `size`, `sha1`, `md5` and `format` attributes.
    Returns a dict with 'path', 'size' (int), 'sha1', 'md5', plus
    'extra': {'mimetype': ...} when the format maps to a mimetype.

    Raises AssertionError if name/sha1/md5 are missing or empty, and KeyError
    if `f.format` is not a key of FORMAT_TO_MIMETYPE.
    """
    assert f.name and f.sha1 and f.md5, "file is missing name, sha1, or md5"
    mf = {
        'path': f.name,
        'size': int(f.size),
        'sha1': f.sha1,
        'md5': f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        mf['extra'] = dict(mimetype=mimetype)
    return mf


def item_to_fileset(item_name: str, release_id: str,
                    session: "internetarchive.ArchiveSession") -> dict:
    """
    Fetches one archive.org item via `session` and builds a fileset dict.

    `release_id` may be given with or without a leading 'release_' prefix; the
    bare ident must be exactly 26 characters. The resulting dict (keys
    'manifest', 'urls', 'release_ids') is printed to stdout as one JSON line
    and also returned. A progress line is written to stderr.

    Raises AssertionError if the ident length is wrong or the item's mediatype
    is 'collection' or 'web'.
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
    if release_id.startswith(_RELEASE_PREFIX):
        # Strip exactly the prefix; a hard-coded slice offset here previously
        # dropped the first character of the ident as well.
        release_id = release_id[len(_RELEASE_PREFIX):]
    assert len(release_id) == 26, f"unexpected release ident length: {release_id!r}"
    item = session.get_item(item_name)
    assert item.metadata['mediatype'] not in ['collection', 'web'], \
        f"unsupported mediatype for item {item_name!r}"
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        'manifest': manifest,
        'urls': [
            {
                'rel': 'archive',
                'url': f'https://archive.org/download/{item_name}/',
            },
        ],
        'release_ids': [release_id],
        #extra={},
    }
    print(json.dumps(fileset))
    return fileset


def main():
    """
    Command-line entry point.

    With exactly two arguments, processes that single (item, release ident)
    pair. Otherwise reads tab-separated (item, release ident) pairs from
    stdin, one per line, skipping blank lines.
    """
    # Imported here rather than at module level so that the helpers above stay
    # importable when the archive.org client library is not installed.
    import internetarchive

    session = internetarchive.get_session()
    if len(sys.argv) == 3:
        item_name = sys.argv[1]
        release_id = sys.argv[2]
        item_to_fileset(item_name, release_id=release_id, session=session)
    else:
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            assert len(fields) == 2, \
                f"expected 2 tab-separated fields, got {len(fields)}: {line!r}"
            item_name = fields[0]
            release_id = fields[1]
            item_to_fileset(item_name, release_id=release_id, session=session)


if __name__ == '__main__':
    main()