#!/usr/bin/env python3
"""
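Helper script to build fatcat fileset JSON (manifest, URLs, and release ids)
from archive.org items. One JSON object is printed to stdout per item.

Takes either two args (archive.org item name, then release ident), or a stream
of tab-separated such pairs (same order) on stdin.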

TODO:
- should this check the item type?
"""

import sys
import json
from typing import Any

import internetarchive


# Lookup table from the "format" label of an IA API file to a mimetype.
# A value of None means that no mimetype is recorded for that format.
FORMAT_TO_MIMETYPE = {
    # archive and compression containers
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",

    # documents
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",

    # structured data
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",

    # office document mimetypes which are not mapped from any format label yet:
    #   application/msword (.doc)
    #   application/vnd.openxmlformats-officedocument.wordprocessingml.document (.docx)
    #   application/vnd.ms-excel (.xls)
    #   application/vnd.openxmlformats-officedocument.spreadsheetml.sheet (.xlsx)

    # audio
    "MP3": "audio/mpeg",

    # video
    "MP4": "video/mp4",
    "MPEG": "video/mpeg",

    # images
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",

    # explicitly known-unknown: accepted, but no mimetype is recorded
    "Unknown": None,
}

def want_file(f: "internetarchive.File", item_name: str) -> bool:
    """
    Filters IA API files, deciding which ones belong in the fileset manifest.

    `f` is accessed by attribute (`f.source`, `f.name`), so it is an IA file
    object, not a plain dict.

    A file is rejected when any of the following holds:

    - its `source` is not 'original'
    - its name is `<item_name><suffix>` for one of the per-item suffixes
      listed below (the lower-cased item name is also tried)
    - its name starts with an underscore
    - the item name starts with 'academictorrents_' and the file is one of
      the torrent/bibtex sidecar files named after the item

    Returns True if the file should be kept, False otherwise.
    """
    item_suffixes = (
        "_meta.sqlite",
        "_archive.torrent",
        "_itemimage.jpg",
        "_meta.xml",
        "_thumb.png",
        "_files.xml",
    )
    academictorrents_suffixes = (
        "_academictorrents.torrent",
        "_academictorrents_torrent.txt",
        ".bib",
    )

    if f.source != "original":
        return False

    # per-item files; some of these are named after the lower-cased item name
    unwanted = {
        prefix + suffix
        for prefix in (item_name, item_name.lower())
        for suffix in item_suffixes
    }
    if f.name in unwanted:
        return False

    if f.name.startswith("_"):
        return False

    if item_name.startswith("academictorrents_"):
        # these are only matched against the exact (not lower-cased) item name
        if f.name in {item_name + suffix for suffix in academictorrents_suffixes}:
            return False

    return True

def parse_file(f: "internetarchive.File") -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file

    `f` is accessed by attribute (`name`, `size`, `sha1`, `md5`, `format`),
    so it is an IA file object, not a plain dict.

    Returns a dict with 'path', 'size' (as int), 'sha1', and 'md5' keys, plus
    an 'extra' dict holding 'mimetype' when the file's format maps to one.

    Raises AssertionError if the file has no name, sha1, or md5, and KeyError
    if the file's format is not a key of FORMAT_TO_MIMETYPE.
    """
    assert f.name and f.sha1 and f.md5, "IA file is missing name, sha1, or md5"
    mf = {
        "path": f.name,
        "size": int(f.size),
        "sha1": f.sha1,
        "md5": f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    try:
        mimetype = FORMAT_TO_MIMETYPE[f.format]
    except KeyError:
        # same exception type as the bare lookup, but says which file/format
        raise KeyError(
            f"no mimetype mapping for IA file format {f.format!r} (file {f.name!r})"
        ) from None
    if mimetype:
        mf["extra"] = dict(mimetype=mimetype)
    return mf

def item_to_fileset(item_name: str, release_id: str, session: "internetarchive.ArchiveSession") -> dict:
    """
    Builds a fileset dict for a single archive.org item and prints it as one
    line of JSON on stdout (a progress line goes to stderr).

    item_name: archive.org item identifier
    release_id: release ident, with or without a leading 'release_' prefix;
        must be 26 characters long once the prefix is removed
    session: session used to fetch the item

    Returns the fileset dict, with 'manifest', 'urls', and 'release_ids' keys.

    Raises AssertionError if the ident has the wrong length, or if the item's
    mediatype is 'collection' or 'web'.
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
    prefix = "release_"
    if release_id.startswith(prefix):
        # slice by the real prefix length; a hard-coded 9 ate the first
        # character of the ident (the prefix is only 8 characters long)
        release_id = release_id[len(prefix):]
    assert len(release_id) == 26, f"unexpected release ident length: {release_id!r}"
    item = session.get_item(item_name)
    assert item.metadata["mediatype"] not in ["collection", "web"]
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        "manifest": manifest,
        "urls": [
            {
                "rel": "archive",
                "url": f"https://archive.org/download/{item_name}/",
            },
        ],
        "release_ids": [release_id],
        #extra={},
    }
    print(json.dumps(fileset))
    return fileset

def main():
    """
    Command-line entry point.

    With exactly two arguments (item name, then release ident) a single item
    is processed. Otherwise tab-separated (item name, release ident) lines are
    read from stdin and processed one at a time; blank lines are skipped.
    """
    ia_session = internetarchive.get_session()

    argv = sys.argv
    if len(argv) == 3:
        item_to_fileset(argv[1], release_id=argv[2], session=ia_session)
        return

    for raw in sys.stdin:
        row = raw.strip()
        if row:
            columns = row.split('\t')
            assert len(columns) == 2
            name, ident = columns
            item_to_fileset(name, release_id=ident, session=ia_session)

# Run main() only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()