#!/usr/bin/env python3
"""
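Helper script to generate fatcat fileset JSON (one object per line, on stdout)
from the file listings of archive.org items.

Takes either two args (archive.org item name, then release ident), or a stream
of tab-separated such pairs (in the same order) on stdin.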

TODO:
- should this check the item type?
"""

import json
import sys
from typing import Any

import internetarchive

# Maps the `format` value of an IA API file to the mimetype recorded in the
# fileset manifest. parse_file() indexes this table directly, so a format that
# is missing here raises KeyError; a value of None means the format is known
# but no mimetype is written for it.
FORMAT_TO_MIMETYPE = {
    "BZIP": "application/x-bzip",
    "BZIP2": "application/x-bzip2",
    "ZIP": "application/zip",
    "GZIP": "application/gzip",
    "RAR": "application/vnd.rar",
    "TAR": "application/x-tar",
    "7z": "application/x-7z-compressed",
    "HTML": "text/html",
    "Text": "text/plain",
    "PDF": "application/pdf",
    "CSV": "text/csv",
    "XML": "application/xml",
    "JSON": "application/json",
    # Office mimetypes below are not mapped yet: no format key has been
    # assigned to them.
    #'application/msword (.doc)', # .doc
    #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
    #'application/vnd.ms-excel', # .xls
    #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
    "MP3": "audio/mpeg",  # .mp3
    "MP4": "video/mp4",  # .mp4
    "MPEG": "video/mpeg",  # .mpeg
    "JPEG": "image/jpeg",
    "GIF": "image/gif",
    "PNG": "image/png",
    "TIFF": "image/tiff",
    # Known format with no mimetype to record
    "Unknown": None,
}


def want_file(f: Any, item_name: str) -> bool:
    """
    Decides whether an IA API file should be included in the fileset manifest.

    `f` is accessed by attribute (`f.source`, `f.name`), so it is an IA API
    file object rather than a plain dict. `item_name` is the identifier of the
    archive.org item the file belongs to.

    Returns False for:

    - files whose `source` is not "original"
    - per-item files named `<item_name><suffix>` (also matched against the
      lower-cased item name) for a fixed list of suffixes
    - files whose name starts with an underscore
    - for items named `academictorrents_*`, the torrent/bibtex sidecar files

    Returns True for everything else.
    """
    if f.source != "original":
        return False

    # Per-item files that are named after the item itself
    item_file_suffixes = (
        "_meta.sqlite",
        "_archive.torrent",
        "_itemimage.jpg",
        "_meta.xml",
        "_thumb.png",
        "_files.xml",
    )
    # The file name may use either the exact or the lower-cased item name
    name_prefixes = (item_name, item_name.lower())
    if any(
        f.name == prefix + suffix
        for suffix in item_file_suffixes
        for prefix in name_prefixes
    ):
        return False

    if f.name.startswith("_"):
        return False

    if item_name.startswith("academictorrents_"):
        sidecar_suffixes = (
            "_academictorrents.torrent",
            "_academictorrents_torrent.txt",
            ".bib",
        )
        if any(f.name == item_name + suffix for suffix in sidecar_suffixes):
            return False

    return True


def parse_file(f: Any) -> dict:
    """
    Takes an IA API file and turns it in to a fatcat fileset manifest file.

    `f` is accessed by attribute (`name`, `size`, `sha1`, `md5`, `format`), so
    it is an IA API file object rather than a plain dict.

    Returns a dict with `path`, `size` (as int), `sha1` and `md5`, plus
    `extra.mimetype` when FORMAT_TO_MIMETYPE has a non-empty mimetype for the
    file's format.

    Raises AssertionError if the name or either hash is missing/empty, and
    KeyError if the file's format is not in FORMAT_TO_MIMETYPE.
    """
    # A manifest entry is useless without a path and both hashes
    assert f.name and f.sha1 and f.md5
    manifest_file = {
        "path": f.name,
        "size": int(f.size),
        "sha1": f.sha1,
        "md5": f.md5,
    }
    # TODO: will disable this hard check eventually and replace with:
    # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
    mimetype = FORMAT_TO_MIMETYPE[f.format]
    if mimetype:
        manifest_file["extra"] = {"mimetype": mimetype}
    return manifest_file


def item_to_fileset(
    item_name: str, release_id: str, session: internetarchive.ArchiveSession
) -> dict:
    """
    Builds a fileset dict for one archive.org item and prints it as JSON.

    Fetches the item via `session`, keeps the files accepted by want_file(),
    converts each with parse_file(), and links the result to `release_id`.
    A progress line goes to stderr; the fileset itself is printed to stdout as
    a single JSON line and also returned.

    `release_id` may be given either bare or with a "release_" prefix; the
    prefix is stripped before use.

    Raises AssertionError if the (stripped) release ident is not 26 characters
    long, or if the item's mediatype is "collection" or "web".
    """
    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
    prefix = "release_"
    if release_id.startswith(prefix):
        # Slice by the real prefix length: a hard-coded offset of 9 also
        # dropped the first character of the ident.
        release_id = release_id[len(prefix) :]
    assert len(release_id) == 26, f"expected 26-character release ident, got: {release_id!r}"
    item = session.get_item(item_name)
    # Collections and web captures are not handled by this script
    assert item.metadata["mediatype"] not in ["collection", "web"], (
        f"unsupported mediatype for item {item_name}: {item.metadata['mediatype']}"
    )
    item_files = item.get_files(on_the_fly=False)
    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
    fileset = {
        "manifest": manifest,
        "urls": [
            {
                "rel": "archive",
                "url": f"https://archive.org/download/{item_name}/",
            },
        ],
        "release_ids": [release_id],
        # extra={},
    }
    print(json.dumps(fileset))
    return fileset


def main():
    """
    Command-line entry point.

    With exactly two arguments (item name, then release ident) a single item
    is processed. Otherwise tab-separated "item name, release ident" lines are
    read from stdin and processed one by one; blank lines are skipped.
    """
    session = internetarchive.get_session()
    argv = sys.argv

    if len(argv) != 3:
        # Stream mode: one tab-separated pair per stdin line
        for raw in sys.stdin:
            row = raw.strip()
            if row:
                parts = row.split("\t")
                assert len(parts) == 2
                item_name, release_id = parts
                item_to_fileset(item_name, release_id=release_id, session=session)
        return

    # Single-item mode: script name, item name, release ident
    _, item_name, release_id = argv
    item_to_fileset(item_name, release_id=release_id, session=session)


# Run the command-line entry point only when executed as a script, not on import
if __name__ == "__main__":
    main()