#!/usr/bin/env python3
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.

Eg: https://archive.org/details/oai_harvest_20200215
"""

import argparse
import json
import sys

import urlcanon

# URL substrings to skip when emitting ingest requests. Matching is a plain
# substring test against the raw URL (see transform()), so entries are written
# to anchor on scheme ("://host/") or domain suffix (".host/") as appropriate.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    "://127.0.0.1/",
    # OAI specific additions
    "://hdl.handle.net/",
]

# Maps OpenAIRE/DRIVER "info:eu-repo" version URIs (as found in the record's
# 'types' list) to short release-stage labels used in ingest requests.
RELEASE_STAGE_MAP = {
    "info:eu-repo/semantics/draftVersion": "draft",
    "info:eu-repo/semantics/submittedVersion": "submitted",
    "info:eu-repo/semantics/acceptedVersion": "accepted",
    "info:eu-repo/semantics/publishedVersion": "published",
    "info:eu-repo/semantics/updatedVersion": "updated",
}


def canon(s):
    """Return the WHATWG-canonicalized form of URL string *s*."""
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))


def transform(obj):
    """
    Transforms a single OAI-PMH object into zero or more ingest requests.

    Args:
        obj: dict parsed from one line of the dump. Expected keys: 'oai'
            (identifier string starting with "oai:"), 'urls' (list of URL
            strings); optional: 'formats', 'doi' (list), 'types' (list).

    Returns:
        A list of ingest request dicts, one per usable URL (possibly empty).
    """

    # Require a well-formed OAI identifier and at least one URL.
    if not obj.get("oai") or not obj["oai"].startswith("oai:"):
        return []
    if not obj.get("urls"):
        return []

    # If there is a list of formats and it does not mention PDF, skip the
    # record. Records with no formats list are allowed through.
    formats = obj.get("formats")
    if formats and not any("pdf" in f.lower() for f in formats):
        return []

    # Take the first DOI, if it looks like an actual DOI ("10." prefix).
    doi = None
    if obj.get("doi"):
        doi = obj["doi"][0].lower().strip()
        if not doi.startswith("10."):
            doi = None

    # Infer release stage from obj['types']; the last matching type wins.
    release_stage = None
    for t in obj.get("types", []):
        if t in RELEASE_STAGE_MAP:
            release_stage = RELEASE_STAGE_MAP[t]

    # TODO: infer rel somehow? Eg, repository vs. OJS publisher
    rel = None

    # Hoist the normalized identifier out of the URL loop (used twice per
    # request; was recomputed per-URL before).
    oai_id = obj["oai"].lower()

    requests = []
    for url in obj["urls"]:
        # Skip URLs we harvest through other channels (see DOMAIN_BLOCKLIST).
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            # Some dump URLs are not cleanly encodable; drop them.
            continue

        requests.append({
            "base_url": base_url,
            "ingest_type": "pdf",
            "link_source": "oai",
            "link_source_id": oai_id,
            "ingest_request_source": "metha-bulk",
            "release_stage": release_stage,
            "rel": rel,
            "ext_ids": {
                "doi": doi,
                "oai": oai_id,
            },
            "edit_extra": {},
        })

    return requests


def run(args):
    """Read JSON lines from args.json_file and print one ingest request per line."""
    for line in args.json_file:
        if not line.strip():
            continue
        record = json.loads(line)
        for req in transform(record) or []:
            print(json.dumps(req, sort_keys=True))


def main():
    """Parse command-line arguments and transform the given dump file."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file",
        help="OAI-PMH dump file to use (usually stdin)",
        type=argparse.FileType("r"),
    )
    # NOTE: a dead `parser.add_subparsers()` call was removed here; no
    # subcommands were ever registered, so it had no effect.

    args = parser.parse_args()

    run(args)


if __name__ == "__main__":
    main()