"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.

- try to parse string with GROBID REST API call
- transform the GROBID XML response to a simple dict/struct

TODO: more general versions which handle multiple reference strings in a batch?
"""

import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional

import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds

from fuzzycat.config import settings
from fuzzycat.grobid2json import biblio_info
from fuzzycat.utils import clean_doi

# Base URL of the GROBID service, used as the default 'grobid_api_base'
# argument by the helpers in this module. Looked up under the
# "GROBID_API_BASE" key of the fuzzycat settings, with the QA instance URL
# passed as the fallback value.
GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")


def grobid_api_process_citation(raw_citation: str,
                                grobid_api_base: str = GROBID_API_BASE,
                                timeout: float = 20.0) -> Optional[str]:
    """
    Process a single citation string using the GROBID API, returning the
    TEI-XML response body as a string.

    'grobid_api_base' is the base URL of the GROBID service (any trailing
    slash is ignored); 'timeout' is passed through to requests.post().

    Returns None if GROBID replied with HTTP 204, or if the response body was
    empty.

    Raises python TimeoutError if there was a network or request timeout; the
    underlying 'requests' exception is chained as the cause.

    Raises a 'requests' error on other unexpected failures (including network
    connection failures and HTTP error statuses).
    """
    # strip trailing slashes so a base URL like "https://host/" does not
    # produce a double slash in the request path
    endpoint = grobid_api_base.rstrip("/") + "/api/processCitation"
    try:
        grobid_response = requests.post(
            endpoint,
            data={
                "citations": raw_citation,
                "consolidateCitations": 0,
            },
            timeout=timeout,
        )
    except requests.Timeout as err:
        # translate to the builtin exception, but keep the original as
        # __cause__ so tracebacks show what actually timed out
        raise TimeoutError("GROBID request (HTTP POST) timeout") from err

    if grobid_response.status_code == 204:
        return None
    elif grobid_response.status_code != 200:
        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
    # raises requests.HTTPError for HTTP error statuses; any other non-200
    # status has only been logged above, and the body is still returned
    grobid_response.raise_for_status()

    return grobid_response.text or None


def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
    """
    Converts the GROBID XML for a single reference/citation string (as opposed
    to a full, proper TEI-XML fulltext document) into a plain dict.

    Returns None when every value in the extracted dict is empty/falsy.
    """
    # drop the default TEI namespace declaration up front, so that elements
    # can be looked up without a namespace prefix
    stripped_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
    document_root = ET.parse(io.StringIO(stripped_xml)).getroot()
    biblio = biblio_info(document_root, ns="")
    if any(biblio.values()):
        return biblio
    return None


def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
    ReleaseEntity object (for use with fuzzycat).

    Fields are copied from these 'ref' keys: "authors" (each with "name",
    "given_name", "surname"), "title", "volume", "issue", "pages", "doi",
    "pmid", "pmcid", "arxiv_id", "journal" and "date". Missing keys result in
    None/empty values on the release.

    Only the year is extracted from "date", and only when the value starts
    with four decimal digits.
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.get("name"),
            given_name=author.get("given_name"),
            surname=author.get("surname"),
        ) for author in ref.get("authors") or []
    ]
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=clean_doi(ref.get("doi")),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    if ref.get("journal"):
        # the journal name is stored in 'extra', under "container_name"
        release.extra = {"container_name": ref.get("journal")}
    date = ref.get("date")
    if date:
        year = date[0:4]
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # like superscript digits, which int() rejects with a ValueError
        if len(year) == 4 and year.isdecimal():
            release.release_year = int(year)
        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release


def grobid_parse_unstructured(raw_citation: str,
                              grobid_api_base: str = GROBID_API_BASE,
                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
    """
    Convenience wrapper chaining the three steps of this module: send the raw
    citation string to GROBID, transform the XML response to a dict, and build
    a (partial) release entity from that dict.

    Returns None if GROBID gave no usable response, or if nothing could be
    extracted from it.

    Network and remote errors propagate as exceptions.
    """
    tei_xml = grobid_api_process_citation(
        raw_citation,
        grobid_api_base=grobid_api_base,
        timeout=timeout,
    )
    # each stage is skipped once an earlier stage came back empty
    parsed_ref = transform_grobid_ref_xml(tei_xml) if tei_xml else None
    return grobid_ref_to_release(parsed_ref) if parsed_ref else None