import io
import xml.etree.ElementTree as ET
from typing import Any, AnyStr, Dict, List, Optional

from .types import (
    GrobidAddress,
    GrobidAffiliation,
    GrobidAuthor,
    GrobidBiblio,
    GrobidDocument,
)

xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"


def _string_to_tree(content: AnyStr) -> ET.ElementTree:
    """
    Helper to consistently parse XML into an ElementTree, whether provided as
    str, bytes, a wrapper thereof (io.StringIO/io.BytesIO), or an already
    parsed ElementTree.
    """
    if isinstance(content, str):
        return ET.parse(io.StringIO(content))
    elif isinstance(content, bytes):
        return ET.parse(io.BytesIO(content))
    elif isinstance(content, (io.StringIO, io.BytesIO)):
        return ET.parse(content)
    elif isinstance(content, ET.ElementTree):
        return content
    else:
        raise TypeError(f"expected XML as string or bytes, got: {type(content)}")


def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
    """
    Internal helper to parse a single TEI 'author' XML tag into a GrobidAuthor
    object. An 'author' tag can appear in document headers as well as citations.
    """
    if elem is None:
        return None
    persname_tag = elem.find(f"./{{{ns}}}persName")
    if persname_tag is None:
        # should we do something else here? is it possible to have an author
        # without a persName?
        return None

    # basic author name stuff
    # create full_name from all the sub-components of the persName tag
    full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
    ga = GrobidAuthor(
        full_name=full_name or None,
        given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
        middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
        surname=persname_tag.findtext(f"./{{{ns}}}surname"),
        email=persname_tag.findtext(f"./{{{ns}}}email"),
        orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
    )

    # author affiliation
    affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
    if affiliation_tag is not None:
        affiliation_dict: Dict[str, Any] = dict()
        for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
            orgname_type = orgname_tag.get("type")
            if orgname_type:
                affiliation_dict[orgname_type] = orgname_tag.text or None
        if affiliation_dict:
            ga.affiliation = GrobidAffiliation(
                institution=affiliation_dict.get("institution"),
                department=affiliation_dict.get("department"),
                laboratory=affiliation_dict.get("laboratory"),
            )
            address_tag = affiliation_tag.find(f"./{{{ns}}}address")
            if address_tag is not None:
                address_dict = dict()
                for t in list(address_tag):
                    address_dict[t.tag.split("}")[-1]] = t.text or None
                if address_dict:
                    ga.affiliation.address = GrobidAddress(
                        addr_line=address_dict.get("addrLine"),
                        post_code=address_dict.get("postCode"),
                        settlement=address_dict.get("settlement"),
                        country=address_dict.get("country"),
                    )
    return ga


def _clean_url(url: Optional[str]) -> Optional[str]:
    if not url:
        return None
    url = url.strip()
    if url.endswith(".Lastaccessed"):
        url = url.replace(".Lastaccessed", "")
    if url.startswith("<"):
        url = url[1:]
    if ">" in url:
        url = url.split(">")[0]
    return url or None


def test_clean_url() -> None:
    examples: List[dict] = [
        dict(
            dirty="https://archive.org/thing.pdf",
            clean="https://archive.org/thing.pdf",
        ),
        dict(
            dirty="https://archive.org/thing.pdf.Lastaccessed",
            clean="https://archive.org/thing.pdf",
        ),
        dict(
            dirty="<https://archive.org/thing.pdf>",
            clean="https://archive.org/thing.pdf",
        ),
        dict(
            dirty=" https://archive.org/thing.pdf>",
            clean="https://archive.org/thing.pdf",
        ),
        dict(dirty="", clean=None),
        dict(dirty=None, clean=None),
    ]

    for row in examples:
        assert row["clean"] == _clean_url(row["dirty"])
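

# Illustrative example (not from the original test suite): a minimal check of
# _parse_author against a hand-written TEI snippet. The XML content and the
# expected values are assumptions chosen purely for demonstration.
def test_parse_author_minimal() -> None:
    tei_author = (
        '<author xmlns="http://www.tei-c.org/ns/1.0">'
        "<persName>"
        '<forename type="first">Jane</forename>'
        "<surname>Doe</surname>"
        "</persName>"
        "</author>"
    )
    author = _parse_author(ET.fromstring(tei_author))
    assert author is not None
    assert author.full_name == "Jane Doe"
    assert author.given_name == "Jane"
    assert author.surname == "Doe"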


def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
    """
    Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag.

    Could be a document header or a citation.
    """

    authors = []
    for ela in elem.findall(f".//{{{ns}}}author"):
        a = _parse_author(ela, ns=ns)
        if a is not None:
            authors.append(a)

    editors = []
    editor_tags = elem.findall(f".//{{{ns}}}editor")
    if not editor_tags:
        editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
    for elt in editor_tags:
        e = _parse_author(elt, ns=ns)
        if e is not None:
            editors.append(e)

    biblio = GrobidBiblio(
        authors=authors,
        editors=editors or None,
        id=elem.attrib.get(f"{{{xml_ns}}}id"),
        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
        # date below
        # titles: @level=a for article, @level=m for monograph (book)
        title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
        journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'),
        journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'),
        series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'),
        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"),
        institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"),
        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'),
        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'),
        # pages below
        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'),
        pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'),
        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'),
        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'),
        pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'),
        ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'),
        istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'),
        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'),
        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'),
    )

    book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
    if book_title_tag is not None and book_title_tag.attrib.get("type") is None:
        biblio.book_title = book_title_tag.text
    if biblio.book_title and not biblio.title:
        biblio.title = biblio.book_title
        biblio.book_title = None

    note_tag = elem.find(f".//{{{ns}}}note")
    if note_tag is not None and note_tag.attrib.get("type") is None:
        biblio.note = note_tag.text

    if not biblio.publisher:
        biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")

    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
    if date_tag is not None:
        biblio.date = date_tag.attrib.get("when") or None

    if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
        biblio.arxiv_id = biblio.arxiv_id[6:]

    el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
    if el is not None:
        if el.attrib.get("from"):
            biblio.first_page = el.attrib["from"]
        if el.attrib.get("to"):
            biblio.last_page = el.attrib["to"]
        if el.attrib.get("from") and el.attrib.get("to"):
            biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
        else:
            biblio.pages = el.text

    el = elem.find(f".//{{{ns}}}ptr[@target]")
    if el is not None:
        biblio.url = _clean_url(el.attrib["target"])

    # having both a DOI and a DOI URL is redundant
    if biblio.doi and biblio.url:
        if ("://doi.org/" in biblio.url) or ("://dx.doi.org/" in biblio.url):
            biblio.url = None

    return biblio
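

# Illustrative sketch (not part of the original module): _parse_biblio applied
# to a minimal, namespace-free 'biblStruct', the shape GROBID's citation
# endpoints return. All field values here are invented for demonstration.
def test_parse_biblio_minimal() -> None:
    snippet = (
        "<biblStruct>"
        "<analytic>"
        '<title level="a" type="main">An Example Title</title>'
        '<idno type="DOI">10.1234/example</idno>'
        "</analytic>"
        "<monogr>"
        '<title level="j">Journal of Examples</title>'
        "<imprint>"
        '<biblScope unit="volume">12</biblScope>'
        '<date type="published" when="2020-01-01" />'
        "</imprint>"
        "</monogr>"
        "</biblStruct>"
    )
    biblio = _parse_biblio(ET.fromstring(snippet), ns="")
    assert biblio.title == "An Example Title"
    assert biblio.doi == "10.1234/example"
    assert biblio.journal == "Journal of Examples"
    assert biblio.volume == "12"
    assert biblio.date == "2020-01-01"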


def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
    """
    Use this function to parse TEI-XML of a full document or header processed
    by GROBID.

    Eg, the output of '/api/processFulltextDocument' or '/api/processHeader'
    """
    tree = _string_to_tree(xml_text)
    tei = tree.getroot()

    header = tei.find(f".//{{{ns}}}teiHeader")
    if header is None:
        raise ValueError("XML does not look like TEI format")
    application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0]

    doc = GrobidDocument(
        grobid_version=application_tag.attrib["version"].strip(),
        grobid_timestamp=application_tag.attrib["when"].strip(),
        header=_parse_biblio(header),
        pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'),
    )

    refs = []
    for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
        ref = _parse_biblio(bs)
        ref.index = i
        refs.append(ref)
    doc.citations = refs

    text = tei.find(f".//{{{ns}}}text")
    if text is not None and text.attrib.get(f"{{{xml_ns}}}lang"):
        # this is the 'body' language, from the xml:lang attribute
        doc.language_code = text.attrib[f"{{{xml_ns}}}lang"]

    el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
    if el is not None:
        doc.abstract = " ".join(el.itertext()).strip() or None
    el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
    if el is not None:
        doc.body = " ".join(el.itertext()).strip() or None
    el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
    if el is not None:
        doc.acknowledgement = " ".join(el.itertext()).strip() or None
    el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
    if el is not None:
        doc.annex = " ".join(el.itertext()).strip() or None

    return doc


def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
    """
    Use this function to parse TEI-XML of one or more references. This should
    work with either /api/processCitation or /api/processCitationList API
    responses from GROBID.

    Note that processed citations are usually returned as a bare XML tag, not
    a full XML document, which means the TEI xmlns is not set. That would
    require a tweak to all downstream parsing code to handle documents with or
    without the namespace, so instead the xmlns declaration is stripped before
    parsing.
    """
    if isinstance(xml_text, bytes):
        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
    elif isinstance(xml_text, str):
        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")

    tree = _string_to_tree(xml_text)
    root = tree.getroot()

    if root.tag == "biblStruct":
        ref = _parse_biblio(root, ns="")
        ref.index = 0
        return [ref]

    refs = []
    for (i, bs) in enumerate(tree.findall(".//biblStruct")):
        ref = _parse_biblio(bs, ns="")
        ref.index = i
        refs.append(ref)
    return refs
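

# Hypothetical usage sketch (not part of the original module): parse a saved
# GROBID TEI-XML response from a file path given on the command line. The
# command-line invocation is an assumption for demonstration purposes.
if __name__ == "__main__":
    import sys

    with open(sys.argv[1], "rb") as f:
        document = parse_document_xml(f.read())
    print(f"GROBID version: {document.grobid_version}")
    print(f"title: {document.header.title}")
    for citation in document.citations or []:
        print(f"  [{citation.index}] {citation.title}")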