"""
WIP: Importer for datacite.org data.

Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
"""

from .common import EntityImporter
import dateparser
import langcodes
import datetime
import fatcat_openapi_client
import json
import sys

# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
    'Journal': 'journal',
    'Series': 'journal',
    'Book Series': 'book-series',
}

# The docs/guide should be the cannonical home for these mappings; update there
# first.
#
# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
#
#   count   |   release_type
# ----------+-------------------
#  95030004 | article-journal
#  13477878 | chapter
#   5926811 | paper-conference
#   2169642 | article
#   1806415 | dataset
#   1548614 | book
#   1390304 |
#    818351 | report
#    815684 | entry
#    307998 | standard
#    297769 | thesis
#    261426 | letter
#    148093 | post
#    122736 | editorial
#     99225 | stub
#     96219 | review-book
#     22854 | peer_review
#     19078 | interview
#     16278 | article-newspaper
#      3973 | speech
#      3536 | legal_case
#      2264 | abstract
#      1626 | legislation
#      1053 | retraction
#        85 | component
# (25 rows)
#
# Map various datacite type types to CSL-ish types. None means TODO or remove.
DATACITE_TYPE_MAP = {
    'ris': {
        'THES': 'thesis',
        'SOUND': None,
        'CHAP': 'chapter',
        'FIGURE': None,
        'RPRT': 'report',
        'JOUR': 'article-journal',
        'MPCT': None,
        'GEN': None,
        'BOOK': 'book',
        'DATA': 'dataset',
        'COMP': None,
    },
    'schemaOrg': {
        'Dataset': 'dataset',
        'Book': 'book',
        'ScholarlyArticle': 'article',
        'ImageObject': 'graphic',
        'Collection': None,
        'MediaObject': None,
        'Event': None,
        'SoftwareSourceCode': None,
        'Chapter': 'chapter',
        'CreativeWork': None,
        'PublicationIssue': 'article',
        'AudioObject': None,
        'Thesis': 'thesis',
    },
    'citeproc': {
        'dataset': 'dataset',
        'chapter': 'chapter',
        'article-journal': 'article-journal',
        'song': 'song',
        'article': 'article',
        'report': 'report',
        'graphic': 'graphic',
        'thesis': 'thesis',
        'book': 'book',
    },
    'bibtex': {
        'phdthesis': 'thesis',
        'inbook': 'chapter',
        'misc': None,
        'article': 'article-journal',
        'book': 'book',
    },
    'resourceTypeGeneral': {
        'Image': None,
        'Dataset': 'dataset',
        'PhysicalObject': None,
        'Collection': None,
        'Text': None,
        'Sound': None,
        'InteractiveResource': None,
        'Event': None,
        'Software': None,
        'Other': None,
        'Workflow': None,
        'Audiovisual': None,
    }
}


# TODO(martin): merge this with other maps, maybe.
LICENSE_SLUG_MAP = {
    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
    "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
    "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
    "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
    "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND",
    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
    "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
    "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
    "//opensource.org/licenses/MIT": "MIT",
    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
    "//www.karger.com/Services/SiteLicenses": "KARGER",
    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2
    "//www.opensource.org/licenses/MIT": "MIT",
    # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
    # "http://rsc.li/journals-terms-of-use": "RSC",
    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "", # 53 UrhG.
    # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "",
    # "http://www.springer.com/tdm": "",
    # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html
    # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "",
    # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "",
    # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "",
    # "https://rightsstatements.org/page/InC/1.0?language=en": "",
    # "https://services.ceda.ac.uk/cedasite/register/info": "",
    # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404
    # "https://www.cambridge.org/core/terms": "",
    # "https://www.elsevier.com/tdm/userlicense/1.0",
    # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights
    # "info:eu-repo/semantics/embargoedAccess": "",
    # "info:eu-repo/semantics/openAccess": "",
    # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
}

class DataciteImporter(EntityImporter):
    """
    Importer for datacite records. TODO(martin): Do we need issn_map_file?
    """

    def __init__(self, api, issn_map_file, **kwargs):

        eg_desc = kwargs.get('editgroup_description',
            "Automated import of Datacite DOI metadata, harvested from REST API")
        eg_extra = kwargs.get('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
        super().__init__(api,
            issn_map_file=issn_map_file,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

        self.create_containers = kwargs.get('create_containers', True)
        self.read_issn_map_file(issn_map_file)

    def parse_record(self, obj):
        """
        TODO(martin): Map datacite to RE.

        WIP, notes:

        * Many subjects, should they end up in extra?
        * attributes.creators and attributes.contributors

        $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
        3963663 "Personal"
        289795 null
        8892 "Organizational"

        Shall we use issued, available?

          {
            "date": "2011-11-18",
            "dateType": "Accepted"
          },
          {
            "date": "2011-11-18",
            "dateType": "Available"
          },
          {
            "date": "2011-11-07",
            "dateType": "Copyrighted"
          },
          {
            "date": "2011-11-18",
            "dateType": "Issued"
          },
          {
            "date": "2011-11-07",
            "dateType": "Issued"
          }

        TODO(martin): Quick analysis of dates and stages.
        """

        if 'attributes' not in obj:
            return None

        attributes = obj['attributes']

        # > Contributors
        #
        #  "attributes.creators[].contributorType": [
        #    "author"
        #  ],
        #  "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
        #    "LCNA",
        #    "GND",
        #    "email",
        #    "NAF",
        #    "OSF",
        #    "RRID",
        #    "ORCID",
        #    "SCOPUS",
        #    "NRCPID",
        #    "schema.org",
        #    "GRID",
        #    "MGDS",
        #    "VIAF",
        #    "JACoW-ID"
        #  ],
        #
        #    "https://orcid.org/0000-0002-9902-738X",
        #    "http://jacow.org/JACoW-00001280",
        #    "Wiebe_Peter",
        #    "https://osf.io/https://osf.io/kjfuy/",
        #    "http://www.viaf.org176549220",
        #    "2239",
        #    "Jeffries_Martin",
        #    "https://orcid.org/0000-0002-1493-6630",
        #    "0000-0002-6233-612X",
        #
        # "creators": [
        #   {
        #     "name": "Bögli, Hans",
        #     "nameType": "Personal",
        #     "givenName": "Hans",
        #     "familyName": "Bögli",
        #     "affiliation": []
        #   }
        # ],

        contribs = []

        for i, c in enumerate(attributes['creators']):
            if not c.get('nameType') == 'Personal':
                continue
            creator_id = None
            for nid in c.get('nameIdentifiers', []):
                if not nid.get('nameIdentifierScheme').lower() == "orcid":
                    continue
                orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
                if not orcid:
                    continue
                creator_id = self.lookup_orcid(orcid)
                # If creator_id is None, should we create creators?
            contribs.append(fatcat_openapi_client.ReleaseContrib(
                creator_id=creator_id,
                index=i,
                raw_name=c.get('name'),
                given_name=c.get('givenName'),
                surname=c.get('familyName'),
            ))

        # > Title
        #
        #   "attributes.titles[].titleType": [
        #     "AlternativeTitle",
        #     "Other",
        #     "Subtitle",
        #     null,
        #     "TranslatedTitle"
        #   ],
        title, subtitle = None, None

        for entry in attributes.get('titles', []):
            if not title and 'titleType' not in entry:
                title = entry.get('title').strip()
            if entry.get('titleType') == 'Subtitle':
                subtitle = entry.get('title').strip()

        # > Dates
        #
        #  "attributes.dates[].dateType": [
        #    "Accepted",
        #    "Available"
        #    "Collected",
        #    "Copyrighted",
        #    "Created",
        #    "Issued",
        #    "Submitted",
        #    "Updated",
        #    "Valid",
        #  ],
        #
        # Different documents have different dates defined. Choose the topmost
        # available from prio list.
        date_type_prio = (
            'Valid',
            'Issued',
            'Available',
            'Accepted',
            'Submitted',
            'Copyrighted',
            'Collected',
            'Created',
            'Updated',
        )

        release_year, release_date = None, None
        for prio in date_type_prio:
            dates = attributes.get('dates', []) or [] # Never be None.
            for item in dates:
                if not item.get('dateType') == prio:
                    continue
                result = dateparser.parse(item.get('date'))
                if result is None:
                    # Unparsable date.
                    continue
                release_date = result
                release_year = result.year
                if 1000 < release_year < datetime.date.today().year + 5:
                    # Skip possibly bogus dates.
                    continue
                break
            else:
                continue
            break

        # > Publisher
        #
        # A few NA values. A few bogus values.
        #
        publisher = attributes.get('publisher')

        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
            publisher = None
        if publisher is not None and len(publisher) > 80:
            # Arbitrary magic value, TODO(martin): better heuristic.
            # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
            # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
            # TODO(martin): log misses.
            publisher = None

        # > Container
        #
        # For the moment, only ISSN as container.
        #
        #    "container": {
        #      "type": "Journal",
        #      "issue": "8",
        #      "title": "Angewandte Chemie International Edition",
        #      "volume": "57",
        #      "lastPage": "2080",
        #      "firstPage": "2077",
        #      "identifier": "14337851",
        #      "identifierType": "ISSN"
        #    },
        #
        #  "attributes.container.type": [
        #    "DataRepository",
        #    "Journal",
        #    "Series",
        #    "Book Series"
        #  ],
        #
        #  "attributes.container.identifierType": [
        #    "Handle",
        #    "ISBN",
        #    "LISSN",
        #    "DOI",
        #    "EISSN",
        #    "URL",
        #    "ISSN"
        #  ],
        #

        container_id = None
        container = attributes.get('container', {}) or {}
        if container.get('type') in CONTAINER_TYPE_MAP.keys():
            container_type = CONTAINER_TYPE_MAP.get(container['type'])
            if container.get('identifier') and container.get('identifierType') == 'ISSN':
                issn = container.get('identifier')
                if len(issn) == 8:
                    issn = issn[:4] + "-" + issn[4:]
                issnl = self.issn2issnl(issn)
                if issnl is not None:
                    container_id = self.lookup_issnl(issnl)

                    if container_id is None and container.get('title'):
                        ce = fatcat_openapi_client.ContainerEntity(
                            issnl=issnl,
                            container_type=container_type,
                            name=container.get('title'),
                        )
                        ce_edit = self.create_container(ce)
                        container_id = ce_edit.ident
                        self._issnl_id_map[issnl] = container_id

        # > License
        #
        # attributes.rightsList[].rightsUri
        # attributes.rightsList[].rights
        # attributes.rightsList[].lang
        #

        license_slug = None
        license_extra = []
        for l in attributes.get('rightsList', []):
            slug = lookup_license_slug(l.get('rightsUri'))
            if slug:
                license_slug = slug
            license_extra.append(l)

        # > Release type.
        #
        # Datacite has some fine granular typing (e.g. "Supplementary
        # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
        #
        # Additional, coarse: resourceTypeGeneral
        #
        #  "attributes.types.resourceTypeGeneral": [
        #    "Image",
        #    "Dataset",
        #    "PhysicalObject",
        #    "Collection",
        #    "Text",
        #    "Sound",
        #    "InteractiveResource",
        #    "Event",
        #    "Software",
        #    "Other",
        #    "Workflow",
        #    "Audiovisual"
        #  ],
        #  "attributes.types.citeproc": [
        #    "dataset",
        #    "chapter",
        #    "article-journal",
        #    "song",
        #    "article",
        #    "report",
        #    "graphic",
        #    "thesis",
        #    "book"
        #  ],
        #
        # There is RIS, also.

        # attributes.types.resourceType contains too many things for now.
        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
            release_type = attributes.get('types', {}).get(typeType)
            if release_type is not None:
                break

        # TODO(martin): Skip unmapped release_type entirely?
        if release_type is None:
            print("datacite unmapped type: {}".format(release_type), file=sys.stderr)

        # > Language.
        # attributes.language

        language = None
        value = attributes.get('language', '') or '' # As it is written.
        try:
            language = langcodes.find(value).language
        except LookupError:
            try:
                language = langcodes.get(value).language
            except langcodes.tag_parser.LanguageTagError:
                pass

        # > Extra information: license, subjects, ...
        extra, extra_datacite = dict(), dict()
        if license_extra:
            extra_datacite = {
                'license': license_extra,
            }
        if attributes.get('subjects'):
            extra_datacite['subjects'] = attributes.get('subjects', [])

        if extra_datacite:
            extra['datacite'] = extra_datacite

        # https://guide.fatcat.wiki/entity_release.html
        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            release_type=release_type,
            release_stage=None,
            title=title, # attributes.titles, various titleType
            subtitle=subtitle,
            original_title=title, # AlternativeTitle?
            release_year=release_year, # publicationYear
            release_date=release_date, # date issues/available?
            publisher=publisher, # attributes.publisher
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=attributes.get('doi'), # attributes.doi,
                # Can we add handle.net link?
            ),
            contribs=contribs,
            volume=None,
            issue=None,
            pages=None,
            language=language,
            abstracts=None,
            refs=None,
            extra=extra,
            license_slug=license_slug,
        )
        return re

    def try_update(self, re, debug=True):
        if debug is True:
            # print(type(re))
            print(json.dumps(re.to_dict(), default=extended_encoder))
            return
        return False

    def insert_batch(self, batch):
        # Debugging.
        for item in batch:
            print(item)
        return

        # Orig.
        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))

def extended_encoder(value):
    """
    Can be used with json.dumps(value, default=extended_encoder) to serialize
    value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
    """
    if isinstance(value, (datetime.datetime, datetime.date)):
        return value.isoformat()
    if isinstance(value, set):
        return list(value)

def lookup_license_slug(raw):
    """
    TODO(martin): reuse from crossref, maybe.
    """
    if not raw:
        return None
    raw = raw.strip().replace('http://', '//').replace('https://', '//')
    if 'creativecommons.org' in raw.lower():
        raw = raw.lower()
        raw = raw.replace('/legalcode', '/').replace('/uk', '')
        if not raw.endswith('/'):
            raw = raw + '/'
    return LICENSE_SLUG_MAP.get(raw)