#!/usr/bin/env python3

import sys
import json
import base64
import datetime

import fatcat_client
from .common import EntityImporter, clean, make_rel_url

MAX_ABSTRACT_BYTES = 4096


class GrobidMetadataImporter(EntityImporter):
    """
    This is a complex case: we need to parse and create both file and release
    entities. The "primary" entity here is really File, not Release. If a
    matching File already exists we bail out (currently via an exists check in
    parse_record(), not want()); if not, we create the Release while parsing
    and then insert both.

    Input is TSV rows of (sha1 key, CDX JSON, mimetype, size in bytes, GROBID
    metadata JSON); see the usage sketch at the bottom of this module.

    TODO: should instead check if the File has any releases; if not, insert and update.
    TODO: relaxing the 'None' constraint on parse_record() might make this refactor-able.
    """

    def __init__(self, api, **kwargs):

        eg_desc = kwargs.get(
            'editgroup_description',
            "Import of release and file metadata, as extracted from PDFs by GROBID.")
        eg_extra = kwargs.get('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra)
        self.default_link_rel = kwargs.get("default_link_rel", "web")
        self.longtail_oa = kwargs.get("longtail_oa", False)

    def want(self, raw_record):
        return True

    def parse_record(self, row):

        fields = row.split('\t')
        sha1_key = fields[0]
        cdx = json.loads(fields[1])
        mimetype = fields[2]
        file_size = int(fields[3])
        grobid_meta = json.loads(fields[4])

        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
        re = self.parse_grobid_json(grobid_meta)
        if not (fe and re):
            return None

        # lookup existing file by SHA-1
        existing = None
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        # if the file is already in here, presumably it is not actually long-tail
        # HACK: this is doing an exists check in parse_record(), which is weird
        # TODO: this is where we should check if the file actually has
        # release_ids and/or URLs associated with it
        if existing and not self.bezerk_mode:
            self.counts['exists'] += 1
            # offset the 'skip' increment applied when parse_record() returns None
            self.counts['skip'] -= 1
            return None

        release_edit = self.create_release(re)
        fe.release_ids.append(release_edit.ident)
        return fe

    def parse_grobid_json(self, obj):

        if not obj.get('title'):
            return None

        extra_grobid = dict()

        abstract = obj.get('abstract')
        if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
            abobj = dict(
                mimetype="text/plain",
                language=None,
                content=clean(abstract))
            abstracts = [abobj]
        else:
            abstracts = None

        contribs = []
        for i, a in enumerate(obj.get('authors', [])):
            contribs.append(fatcat_client.ReleaseContrib(
                index=i,
                raw_name=clean(a['name']),
                role="author",
                extra=None))

        # XXX: why is this a dict()? not covered by tests?
        refs = []
        for raw in obj.get('citations', []):
            cite_extra = dict()
            ref = dict()
            ref['key'] = clean(raw.get('id'))
            if raw.get('title'):
                ref['title'] = clean(raw['title'])
            if raw.get('date'):
                try:
                    year = int(raw['date'].strip()[:4])
                    ref['year'] = year
                except Exception:
                    pass
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = clean(raw[key])
            if raw.get('authors'):
                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
            if cite_extra:
                cite_extra = dict(grobid=cite_extra)
            else:
                cite_extra = None
            ref['extra'] = cite_extra
            refs.append(ref)

        release_date = None
        release_year = None
        if obj.get('date'):
            # only returns year, ever?
            release_year = int(obj['date'][:4])

        extra = dict()
        if obj.get('doi'):
            extra_grobid['doi'] = obj['doi']
        if obj['journal'] and obj['journal'].get('name'):
            extra['container_name'] = clean(obj['journal']['name'])

        # TODO: ISSN/eISSN handling? or just journal name lookup?
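
        # GROBID-specific fields (currently just the DOI) get nested under
        # extra['grobid']; container_name and the longtail_oa flag stay at the
        # top level of extra.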
        if extra_grobid:
            extra['grobid'] = extra_grobid
        if self.longtail_oa:
            extra['longtail_oa'] = True
        if not extra:
            extra = None

        title = clean(obj['title'], force_xml=True)
        if not title or len(title) < 2:
            return None

        re = fatcat_client.ReleaseEntity(
            title=title,
            release_type="article-journal",
            release_date=release_date,
            release_year=release_year,
            contribs=contribs,
            refs=refs,
            publisher=clean(obj['journal'].get('publisher')),
            volume=clean(obj['journal'].get('volume')),
            issue=clean(obj['journal'].get('issue')),
            abstracts=abstracts,
            extra=extra)
        return re

    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):

        sha1 = base64.b16encode(
            base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()

        fe = fatcat_client.FileEntity(
            sha1=sha1,
            size=int(file_size),
            mimetype=mimetype,
            release_ids=[],
            urls=[],
        )

        # parse URLs and CDX
        original = cdx['url']
        assert len(cdx['dt']) >= 8
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'],
            original)
        fe.urls.append(
            fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))

        original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
        if original_url is not None:
            fe.urls.append(fatcat_client.FileEntityUrls(
                rel=original_url[0], url=original_url[1]))

        return fe

    def try_update(self, entity):
        # did the exists check in 'parse_record()', because we needed to create a release
        return True

    def insert_batch(self, batch):
        self.api.create_file_batch(batch,
            autoaccept=True,
            description=self.editgroup_description,
            extra=json.dumps(self.editgroup_extra))
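

# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the importer and not exercised
# by tests): shows the TSV row shape that parse_record() expects, namely
#
#     <sha1 key> \t <CDX JSON> \t <mimetype> \t <size in bytes> \t <GROBID JSON>
#
# All values below are hypothetical placeholders. A real run would construct
# an authenticated fatcat API client and push rows through the EntityImporter
# machinery instead of just printing one.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    example_row = "\t".join([
        "sha1:TW4WPMEQ4MOWNNWXNQLHYAVLB5VCB3FG",
        json.dumps({"url": "http://example.com/paper.pdf",
                    "dt": "20190101120000"}),
        "application/pdf",
        "1048576",
        json.dumps({"title": "An Example Paper",
                    "authors": [{"name": "Jane Doe"}],
                    "journal": {"name": "Journal of Examples"},
                    "date": "2019",
                    "citations": []}),
    ])
    print(example_row)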