import sys
import json
import sqlite3
import itertools
import fatcat_client
from .common import FatcatImporter

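# Legacy TSV row handling, kept for reference only (never executed). Each row
# was "sha1:<base32 digest>\t<JSON list of DOIs>"; the digest was converted
# to lowercase hex:
#row = row.split('\t')
#assert len(row) == 2
#sha1 = row[0].replace('sha1:', '')
#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
#print(sha1)
#dois = [d.lower() for d in json.loads(row[1])]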

class MatchedImporter(FatcatImporter):
    """
    Importer for "file to crossref DOI" matches.

    These matches are currently generated by Internet Archive hadoop jobs
    written in scala (part of the 'sandcrawler' repo/project), but could be
    generated by other parties as well.

    Input format is JSON with keys:
    - dois (list)
    - sha1 (hex)
    - md5 (hex)
    - sha256 (hex)
    - size (int)
    - cdx (list of objects)
        - dt
        - url
    - mimetype
    - urls (list of strings; the legacy key name 'url' is also accepted)

    Future handlings/extensions:
    - core_id, wikidata_id, pmcid, pmid: not as lists
    """

    def __init__(self, api, **kwargs):
        """
        Set up the importer.

        Recognized keyword arguments:
        - editgroup_description: description passed to the base importer
        - editgroup_extra: dict passed to the base importer; an 'agent' key
          is filled in when missing
        - default_link_rel: rel used for URLs that match no special case
          (default "web")
        - default_mime: mimetype applied when neither the input nor the
          existing file entity has one (default None)
        - skip_file_updates: when true, never look up existing files by
          sha1; every row is treated as a new file (default False)
        """
        eg_desc = kwargs.get('editgroup_description',
            "Import of large-scale file-to-release match results. Source of metadata varies.")
        # Work on a copy so the caller's dict is not mutated, and tolerate an
        # explicit None.
        eg_extra = dict(kwargs.get('editgroup_extra') or {})
        eg_extra.setdefault('agent', 'fatcat_tools.MatchedImporter')
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra)
        self.default_link_rel = kwargs.get("default_link_rel", "web")
        self.default_mime = kwargs.get("default_mime", None)
        self.skip_file_updates = kwargs.get("skip_file_updates", False)

    def make_url(self, raw):
        """
        Wrap a raw URL string in a FileEntityUrls with a guessed 'rel'.

        archive.org and arxiv.org URLs get rel "repository"; web.archive.org
        and archive.is URLs get rel "webarchive"; everything else gets
        self.default_link_rel.
        """
        rel = self.default_link_rel
        # TODO: this is where we could map specific domains to rel types,
        # and also filter out bad domains, invalid URLs, etc
        if "//archive.org/" in raw or "//arxiv.org/" in raw:
            # TODO: special-case the arxiv.org bulk mirror?
            rel = "repository"
        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
            rel = "webarchive"
        return fatcat_client.FileEntityUrls(url=raw, rel=rel)

    def _add_url(self, fe, known_urls, raw):
        """Append `raw` (via make_url) to fe.urls unless it is already known."""
        if raw in known_urls:
            return
        url = self.make_url(raw)
        # make_url may start filtering bad URLs in the future (see its TODO)
        if url is not None:
            known_urls.append(raw)
            fe.urls.append(url)

    def parse_matched_dict(self, obj):
        """
        Turn one parsed match record into a file entity to create or update.

        Returns a FileEntity (with `ident` set if it already exists in the
        catalog), or None when there is nothing to do: either none of the
        DOIs resolved to a release, or the existing file is already linked to
        exactly the matched releases.

        Raises fatcat_client.rest.ApiException for any API lookup error other
        than 404, and KeyError if 'sha1' (or a CDX row's 'url'/'dt') is
        missing.
        """
        sha1 = obj['sha1']
        dois = [d.lower() for d in obj.get('dois', [])]

        # lookup sha1, or create new entity
        fe = None
        if not self.skip_file_updates:
            try:
                fe = self.api.lookup_file(sha1=sha1)
            except fatcat_client.rest.ApiException as err:
                # 404 just means "no such file yet"
                if err.status != 404:
                    raise
        if fe is None:
            fe = fatcat_client.FileEntity(
                sha1=sha1,
                release_ids=[],
                urls=[],
            )

        # lookup dois
        matched_releases = set()
        for doi in dois:
            try:
                release = self.api.lookup_release(doi=doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise
                release = None
            if release is None:
                print("DOI not found: {}".format(doi))
            else:
                matched_releases.add(release.ident)
        if not matched_releases:
            return None
        # Compare set to set: a list never equals a set, so comparing
        # fe.release_ids directly would never detect "nothing new".
        existing_releases = set(fe.release_ids or [])
        if existing_releases == matched_releases:
            return None
        fe.release_ids = sorted(matched_releases | existing_releases)

        # parse URLs and CDX
        if fe.urls is None:
            fe.urls = []
        # Kept up to date as URLs are appended, so that one record cannot add
        # the same URL twice.
        known_urls = [feu.url for feu in fe.urls]
        # 'urls' is the documented key; 'url' is accepted for older input.
        raw_urls = obj.get('urls') or obj.get('url') or []
        if isinstance(raw_urls, str):
            raw_urls = [raw_urls]
        for raw in raw_urls:
            self._add_url(fe, known_urls, raw)
        for cdx in obj.get('cdx', []):
            original = cdx['url']
            wayback = "https://web.archive.org/web/{}/{}".format(
                cdx['dt'],
                original)
            if wayback not in known_urls:
                known_urls.append(wayback)
                fe.urls.append(
                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
            self._add_url(fe, known_urls, original)

        if obj.get('size') is not None:
            fe.size = int(obj['size'])
        # Keep whatever the entity already has when the input lacks (or
        # nulls out) a hash; md5 must fall back to md5, not sha256.
        fe.sha256 = obj.get('sha256') or fe.sha256
        fe.md5 = obj.get('md5') or fe.md5
        if obj.get('mimetype') is None:
            if fe.mimetype is None:
                fe.mimetype = self.default_mime
        else:
            fe.mimetype = obj.get('mimetype')
        return fe

    def create_row(self, row, editgroup_id=None):
        """
        Process a single JSON line: create or update one file entity.

        Rows for which parse_matched_dict() returns None are skipped.
        """
        obj = json.loads(row)
        fe = self.parse_matched_dict(obj)
        if fe is None:
            return
        if fe.ident is None:
            self.api.create_file(fe, editgroup_id=editgroup_id)
            self.counts['insert'] += 1
        else:
            self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id)
            self.counts['update'] += 1

    def create_batch(self, batch):
        """Reads and processes in batches (not API-call-per-line)

        Existing files are updated one by one inside a fresh editgroup that
        is accepted at the end; new files are sent in a single auto-accepted
        batch call.
        """
        parsed = [self.parse_matched_dict(json.loads(line))
                  for line in batch if line is not None]
        entities = [fe for fe in parsed if fe is not None]
        new_objects = [fe for fe in entities if fe.ident is None]
        update_objects = [fe for fe in entities if fe.ident is not None]
        if update_objects:
            update_eg = self._editgroup().editgroup_id
            for fe in update_objects:
                self.api.update_file(fe.ident, fe, editgroup_id=update_eg)
            self.api.accept_editgroup(update_eg)
        if new_objects:
            self.api.create_file_batch(new_objects, autoaccept="true")
        self.counts['update'] += len(update_objects)
        self.counts['insert'] += len(new_objects)