import json
import time
from confluent_kafka import Consumer, Producer, KafkaException

from fatcat_tools.transforms import release_ingest_request, release_to_elasticsearch

from .worker_common import FatcatWorker, most_recent_message


class ChangelogWorker(FatcatWorker):
    """
    Periodically polls the fatcat API looking for new changelogs. When they are
    found, fetch them and push (as JSON) into a Kafka topic.
    """

    def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
        super().__init__(kafka_hosts=kafka_hosts,
                         produce_topic=produce_topic,
                         api=api)
        self.poll_interval = poll_interval
        self.offset = offset    # the fatcat changelog offset, not the kafka offset

    def run(self):

        # On start, try to consume the most recent from the topic, and using
        # that as the starting offset. Note that this is a single-partition
        # topic
        if self.offset is None:
            print("Checking for most recent changelog offset...")
            msg = most_recent_message(self.produce_topic, self.kafka_config)
            if msg:
                self.offset = json.loads(msg.decode('utf-8'))['index']
            else:
                self.offset = 0
            print("Most recent changelog index in Kafka seems to be {}".format(self.offset))

        def fail_fast(err, msg):
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            'delivery.report.only.error': True,
            'default.topic.config': {
                'request.required.acks': -1, # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        while True:
            latest = int(self.api.get_changelog(limit=1)[0].index)
            if latest > self.offset:
                print("Fetching changelogs from {} through {}".format(
                    self.offset+1, latest))
            for i in range(self.offset+1, latest+1):
                cle = self.api.get_changelog_entry(i)
                obj = self.api.api_client.sanitize_for_serialization(cle)
                producer.produce(
                    self.produce_topic,
                    json.dumps(obj).encode('utf-8'),
                    key=str(i),
                    on_delivery=fail_fast,
                    #NOTE timestamp could be timestamp=cle.timestamp (?)
                )
                self.offset = i
            producer.flush()
            print("Sleeping {} seconds...".format(self.poll_interval))
            time.sleep(self.poll_interval)


class EntityUpdatesWorker(FatcatWorker):
    """
    Consumes from the changelog topic and publishes expanded entities (fetched
    from API) to update topics.
    """

    def __init__(self, api, kafka_hosts, consume_topic, release_topic,
            file_topic, container_topic, ingest_file_request_topic,
            work_ident_topic, poll_interval=5.0):
        super().__init__(kafka_hosts=kafka_hosts,
                         consume_topic=consume_topic,
                         api=api)
        self.release_topic = release_topic
        self.file_topic = file_topic
        self.container_topic = container_topic
        self.ingest_file_request_topic = ingest_file_request_topic
        self.work_ident_topic = work_ident_topic
        self.poll_interval = poll_interval
        self.consumer_group = "entity-updates"
        self.ingest_oa_only = False
        self.ingest_pdf_doi_prefix_blocklist = [
            # gbif.org: many DOIs, not PDF fulltext
            "10.15468/",
            # ssl.fao.org: gene data
            "10.18730/",
            # plutof.ut.ee: gene data
            "10.15156/",
            # ba.e-pics.ethz.ch: swiss image (photo) archive
            "10.3932/",
            # ccdc.cam.ac.uk: crystal structures
            "10.5517/",
            # researchgate: mostly blocks our crawler
            "10.13140/",
            # springerlink: mostly blocks crawler
            "10.1007/",
            # nature group: mostly blocks crawler
            "10.1038/",
            # SAGE: mostly blocks crawler
            "10.1177/",
            # IOP: mostly blocks crawler
            "10.1088/",
            # JSTOR: mostly blocks crawler
            "10.2307/",
        ]
        self.live_pdf_ingest_doi_prefix_acceptlist = [
            # biorxiv and medrxiv
            "10.1101/",
            # the lancet (often hybrid OA)
            "10.1016/s0140-6736",
            "10.1016/s2213-2600",
            # journal of virology
            "10.1128/jvi.",
            # FEBS letters
            "10.1002/1873-3468.",
            # Journal of Neuroscience
            "10.1523/jneurosci.",
            # Chemical and pharmaceutical bulletin
            "10.1248/cpb.",
            # Japanese Journal of Radiological Technology
            "10.6009/jjrt.",
            # Seibutsu Butsuri
            "10.2142/biophys.",
            # Chemical Communications
            "10.1039/d0cc",
            # Yakugaku zasshi
            "10.1248/yakushi.",
            # bulletin AMS
            "10.1090/s0002-9904",
            # Current Biology
            "10.1016/j.cub.",
            # Antarctica A Keystone in a Changing World
            "10.3133/ofr",
            # Clinical Cancer Research
            "10.1158/1078-0432.",
            # Transactions of the Japan Society of Mechanical Engineers
            "10.1299/kikai",
            # protocols.io
            "10.17504/"
        ]

    def want_live_ingest(self, release, ingest_request):
        """
        This function looks at ingest requests and decides whether they are
        worth enqueing for ingest.

        In theory crawling all DOIs to a landing page is valuable.  It is
        intended to be an operational point of control to reduce load on daily
        ingest crawling (via wayback SPN).
        """

        link_source = ingest_request.get('ingest_request')
        ingest_type = ingest_request.get('ingest_type')
        doi = ingest_request.get('ext_ids', {}).get('doi')
        es = release_to_elasticsearch(release)

        is_document = release.release_type in (
            'article',
            'article-journal',
            'article-newspaper',
            'book',
            'chapter',
            'editorial',
            'interview',
            'legal_case',
            'legislation',
            'letter',
            'manuscript',
            'paper-conference',
            'patent',
            'peer_review',
            'post',
            'report',
            'retraction',
            'review',
            'review-book',
            'thesis',
        )
        is_not_pdf = release.release_type in (
            'component',
            'dataset',
            'figure',
            'graphic',
            'software',
            'stub',
        )

        # accept list sets a default "crawl it" despite OA metadata for
        # known-OA DOI prefixes
        in_acceptlist = False
        if doi:
            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
                if doi.startswith(prefix):
                    in_acceptlist = True

        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):

            # most datacite documents are in IRs and should be crawled
            is_datacite_doc = False
            if release.extra and ('datacite' in release.extra) and is_document:
                is_datacite_doc = True
            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                return False

        # big publishers *generally* have accurate OA metadata, use
        # preservation networks, and block our crawlers. So unless OA, or
        # explicitly on accept list, or not preserved, skip crawling
        if es.get('publisher_type') == 'big5' and es.get('is_preserved') and not (es['is_oa'] or in_acceptlist):
            return False

        # if ingest_type is pdf but release_type is almost certainly not a PDF,
        # skip it. This is mostly a datacite thing.
        if ingest_type == "pdf" and is_not_pdf:
            return False

        if ingest_type == "pdf" and doi:
            for prefix in self.ingest_pdf_doi_prefix_blocklist:
                if doi.startswith(prefix):
                    return False

        # figshare
        if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')):
            # don't crawl "most recent version" (aka "group") DOIs
            if not release.version:
                return False

        # zenodo
        if doi and doi.startswith('10.5281/'):
            # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
            if release.extra and release.extra.get('relations'):
                for rel in release.extra['relations']:
                    if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')):
                        return False

        return True

    def run(self):

        def fail_fast(err, msg):
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        def on_commit(err, partitions):
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                print(p)
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            print("Kafka consumer commit successful")
            pass

        def on_rebalance(consumer, partitions):
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            'group.id': self.consumer_group,
            'on_commit': fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            'enable.auto.commit': True,
            'enable.auto.offset.store': False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            'max.poll.interval.ms': 180000,
            'default.topic.config': {
                'auto.offset.reset': 'latest',
            },
        })
        consumer = Consumer(consumer_conf)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            'delivery.report.only.error': True,
            'default.topic.config': {
                'request.required.acks': -1, # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        consumer.subscribe([self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )
        print("Kafka consuming {}".format(self.consume_topic))

        while True:
            msg = consumer.poll(self.poll_interval)
            if not msg:
                print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval))
                continue
            if msg.error():
                raise KafkaException(msg.error())

            cle = json.loads(msg.value().decode('utf-8'))
            #print(cle)
            print("processing changelog index {}".format(cle['index']))
            release_ids = []
            new_release_ids = []
            file_ids = []
            fileset_ids = []
            webcapture_ids = []
            container_ids = []
            work_ids = []
            release_edits = cle['editgroup']['edits']['releases']
            for re in release_edits:
                release_ids.append(re['ident'])
                # filter to direct release edits which are not updates
                if not re.get('prev_revision') and not re.get('redirect_ident'):
                    new_release_ids.append(re['ident'])
            file_edits = cle['editgroup']['edits']['files']
            for e in file_edits:
                file_ids.append(e['ident'])
            fileset_edits = cle['editgroup']['edits']['filesets']
            for e in fileset_edits:
                fileset_ids.append(e['ident'])
            webcapture_edits = cle['editgroup']['edits']['webcaptures']
            for e in webcapture_edits:
                webcapture_ids.append(e['ident'])
            container_edits = cle['editgroup']['edits']['containers']
            for e in container_edits:
                container_ids.append(e['ident'])
            work_edits = cle['editgroup']['edits']['works']
            for e in work_edits:
                work_ids.append(e['ident'])

            # TODO: do these fetches in parallel using a thread pool?
            for ident in set(file_ids):
                file_entity = self.api.get_file(ident, expand=None)
                # update release when a file changes
                # TODO: also fetch old version of file and update any *removed*
                # release idents (and same for filesets, webcapture updates)
                release_ids.extend(file_entity.release_ids or [])
                file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
                producer.produce(
                    self.file_topic,
                    json.dumps(file_dict).encode('utf-8'),
                    key=ident.encode('utf-8'),
                    on_delivery=fail_fast,
                )

            # TODO: topic for fileset updates
            for ident in set(fileset_ids):
                fileset_entity = self.api.get_fileset(ident, expand=None)
                # update release when a fileset changes
                release_ids.extend(fileset_entity.release_ids or [])

            # TODO: topic for webcapture updates
            for ident in set(webcapture_ids):
                webcapture_entity = self.api.get_webcapture(ident, expand=None)
                # update release when a webcapture changes
                release_ids.extend(webcapture_entity.release_ids or [])

            for ident in set(container_ids):
                container = self.api.get_container(ident)
                container_dict = self.api.api_client.sanitize_for_serialization(container)
                producer.produce(
                    self.container_topic,
                    json.dumps(container_dict).encode('utf-8'),
                    key=ident.encode('utf-8'),
                    on_delivery=fail_fast,
                )

            for ident in set(release_ids):
                release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
                if release.work_id:
                    work_ids.append(release.work_id)
                release_dict = self.api.api_client.sanitize_for_serialization(release)
                producer.produce(
                    self.release_topic,
                    json.dumps(release_dict).encode('utf-8'),
                    key=ident.encode('utf-8'),
                    on_delivery=fail_fast,
                )
                # for ingest requests, filter to "new" active releases with no matched files
                if release.ident in new_release_ids:
                    ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                    if ir and not release.files and self.want_live_ingest(release, ir):
                        producer.produce(
                            self.ingest_file_request_topic,
                            json.dumps(ir).encode('utf-8'),
                            #key=None,
                            on_delivery=fail_fast,
                        )

            # send work updates (just ident and changelog metadata) to scholar for re-indexing
            for ident in set(work_ids):
                assert ident
                key = f"work_{ident}"
                work_ident_dict = dict(
                    key=key,
                    type="fatcat_work",
                    work_ident=ident,
                    updated=cle['timestamp'],
                    fatcat_changelog_index=cle['index'],
                )
                producer.produce(
                    self.work_ident_topic,
                    json.dumps(work_ident_dict).encode('utf-8'),
                    key=key.encode('utf-8'),
                    on_delivery=fail_fast,
                )

            producer.flush()
            # TODO: publish updated 'work' entities to a topic
            consumer.store_offsets(message=msg)