Merge branch 'kafka'

author: Bryan Newbold <bnewbold@robocracy.org> 2018-11-12 23:13:22 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-11-12 23:13:22 -0800
commit: 055c464deea8cdaccf3ed384995d4409b0f51409 (patch)
tree: 582e3d62d192d51fafed0c5e3321fd21a3a42d34 /python/fatcat
parent: 6828313ecf2fea06e500f447f3abb92e5651c1fa (diff)
parent: 942f23257bcf1059a2502a3b019ce5af1bde7de5 (diff)
download: fatcat-055c464deea8cdaccf3ed384995d4409b0f51409.tar.gz
fatcat-055c464deea8cdaccf3ed384995d4409b0f51409.zip
4 files changed, 294 insertions, 0 deletions
diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py
new file mode 100644
index 00000000..e341ea32
--- /dev/null
+++ b/python/fatcat/changelog_workers.py
@@ -0,0 +1,122 @@
+
+import json
+import time
+from itertools import islice
+from fatcat.worker_common import FatcatWorker
+from pykafka.common import OffsetType
+
+
+class FatcatChangelogWorker(FatcatWorker):
+    """
+    Periodically polls the fatcat API looking for new changelogs. When they are
+    found, fetch them and push (as JSON) into a Kafka topic.
+    """
+
+    def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
+        # TODO: should be offset=0
+        super().__init__(kafka_hosts=kafka_hosts,
+                         produce_topic=produce_topic,
+                         api_host_url=api_host_url)
+        self.poll_interval = poll_interval
+        self.offset = offset    # the fatcat changelog offset, not the kafka offset
+
+    def most_recent_message(self, topic):
+        """
+        Tries to fetch the most recent message from a given topic.
+        This only makes sense for single partition topics, though could be
+        extended with "last N" behavior.
+
+        Following "Consuming the last N messages from a topic"
+        from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns
+        """
+        consumer = topic.get_simple_consumer(
+            auto_offset_reset=OffsetType.LATEST,
+            reset_offset_on_start=True)
+        offsets = [(p, op.last_offset_consumed - 1)
+                    for p, op in consumer._partitions.items()]
+        offsets = [(p, (o if o > -1 else -2)) for p, o in offsets]
+        if -2 in [o for p, o in offsets]:
+            return None
+        else:
+            consumer.reset_offsets(offsets)
+            msg = islice(consumer, 1)
+            if msg:
+                return list(msg)[0].value
+            else:
+                return None
+
+    def run(self):
+        topic = self.kafka.topics[self.produce_topic]
+        # On start, try to consume the most recent from the topic, and using
+        # that as the starting offset. Note that this is a single-partition
+        # topic
+        if self.offset is None:
+            print("Checking for most recent changelog offset...")
+            msg = self.most_recent_message(topic)
+            if msg:
+                self.offset = json.loads(msg.decode('utf-8'))['index']
+            else:
+                self.offset = 1
+
+        with topic.get_sync_producer() as producer:
+            while True: 
+                latest = int(self.api.get_changelog(limit=1)[0].index)
+                if latest > self.offset:
+                    print("Fetching changelogs from {} through {}".format(
+                        self.offset+1, latest))
+                for i in range(self.offset+1, latest+1):
+                    cle = self.api.get_changelog_entry(i)
+                    obj = self.api.api_client.sanitize_for_serialization(cle)
+                    producer.produce(
+                        message=json.dumps(obj).encode('utf-8'),
+                        partition_key=None,
+                        timestamp=None,
+                        #XXX: timestamp=cle.timestamp,
+                    )
+                    self.offset = i
+                print("Sleeping {} seconds...".format(self.poll_interval))
+                time.sleep(self.poll_interval)
+
+
+class FatcatEntityUpdatesWorker(FatcatWorker):
+    """
+    Consumes from the changelog topic and publishes expanded entities (fetched
+    from API) to update topics.
+
+    For now, only release updates are published.
+    """
+
+    def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic):
+        super().__init__(kafka_hosts=kafka_hosts,
+                         consume_topic=consume_topic,
+                         api_host_url=api_host_url)
+        self.release_topic = release_topic
+        self.consumer_group = "entity-updates"
+
+    def run(self):
+        changelog_topic = self.kafka.topics[self.consume_topic]
+        release_topic = self.kafka.topics[self.release_topic]
+
+        consumer = changelog_topic.get_balanced_consumer(
+            consumer_group=self.consumer_group,
+            managed=True,
+            auto_offset_reset=OffsetType.LATEST,
+            reset_offset_on_start=False,
+        )
+
+        with release_topic.get_sync_producer() as producer:
+            for msg in consumer:
+                cle = json.loads(msg.value.decode('utf-8'))
+                #print(cle)
+                release_edits = cle['editgroup']['edits']['releases']
+                for re in release_edits:
+                    ident = re['ident']
+                    release = self.api.get_release(ident, expand="files,container")
+                    release_dict = self.api.api_client.sanitize_for_serialization(release)
+                    producer.produce(
+                        message=json.dumps(release_dict).encode('utf-8'),
+                        partition_key=ident.encode('utf-8'),
+                        timestamp=None,
+                    )
+                consumer.commit_offsets()
+
diff --git a/python/fatcat/elastic_workers.py b/python/fatcat/elastic_workers.py
new file mode 100644
index 00000000..3d2e9c39
--- /dev/null
+++ b/python/fatcat/elastic_workers.py
@@ -0,0 +1,47 @@
+
+import json
+import time
+import requests
+from fatcat.worker_common import FatcatWorker
+from fatcat_client.models import ReleaseEntity
+from fatcat.entity_helpers import *
+from pykafka.common import OffsetType
+
+
+class FatcatElasticReleaseWorker(FatcatWorker):
+    """
+    Consumes from release-updates topic and pushes into (presumably local)
+    elasticsearch.
+
+    Uses a consumer group to manage offset.
+    """
+
+    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
+            elastic_backend="http://localhost:9200", elastic_index="fatcat"):
+        super().__init__(kafka_hosts=kafka_hosts,
+                         consume_topic=consume_topic,
+                         api_host_url=None)
+        self.consumer_group = "elastic-updates"
+        self.elastic_backend = elastic_backend
+        self.elastic_index = elastic_index
+
+    def run(self):
+        consume_topic = self.kafka.topics[self.consume_topic]
+
+        consumer = consume_topic.get_balanced_consumer(
+            consumer_group=self.consumer_group,
+            managed=True,
+        )
+
+        for msg in consumer:
+            json_str = msg.value.decode('utf-8')
+            release = entity_from_json(json_str, ReleaseEntity)
+            #print(release)
+            elastic_endpoint = "{}/{}/release/{}".format(
+                self.elastic_backend,
+                self.elastic_index,
+                release.ident)
+            print("Updating document: {}".format(elastic_endpoint))
+            resp = requests.post(elastic_endpoint, json=release.to_elastic_dict())
+            assert resp.status_code in (200, 201)
+            consumer.commit_offsets()
diff --git a/python/fatcat/entity_helpers.py b/python/fatcat/entity_helpers.py
new file mode 100644
index 00000000..c454536b
--- /dev/null
+++ b/python/fatcat/entity_helpers.py
@@ -0,0 +1,100 @@
+
+import collections
+from fatcat_client.models import ReleaseEntity
+from fatcat_client.api_client import ApiClient
+
+def entity_to_json(entity):
+    ac = ApiClient()
+    return ac.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type):
+    """
+    Hack to take advantage of the code-generated deserialization code
+    """
+    ac = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return ac.deserialize(thing, entity_type)
+
+def release_elastic_dict(release):
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    Returns: dict
+    """
+
+    if release.state != 'active':
+        raise ValueError("Entity is not 'active'")
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = release.ident,
+        revision = release.revision,
+        title = release.title,
+        release_type = release.release_type,
+        release_status = release.release_status,
+        language = release.language,
+        doi = release.doi,
+        pmid = release.pmid,
+        pmcid = release.pmcid,
+        isbn13 = release.isbn13,
+        core_id = release.core_id,
+        wikidata_qid = release.wikidata_qid
+    )
+
+    if release.release_date:
+        # TODO: resolve why this can be either a string or datetime
+        if type(release.release_date) == str:
+            t['release_date'] = release.release_date
+        else:
+            t['release_date'] = release.release_date.strftime('%F')
+
+    container = release.container
+    container_is_kept = False
+    if container:
+        t['publisher'] = container.publisher
+        t['container_name'] = container.name
+        t['container_issnl'] = container.issnl
+        container_extra = container.extra
+        if container_extra:
+            t['container_is_oa'] = container_extra.get('is_oa')
+            container_is_kept = container_extra.get('is_kept', False)
+            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+    else:
+        t['publisher'] = release.publisher
+
+    files = release.files or []
+    t['file_count'] = len(files)
+    in_wa = False
+    in_ia = False
+    t['file_pdf_url'] = None
+    for f in files:
+        is_pdf = 'pdf' in f.get('mimetype', '')
+        for url in f.get('urls', []):
+            if url.get('rel', '') == 'webarchive':
+                in_wa = True
+            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+                in_ia = True
+                if is_pdf:
+                    t['file_pdf_url'] = url['url']
+            if not t['file_pdf_url'] and is_pdf:
+                t['file_pdf_url'] = url['url']
+    t['file_in_webarchive'] = in_wa
+    t['file_in_ia'] = in_ia
+
+    extra = release.extra or dict()
+    if extra:
+        t['in_shadow'] = extra.get('in_shadow')
+        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
+            t['container_is_longtail_oa'] = True
+    t['any_abstract'] = bool(release.abstracts)
+    t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+    return t
diff --git a/python/fatcat/worker_common.py b/python/fatcat/worker_common.py
new file mode 100644
index 00000000..77ea2c15
--- /dev/null
+++ b/python/fatcat/worker_common.py
@@ -0,0 +1,25 @@
+
+import re
+import sys
+import csv
+import json
+import itertools
+import fatcat_client
+from pykafka import KafkaClient
+from fatcat_client.rest import ApiException
+
+
+class FatcatWorker:
+    """
+    Common code for for Kafka producers and consumers.
+    """
+
+    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None):
+        if api_host_url:
+            conf = fatcat_client.Configuration()
+            conf.host = api_host_url
+            self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
+        self.produce_topic = produce_topic
+        self.consume_topic = consume_topic
+
author	Bryan Newbold <bnewbold@robocracy.org>	2018-11-12 23:13:22 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-11-12 23:13:22 -0800
commit	055c464deea8cdaccf3ed384995d4409b0f51409 (patch)
tree	582e3d62d192d51fafed0c5e3321fd21a3a42d34 /python/fatcat
parent	6828313ecf2fea06e500f447f3abb92e5651c1fa (diff)
parent	942f23257bcf1059a2502a3b019ce5af1bde7de5 (diff)
download	fatcat-055c464deea8cdaccf3ed384995d4409b0f51409.tar.gz fatcat-055c464deea8cdaccf3ed384995d4409b0f51409.zip