From d70abdd82955feba4eecdda24ff6d95f703e0598 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 2 Nov 2018 13:59:24 -0700
Subject: FatcatRelease: start wrapping entities with extra methods

---
 python/fatcat/__init__.py          |  1 +
 python/fatcat/crossref_importer.py |  5 ++-
 python/fatcat/release_model.py     | 85 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 python/fatcat/release_model.py

(limited to 'python/fatcat')

diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py
index aa12f972..b0492684 100644
--- a/python/fatcat/__init__.py
+++ b/python/fatcat/__init__.py
@@ -4,6 +4,7 @@ from flask_uuid import FlaskUUID
 from flask_debugtoolbar import DebugToolbarExtension
 from config import Config
 import fatcat_client
+from fatcat.release_model import FatcatRelease
 
 toolbar = DebugToolbarExtension()
 app = Flask(__name__)
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index 37005965..fbf666a3 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -6,6 +6,7 @@ import datetime
 import itertools
 import fatcat_client
 from fatcat.importer_common import FatcatImporter
+from fatcat import FatcatRelease
 
 
 class FatcatCrossrefImporter(FatcatImporter):
@@ -38,7 +39,7 @@ class FatcatCrossrefImporter(FatcatImporter):
     def parse_crossref_dict(self, obj):
         """
         obj is a python dict (parsed from json).
-        returns a ReleaseEntity
+        returns a FatcatRelease
         """
 
         # This work is out of scope if it doesn't have authors and a title
@@ -212,7 +213,7 @@ class FatcatCrossrefImporter(FatcatImporter):
         if release_date:
             release_date = release_date.isoformat() + "Z"
 
-        re = fatcat_client.ReleaseEntity(
+        re = FatcatRelease(
             work_id=None,
             title=obj['title'][0],
             contribs=contribs,
diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py
new file mode 100644
index 00000000..a584c00b
--- /dev/null
+++ b/python/fatcat/release_model.py
@@ -0,0 +1,85 @@
+
+from fatcat_client.models import ReleaseEntity
+
+class FatcatRelease(ReleaseEntity):
+    """
+    This is a wrapper class that extends the code-generated `ReleaseEntity`
+    class with extra methods.
+    """
+
+    def to_elastic_dict(self):
+        """
+        Converts from an entity model/schema to elasticsearch oriented schema.
+
+        Returns: dict
+        """
+
+        if self.state != 'active':
+            raise ValueError("Entity is not 'active'")
+
+        # First, the easy ones (direct copy)
+        t = dict(
+            ident = self.ident,
+            revision = self.revision,
+            title = self.title,
+            release_date = self.release_date,
+            release_type = self.release_type,
+            release_status = self.release_status,
+            language = self.language,
+            doi = self.doi,
+            pmid = self.pmid,
+            pmcid = self.pmcid,
+            isbn13 = self.isbn13,
+            core_id = self.core_id,
+            wikidata_qid = self.wikidata_qid
+        )
+
+        container = self.container
+        container_is_kept = False
+        if container:
+            t['publisher'] = container.publisher
+            t['container_name'] = container.name
+            t['container_issnl'] = container.issnl
+            container_extra = container.extra
+            if container_extra:
+                t['container_is_oa'] = container_extra.get('is_oa')
+                container_is_kept = container_extra.get('is_kept', False)
+                t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+        else:
+            t['publisher'] = self.publisher
+
+        files = self.files or []
+        t['file_count'] = len(files)
+        in_wa = False
+        in_ia = False
+        t['file_pdf_url'] = None
+        for f in files:
+            is_pdf = 'pdf' in f.get('mimetype', '')
+            for url in f.get('urls', []):
+                if url.get('rel', '') == 'webarchive':
+                    in_wa = True
+                if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+                    in_ia = True
+                    if is_pdf:
+                        t['file_pdf_url'] = url['url']
+            if not t['file_pdf_url'] and is_pdf:
+                t['file_pdf_url'] = url['url']
+        t['file_in_webarchive'] = in_wa
+        t['file_in_ia'] = in_ia
+
+        extra = self.extra or dict()
+        if extra:
+            t['in_shadow'] = extra.get('in_shadow')
+            if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
+                t['container_is_longtail_oa'] = True
+        t['any_abstract'] = bool(self.abstracts)
+        t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+
+        t['ref_count'] = len(self.refs or [])
+        t['contrib_count'] = len(self.contribs or [])
+        contrib_names = []
+        for c in (self.contribs or []):
+            if c.raw_name:
+                contrib_names.append(c.raw_name)
+        t['contrib_names'] = contrib_names
+        return t
--
cgit v1.2.3
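As a usage sketch (not part of the patch): the wrapper is meant to be called on an expanded, active entity, and `to_elastic_dict()` refuses anything that is not 'active'. The constructor arguments and field values below are illustrative only, assuming the code-generated `ReleaseEntity` constructor accepts these keywords:

    # Hypothetical example, not from the patch
    from fatcat.release_model import FatcatRelease

    release = FatcatRelease(title="Example Paper")
    release.state = 'active'    # to_elastic_dict() raises ValueError otherwise
    doc = release.to_elastic_dict()
    # doc is a flat dict (ident, title, file_count, ref_count, ...) suitable
    # for indexing as a single elasticsearch document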
From 87b8f1c710c7c94ed9c1b040f75c1601591a214b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 4 Nov 2018 17:25:41 -0800
Subject: to_json() method for FatcatRelease

---
 python/fatcat/release_model.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'python/fatcat')

diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py
index a584c00b..aa0d2d9f 100644
--- a/python/fatcat/release_model.py
+++ b/python/fatcat/release_model.py
@@ -1,5 +1,6 @@
 
 from fatcat_client.models import ReleaseEntity
+from fatcat_client.api_client import ApiClient
 
 class FatcatRelease(ReleaseEntity):
     """
@@ -83,3 +84,7 @@ class FatcatRelease(ReleaseEntity):
             contrib_names.append(c.raw_name)
         t['contrib_names'] = contrib_names
         return t
+
+    def to_json(self):
+        ac = ApiClient()
+        return ac.sanitize_for_serialization(self)
--
cgit v1.2.3
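A sketch of the semantics, assuming swagger-codegen's ApiClient behavior: `sanitize_for_serialization()` returns JSON-safe primitives (nested dicts, lists, strings, numbers) rather than an encoded string, so despite its name `to_json()` leaves the final encoding to the caller, as the later worker code does with `json.dumps()`:

    # Hypothetical example; the title value is illustrative
    import json
    from fatcat.release_model import FatcatRelease

    release = FatcatRelease(title="Example Paper")
    obj = release.to_json()       # a JSON-safe dict, not a string
    json_str = json.dumps(obj)    # the actual serialization step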
From 881b46e3b1682974f48fc196f483c3fa2648b998 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 4 Nov 2018 17:26:28 -0800
Subject: first-draft kafka workers (changelog, release_update)

---
 python/fatcat/changelog_workers.py | 120 +++++++++++++++++++++++++++++++++++++
 python/fatcat/worker_common.py     |  25 ++++++++
 python/fatcat_worker.py            |  52 ++++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 python/fatcat/changelog_workers.py
 create mode 100644 python/fatcat/worker_common.py
 create mode 100755 python/fatcat_worker.py

(limited to 'python/fatcat')

diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py
new file mode 100644
index 00000000..5f8621cf
--- /dev/null
+++ b/python/fatcat/changelog_workers.py
@@ -0,0 +1,120 @@
+
+import json
+import time
+from itertools import islice
+from fatcat.worker_common import FatcatWorker
+from pykafka.common import OffsetType
+
+
+class FatcatChangelogWorker(FatcatWorker):
+    """
+    Periodically polls the fatcat API looking for new changelogs. When they
+    are found, fetches them and pushes them (as JSON) into a Kafka topic.
+    """
+
+    def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
+        # TODO: should be offset=0
+        super().__init__(kafka_hosts=kafka_hosts,
+                         produce_topic=produce_topic,
+                         api_host_url=api_host_url)
+        self.poll_interval = poll_interval
+        self.offset = offset    # the fatcat changelog offset, not the kafka offset
+
+    def most_recent_message(self, topic):
+        """
+        Tries to fetch the most recent message from a given topic.
+        This only makes sense for single partition topics, though it could be
+        extended with "last N" behavior.
+
+        Following "Consuming the last N messages from a topic"
+        from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns
+        """
+        consumer = topic.get_simple_consumer(
+            auto_offset_reset=OffsetType.LATEST,
+            reset_offset_on_start=True)
+        offsets = [(p, op.last_offset_consumed - 1)
+                   for p, op in consumer._partitions.items()]
+        offsets = [(p, (o if o > -1 else -2)) for p, o in offsets]
+        if -2 in [o for p, o in offsets]:
+            return None
+        else:
+            consumer.reset_offsets(offsets)
+            msg = islice(consumer, 1)
+            if msg:
+                return list(msg)[0].value
+            else:
+                return None
+
+    def run(self):
+        topic = self.kafka.topics[self.produce_topic]
+        # On start, try to consume the most recent message from the topic and
+        # use that as the starting offset. Note that this is a
+        # single-partition topic
+        if self.offset is None:
+            print("Checking for most recent changelog offset...")
+            msg = self.most_recent_message(topic)
+            if msg:
+                self.offset = json.loads(msg.decode('utf-8'))['index']
+            else:
+                self.offset = 1
+
+        with topic.get_sync_producer() as producer:
+            while True:
+                latest = int(self.api.get_changelog(limit=1)[0].index)
+                if latest > self.offset:
+                    print("Fetching changelogs from {} through {}".format(
+                        self.offset+1, latest))
+                for i in range(self.offset+1, latest+1):
+                    cle = self.api.get_changelog_entry(i)
+                    obj = self.api.api_client.sanitize_for_serialization(cle)
+                    producer.produce(
+                        message=json.dumps(obj).encode('utf-8'),
+                        partition_key=None,
+                        timestamp=None,
+                        #XXX: timestamp=cle.timestamp,
+                    )
+                    self.offset = i
+                print("Sleeping {} seconds...".format(self.poll_interval))
+                time.sleep(self.poll_interval)
+
+
+class FatcatEntityUpdatesWorker(FatcatWorker):
+    """
+    Consumes from the changelog topic and publishes expanded entities (fetched
+    from API) to update topics.
+
+    For now, only release updates are published.
+    """
+
+    def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic):
+        super().__init__(kafka_hosts=kafka_hosts,
+                         consume_topic=consume_topic,
+                         api_host_url=api_host_url)
+        self.release_topic = release_topic
+        self.consumer_group = "entity-updates"
+
+    def run(self):
+        changelog_topic = self.kafka.topics[self.consume_topic]
+        release_topic = self.kafka.topics[self.release_topic]
+
+        consumer = changelog_topic.get_simple_consumer(
+            consumer_group=self.consumer_group,
+            auto_offset_reset=OffsetType.LATEST,
+            reset_offset_on_start=False,
+        )
+
+        with release_topic.get_sync_producer() as producer:
+            for msg in consumer:
+                cle = json.loads(msg.value.decode('utf-8'))
+                #print(cle)
+                release_edits = cle['editgroup']['edits']['releases']
+                for re in release_edits:
+                    ident = re['ident']
+                    release = self.api.get_release(ident, expand="files,container")
+                    release_dict = self.api.api_client.sanitize_for_serialization(release)
+                    producer.produce(
+                        message=json.dumps(release_dict).encode('utf-8'),
+                        partition_key=ident.encode('utf-8'),
+                        timestamp=None,
+                    )
diff --git a/python/fatcat/worker_common.py b/python/fatcat/worker_common.py
new file mode 100644
index 00000000..77ea2c15
--- /dev/null
+++ b/python/fatcat/worker_common.py
@@ -0,0 +1,25 @@
+
+import re
+import sys
+import csv
+import json
+import itertools
+import fatcat_client
+from pykafka import KafkaClient
+from fatcat_client.rest import ApiException
+
+
+class FatcatWorker:
+    """
+    Common code for Kafka producers and consumers.
+    """
+
+    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None):
+        if api_host_url:
+            conf = fatcat_client.Configuration()
+            conf.host = api_host_url
+            self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
+        self.produce_topic = produce_topic
+        self.consume_topic = consume_topic
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
new file mode 100755
index 00000000..cc11beca
--- /dev/null
+++ b/python/fatcat_worker.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+from fatcat.changelog_workers import FatcatChangelogWorker, FatcatEntityUpdatesWorker
+
+def run_changelog_worker(args):
+    topic = "fatcat-{}.changelog".format(args.env)
+    worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
+        args.poll_interval)
+    worker.run()
+
+def run_entity_updates_worker(args):
+    changelog_topic = "fatcat-{}.changelog".format(args.env)
+    release_topic = "fatcat-{}.release-updates".format(args.env)
+    worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
+        changelog_topic, release_topic)
+    worker.run()
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug',
+        action='store_true',
+        help="enable debug logging")
+    parser.add_argument('--api-host-url',
+        default="http://localhost:9411/v0",
+        help="fatcat API host/port to use")
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--env',
+        default="qa",
+        help="Kafka topic namespace to use (eg, prod, qa)")
+    subparsers = parser.add_subparsers()
+
+    sub_changelog = subparsers.add_parser('changelog')
+    sub_changelog.set_defaults(func=run_changelog_worker)
+    sub_changelog.add_argument('--poll-interval',
+        help="how long to wait between polling (seconds)",
+        default=10.0, type=float)
+
+    sub_entity_updates = subparsers.add_parser('entity-updates')
+    sub_entity_updates.set_defaults(func=run_entity_updates_worker)
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do!")
+        sys.exit(-1)
+    args.func(args)
+
+if __name__ == '__main__':
+    main()
--
cgit v1.2.3
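The new `fatcat_worker.py` CLI is one way to drive these workers; they can also be constructed directly. A minimal sketch, assuming a local Kafka broker and fatcat API (the host and topic values below are the CLI's defaults, not requirements):

    from fatcat.changelog_workers import FatcatChangelogWorker

    worker = FatcatChangelogWorker(
        api_host_url="http://localhost:9411/v0",
        kafka_hosts="localhost:9092",
        produce_topic="fatcat-qa.changelog",
        poll_interval=10.0)
    worker.run()    # blocks: polls the changelog API and produces JSON messages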
From e5486378d8d7adf8974b1f1ebaf0400445ba8791 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 4 Nov 2018 18:52:22 -0800
Subject: elastic release worker

---
 python/fatcat/elastic_workers.py | 46 ++++++++++++++++++++++++++++++++++++++++
 python/fatcat/release_model.py   | 15 ++++++++++++-
 python/fatcat_worker.py          | 10 +++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 python/fatcat/elastic_workers.py

(limited to 'python/fatcat')

diff --git a/python/fatcat/elastic_workers.py b/python/fatcat/elastic_workers.py
new file mode 100644
index 00000000..c3226981
--- /dev/null
+++ b/python/fatcat/elastic_workers.py
@@ -0,0 +1,46 @@
+
+import json
+import time
+import requests
+from fatcat.worker_common import FatcatWorker
+from fatcat.release_model import FatcatRelease
+from pykafka.common import OffsetType
+
+
+class FatcatElasticReleaseWorker(FatcatWorker):
+    """
+    Consumes from release-updates topic and pushes into (presumably local)
+    elasticsearch.
+
+    Uses a consumer group to manage offset.
+    """
+
+    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
+            elastic_backend="http://localhost:9200", elastic_index="fatcat"):
+        super().__init__(kafka_hosts=kafka_hosts,
+                         consume_topic=consume_topic,
+                         api_host_url=None)
+        self.consumer_group = "elastic-updates"
+        self.elastic_backend = elastic_backend
+        self.elastic_index = elastic_index
+
+    def run(self):
+        consume_topic = self.kafka.topics[self.consume_topic]
+
+        consumer = consume_topic.get_balanced_consumer(
+            consumer_group=self.consumer_group,
+            managed=True,
+        )
+
+        for msg in consumer:
+            json_str = msg.value.decode('utf-8')
+            release = FatcatRelease.from_json(json_str)
+            #print(release)
+            elastic_endpoint = "{}/{}/release/{}".format(
+                self.elastic_backend,
+                self.elastic_index,
+                release.ident)
+            print("Updating document: {}".format(elastic_endpoint))
+            resp = requests.post(elastic_endpoint, json=release.to_elastic_dict())
+            assert resp.status_code in (200, 201)
+            consumer.commit_offsets()
diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py
index aa0d2d9f..403fc671 100644
--- a/python/fatcat/release_model.py
+++ b/python/fatcat/release_model.py
@@ -1,4 +1,5 @@
 
+import collections
 from fatcat_client.models import ReleaseEntity
 from fatcat_client.api_client import ApiClient
 
@@ -23,7 +24,6 @@ class FatcatRelease(ReleaseEntity):
             ident = self.ident,
             revision = self.revision,
             title = self.title,
-            release_date = self.release_date,
             release_type = self.release_type,
             release_status = self.release_status,
             language = self.language,
@@ -35,6 +35,9 @@ class FatcatRelease(ReleaseEntity):
             wikidata_qid = self.wikidata_qid
         )
 
+        if self.release_date:
+            t['release_date'] = self.release_date.strftime('%F')
+
         container = self.container
         container_is_kept = False
         if container:
@@ -88,3 +91,13 @@ class FatcatRelease(ReleaseEntity):
     def to_json(self):
         ac = ApiClient()
         return ac.sanitize_for_serialization(self)
+
+    def from_json(json_str):
+        """
+        Hack to take advantage of the code-generated deserialization code
+        """
+        ac = ApiClient()
+        thing = collections.namedtuple('Thing', ['data'])
+        thing.data = json_str
+        return ac.deserialize(thing, FatcatRelease)
+
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index cc11beca..50ff0fb7 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -3,6 +3,7 @@
 import sys
 import argparse
 from fatcat.changelog_workers import FatcatChangelogWorker, FatcatEntityUpdatesWorker
+from fatcat.elastic_workers import FatcatElasticReleaseWorker
 
 def run_changelog_worker(args):
     topic = "fatcat-{}.changelog".format(args.env)
@@ -17,6 +18,12 @@ def run_entity_updates_worker(args):
         changelog_topic, release_topic)
     worker.run()
 
+def run_elastic_release_worker(args):
+    consume_topic = "fatcat-{}.release-updates".format(args.env)
+    worker = FatcatElasticReleaseWorker(args.kafka_hosts,
+        consume_topic)
+    worker.run()
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--debug',
@@ -42,6 +49,9 @@ def main():
     sub_entity_updates = subparsers.add_parser('entity-updates')
     sub_entity_updates.set_defaults(func=run_entity_updates_worker)
 
+    sub_elastic_release = subparsers.add_parser('elastic-release')
+    sub_elastic_release.set_defaults(func=run_elastic_release_worker)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
--
cgit v1.2.3

From 53d91dbefeb598539b02d18fad33f79babe2bb94 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 4 Nov 2018 18:52:48 -0800
Subject: switch entity update worker to use balanced/managed consumer

---
 python/fatcat/changelog_workers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python/fatcat')

diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py
index 5f8621cf..e341ea32 100644
--- a/python/fatcat/changelog_workers.py
+++ b/python/fatcat/changelog_workers.py
@@ -97,8 +97,9 @@ class FatcatEntityUpdatesWorker(FatcatWorker):
         changelog_topic = self.kafka.topics[self.consume_topic]
         release_topic = self.kafka.topics[self.release_topic]
 
-        consumer = changelog_topic.get_simple_consumer(
+        consumer = changelog_topic.get_balanced_consumer(
            consumer_group=self.consumer_group,
+            managed=True,
             auto_offset_reset=OffsetType.LATEST,
             reset_offset_on_start=False,
         )
@@ -117,4 +118,5 @@ class FatcatEntityUpdatesWorker(FatcatWorker):
                         partition_key=ident.encode('utf-8'),
                         timestamp=None,
                     )
+                consumer.commit_offsets()
--
cgit v1.2.3
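The move to `get_balanced_consumer(..., managed=True)` delegates partition assignment and offset storage to Kafka's group coordinator instead of tracking offsets in a simple consumer. A standalone sketch of the pattern (pykafka API; the topic and group names are placeholders):

    from pykafka import KafkaClient

    client = KafkaClient(hosts="localhost:9092", broker_version="1.0.0")
    topic = client.topics["fatcat-qa.release-updates"]    # placeholder topic
    consumer = topic.get_balanced_consumer(
        consumer_group="example-group",    # hypothetical group name
        managed=True)                      # broker-managed group coordination
    for msg in consumer:
        print(msg.value)           # stand-in for real message handling
        consumer.commit_offsets()  # checkpoint only after successful handling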
From 0109c3a75e201e81036ad031d93602ba6c46ba08 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 12 Nov 2018 22:54:04 -0800
Subject: Revert "FatcatRelease: start wrapping entities with extra methods"

This reverts commit d70abdd82955feba4eecdda24ff6d95f703e0598.

Decided this wasn't the right approach.
---
 python/fatcat/__init__.py          |   1 -
 python/fatcat/crossref_importer.py |   5 +-
 python/fatcat/release_model.py     | 103 -------------------------------------
 python/tests/release_model.py      |  15 ------
 4 files changed, 2 insertions(+), 122 deletions(-)
 delete mode 100644 python/fatcat/release_model.py
 delete mode 100644 python/tests/release_model.py

(limited to 'python/fatcat')

diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py
index b0492684..aa12f972 100644
--- a/python/fatcat/__init__.py
+++ b/python/fatcat/__init__.py
@@ -4,7 +4,6 @@ from flask_uuid import FlaskUUID
 from flask_debugtoolbar import DebugToolbarExtension
 from config import Config
 import fatcat_client
-from fatcat.release_model import FatcatRelease
 
 toolbar = DebugToolbarExtension()
 app = Flask(__name__)
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index fbf666a3..37005965 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -6,7 +6,6 @@ import datetime
 import itertools
 import fatcat_client
 from fatcat.importer_common import FatcatImporter
-from fatcat import FatcatRelease
 
 
 class FatcatCrossrefImporter(FatcatImporter):
@@ -39,7 +38,7 @@ class FatcatCrossrefImporter(FatcatImporter):
     def parse_crossref_dict(self, obj):
         """
         obj is a python dict (parsed from json).
-        returns a FatcatRelease
+        returns a ReleaseEntity
         """
 
         # This work is out of scope if it doesn't have authors and a title
@@ -213,7 +212,7 @@ class FatcatCrossrefImporter(FatcatImporter):
         if release_date:
             release_date = release_date.isoformat() + "Z"
 
-        re = FatcatRelease(
+        re = fatcat_client.ReleaseEntity(
             work_id=None,
             title=obj['title'][0],
             contribs=contribs,
diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py
deleted file mode 100644
index 403fc671..00000000
--- a/python/fatcat/release_model.py
+++ /dev/null
@@ -1,103 +0,0 @@
-
-import collections
-from fatcat_client.models import ReleaseEntity
-from fatcat_client.api_client import ApiClient
-
-class FatcatRelease(ReleaseEntity):
-    """
-    This is a wrapper class that extends the code-generated `ReleaseEntity`
-    class with extra methods.
-    """
-
-    def to_elastic_dict(self):
-        """
-        Converts from an entity model/schema to elasticsearch oriented schema.
-
-        Returns: dict
-        """
-
-        if self.state != 'active':
-            raise ValueError("Entity is not 'active'")
-
-        # First, the easy ones (direct copy)
-        t = dict(
-            ident = self.ident,
-            revision = self.revision,
-            title = self.title,
-            release_type = self.release_type,
-            release_status = self.release_status,
-            language = self.language,
-            doi = self.doi,
-            pmid = self.pmid,
-            pmcid = self.pmcid,
-            isbn13 = self.isbn13,
-            core_id = self.core_id,
-            wikidata_qid = self.wikidata_qid
-        )
-
-        if self.release_date:
-            t['release_date'] = self.release_date.strftime('%F')
-
-        container = self.container
-        container_is_kept = False
-        if container:
-            t['publisher'] = container.publisher
-            t['container_name'] = container.name
-            t['container_issnl'] = container.issnl
-            container_extra = container.extra
-            if container_extra:
-                t['container_is_oa'] = container_extra.get('is_oa')
-                container_is_kept = container_extra.get('is_kept', False)
-                t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
-        else:
-            t['publisher'] = self.publisher
-
-        files = self.files or []
-        t['file_count'] = len(files)
-        in_wa = False
-        in_ia = False
-        t['file_pdf_url'] = None
-        for f in files:
-            is_pdf = 'pdf' in f.get('mimetype', '')
-            for url in f.get('urls', []):
-                if url.get('rel', '') == 'webarchive':
-                    in_wa = True
-                if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
-                    in_ia = True
-                    if is_pdf:
-                        t['file_pdf_url'] = url['url']
-            if not t['file_pdf_url'] and is_pdf:
-                t['file_pdf_url'] = url['url']
-        t['file_in_webarchive'] = in_wa
-        t['file_in_ia'] = in_ia
-
-        extra = self.extra or dict()
-        if extra:
-            t['in_shadow'] = extra.get('in_shadow')
-            if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
-                t['container_is_longtail_oa'] = True
-        t['any_abstract'] = bool(self.abstracts)
-        t['is_kept'] = container_is_kept or extra.get('is_kept', False)
-
-        t['ref_count'] = len(self.refs or [])
-        t['contrib_count'] = len(self.contribs or [])
-        contrib_names = []
-        for c in (self.contribs or []):
-            if c.raw_name:
-                contrib_names.append(c.raw_name)
-        t['contrib_names'] = contrib_names
-        return t
-
-    def to_json(self):
-        ac = ApiClient()
-        return ac.sanitize_for_serialization(self)
-
-    def from_json(json_str):
-        """
-        Hack to take advantage of the code-generated deserialization code
-        """
-        ac = ApiClient()
-        thing = collections.namedtuple('Thing', ['data'])
-        thing.data = json_str
-        return ac.deserialize(thing, FatcatRelease)
-
diff --git a/python/tests/release_model.py b/python/tests/release_model.py
deleted file mode 100644
index 4b9dddba..00000000
--- a/python/tests/release_model.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-import json
-import pytest
-from fatcat.crossref_importer import FatcatCrossrefImporter
-from fatcat.release_model import FatcatRelease
-
-from crossref import crossref_importer
-
-def test_elastic_convert(crossref_importer):
-    with open('tests/files/crossref-works.single.json', 'r') as f:
-        # not a single line
-        raw = json.loads(f.read())
-    (r, c) = crossref_importer.parse_crossref_dict(raw)
-    r.state = 'active'
-    r.to_elastic_dict()
--
cgit v1.2.3
From 942f23257bcf1059a2502a3b019ce5af1bde7de5 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 12 Nov 2018 23:12:55 -0800
Subject: refactor kafka branch to not use release_model

---
 python/fatcat/elastic_workers.py |   5 +-
 python/fatcat/entity_helpers.py  | 100 +++++++++++++++++++++++++++++++++++++++
 python/tests/entity_helpers.py   |  15 ++++++
 3 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 python/fatcat/entity_helpers.py
 create mode 100644 python/tests/entity_helpers.py

(limited to 'python/fatcat')

diff --git a/python/fatcat/elastic_workers.py b/python/fatcat/elastic_workers.py
index c3226981..3d2e9c39 100644
--- a/python/fatcat/elastic_workers.py
+++ b/python/fatcat/elastic_workers.py
@@ -3,7 +3,8 @@ import json
 import time
 import requests
 from fatcat.worker_common import FatcatWorker
-from fatcat.release_model import FatcatRelease
+from fatcat_client.models import ReleaseEntity
+from fatcat.entity_helpers import *
 from pykafka.common import OffsetType
 
 
@@ -34,7 +35,7 @@ class FatcatElasticReleaseWorker(FatcatWorker):
 
         for msg in consumer:
             json_str = msg.value.decode('utf-8')
-            release = FatcatRelease.from_json(json_str)
+            release = entity_from_json(json_str, ReleaseEntity)
             #print(release)
             elastic_endpoint = "{}/{}/release/{}".format(
                 self.elastic_backend,
diff --git a/python/fatcat/entity_helpers.py b/python/fatcat/entity_helpers.py
new file mode 100644
index 00000000..c454536b
--- /dev/null
+++ b/python/fatcat/entity_helpers.py
@@ -0,0 +1,100 @@
+
+import collections
+from fatcat_client.models import ReleaseEntity
+from fatcat_client.api_client import ApiClient
+
+def entity_to_json(entity):
+    ac = ApiClient()
+    return ac.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type):
+    """
+    Hack to take advantage of the code-generated deserialization code
+    """
+    ac = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return ac.deserialize(thing, entity_type)
+
+def release_elastic_dict(release):
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    Returns: dict
+    """
+
+    if release.state != 'active':
+        raise ValueError("Entity is not 'active'")
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = release.ident,
+        revision = release.revision,
+        title = release.title,
+        release_type = release.release_type,
+        release_status = release.release_status,
+        language = release.language,
+        doi = release.doi,
+        pmid = release.pmid,
+        pmcid = release.pmcid,
+        isbn13 = release.isbn13,
+        core_id = release.core_id,
+        wikidata_qid = release.wikidata_qid
+    )
+
+    if release.release_date:
+        # TODO: resolve why this can be either a string or datetime
+        if type(release.release_date) == str:
+            t['release_date'] = release.release_date
+        else:
+            t['release_date'] = release.release_date.strftime('%F')
+
+    container = release.container
+    container_is_kept = False
+    if container:
+        t['publisher'] = container.publisher
+        t['container_name'] = container.name
+        t['container_issnl'] = container.issnl
+        container_extra = container.extra
+        if container_extra:
+            t['container_is_oa'] = container_extra.get('is_oa')
+            container_is_kept = container_extra.get('is_kept', False)
+            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+    else:
+        t['publisher'] = release.publisher
+
+    files = release.files or []
+    t['file_count'] = len(files)
+    in_wa = False
+    in_ia = False
+    t['file_pdf_url'] = None
+    for f in files:
+        is_pdf = 'pdf' in f.get('mimetype', '')
+        for url in f.get('urls', []):
+            if url.get('rel', '') == 'webarchive':
+                in_wa = True
+            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+                in_ia = True
+                if is_pdf:
+                    t['file_pdf_url'] = url['url']
+        if not t['file_pdf_url'] and is_pdf:
+            t['file_pdf_url'] = url['url']
+    t['file_in_webarchive'] = in_wa
+    t['file_in_ia'] = in_ia
+
+    extra = release.extra or dict()
+    if extra:
+        t['in_shadow'] = extra.get('in_shadow')
+        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
+            t['container_is_longtail_oa'] = True
+    t['any_abstract'] = bool(release.abstracts)
+    t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+    return t
diff --git a/python/tests/entity_helpers.py b/python/tests/entity_helpers.py
new file mode 100644
index 00000000..dd6fa00a
--- /dev/null
+++ b/python/tests/entity_helpers.py
@@ -0,0 +1,15 @@
+
+import json
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+from fatcat.entity_helpers import *
+
+from crossref import crossref_importer
+
+def test_elastic_convert(crossref_importer):
+    with open('tests/files/crossref-works.single.json', 'r') as f:
+        # not a single line
+        raw = json.loads(f.read())
+    (r, c) = crossref_importer.parse_crossref_dict(raw)
+    r.state = 'active'
+    release_elastic_dict(r)
--
cgit v1.2.3
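After the refactor, serialization is handled by free functions that work for any generated entity class rather than by methods on a subclass. A round-trip sketch (the title value is illustrative; assumes the generated `ReleaseEntity` constructor accepts keyword arguments):

    import json
    from fatcat_client.models import ReleaseEntity
    from fatcat.entity_helpers import entity_to_json, entity_from_json

    release = ReleaseEntity(title="Example Paper")
    json_str = json.dumps(entity_to_json(release))
    roundtrip = entity_from_json(json_str, ReleaseEntity)
    assert roundtrip.title == release.title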