From 942f23257bcf1059a2502a3b019ce5af1bde7de5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 12 Nov 2018 23:12:55 -0800 Subject: refactor kafka branch to not use release_model --- python/fatcat/elastic_workers.py | 5 +- python/fatcat/entity_helpers.py | 100 +++++++++++++++++++++++++++++++++++++++ python/tests/entity_helpers.py | 15 ++++++ 3 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 python/fatcat/entity_helpers.py create mode 100644 python/tests/entity_helpers.py diff --git a/python/fatcat/elastic_workers.py b/python/fatcat/elastic_workers.py index c3226981..3d2e9c39 100644 --- a/python/fatcat/elastic_workers.py +++ b/python/fatcat/elastic_workers.py @@ -3,7 +3,8 @@ import json import time import requests from fatcat.worker_common import FatcatWorker -from fatcat.release_model import FatcatRelease +from fatcat_client.models import ReleaseEntity +from fatcat.entity_helpers import * from pykafka.common import OffsetType @@ -34,7 +35,7 @@ class FatcatElasticReleaseWorker(FatcatWorker): for msg in consumer: json_str = msg.value.decode('utf-8') - release = FatcatRelease.from_json(json_str) + release = entity_from_json(json_str, ReleaseEntity) #print(release) elastic_endpoint = "{}/{}/release/{}".format( self.elastic_backend, diff --git a/python/fatcat/entity_helpers.py b/python/fatcat/entity_helpers.py new file mode 100644 index 00000000..c454536b --- /dev/null +++ b/python/fatcat/entity_helpers.py @@ -0,0 +1,100 @@ + +import collections +from fatcat_client.models import ReleaseEntity +from fatcat_client.api_client import ApiClient + +def entity_to_json(entity): + ac = ApiClient() + return ac.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type): + """ + Hack to take advantage of the code-generated deserialization code + """ + ac = ApiClient() + thing = collections.namedtuple('Thing', ['data']) + thing.data = json_str + return ac.deserialize(thing, entity_type) + +def release_elastic_dict(release): + """ + Converts from an entity model/schema to elasticsearch oriented schema. + + Returns: dict + """ + + if release.state != 'active': + raise ValueError("Entity is not 'active'") + + # First, the easy ones (direct copy) + t = dict( + ident = release.ident, + revision = release.revision, + title = release.title, + release_type = release.release_type, + release_status = release.release_status, + language = release.language, + doi = release.doi, + pmid = release.pmid, + pmcid = release.pmcid, + isbn13 = release.isbn13, + core_id = release.core_id, + wikidata_qid = release.wikidata_qid + ) + + if release.release_date: + # TODO: resolve why this can be either a string or datetime + if type(release.release_date) == str: + t['release_date'] = release.release_date + else: + t['release_date'] = release.release_date.strftime('%F') + + container = release.container + container_is_kept = False + if container: + t['publisher'] = container.publisher + t['container_name'] = container.name + t['container_issnl'] = container.issnl + container_extra = container.extra + if container_extra: + t['container_is_oa'] = container_extra.get('is_oa') + container_is_kept = container_extra.get('is_kept', False) + t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + else: + t['publisher'] = release.publisher + + files = release.files or [] + t['file_count'] = len(files) + in_wa = False + in_ia = False + t['file_pdf_url'] = None + for f in files: + is_pdf = 'pdf' in f.get('mimetype', '') + for url in f.get('urls', []): + if url.get('rel', '') == 'webarchive': + in_wa = True + if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: + in_ia = True + if is_pdf: + t['file_pdf_url'] = url['url'] + if not t['file_pdf_url'] and is_pdf: + t['file_pdf_url'] = url['url'] + t['file_in_webarchive'] = in_wa + t['file_in_ia'] = in_ia + + extra = release.extra or dict() + if extra: + t['in_shadow'] = extra.get('in_shadow') + if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): + t['container_is_longtail_oa'] = True + t['any_abstract'] = bool(release.abstracts) + t['is_kept'] = container_is_kept or extra.get('is_kept', False) + + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + return t diff --git a/python/tests/entity_helpers.py b/python/tests/entity_helpers.py new file mode 100644 index 00000000..dd6fa00a --- /dev/null +++ b/python/tests/entity_helpers.py @@ -0,0 +1,15 @@ + +import json +import pytest +from fatcat.crossref_importer import FatcatCrossrefImporter +from fatcat.entity_helpers import * + +from crossref import crossref_importer + +def test_elastic_convert(crossref_importer): + with open('tests/files/crossref-works.single.json', 'r') as f: + # not a single line + raw = json.loads(f.read()) + (r, c) = crossref_importer.parse_crossref_dict(raw) + r.state = 'active' + release_elastic_dict(r) -- cgit v1.2.3