From d70abdd82955feba4eecdda24ff6d95f703e0598 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 2 Nov 2018 13:59:24 -0700 Subject: FatcatRelease: start wrapping entities with extra methods --- python/fatcat/__init__.py | 1 + python/fatcat/crossref_importer.py | 5 ++- python/fatcat/release_model.py | 85 ++++++++++++++++++++++++++++++++++++++ python/tests/release_model.py | 15 +++++++ 4 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 python/fatcat/release_model.py create mode 100644 python/tests/release_model.py (limited to 'python') diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py index aa12f972..b0492684 100644 --- a/python/fatcat/__init__.py +++ b/python/fatcat/__init__.py @@ -4,6 +4,7 @@ from flask_uuid import FlaskUUID from flask_debugtoolbar import DebugToolbarExtension from config import Config import fatcat_client +from fatcat.release_model import FatcatRelease toolbar = DebugToolbarExtension() app = Flask(__name__) diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index 37005965..fbf666a3 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -6,6 +6,7 @@ import datetime import itertools import fatcat_client from fatcat.importer_common import FatcatImporter +from fatcat import FatcatRelease class FatcatCrossrefImporter(FatcatImporter): @@ -38,7 +39,7 @@ class FatcatCrossrefImporter(FatcatImporter): def parse_crossref_dict(self, obj): """ obj is a python dict (parsed from json). - returns a ReleaseEntity + returns a FatcatRelease """ # This work is out of scope if it doesn't have authors and a title @@ -212,7 +213,7 @@ class FatcatCrossrefImporter(FatcatImporter): if release_date: release_date = release_date.isoformat() + "Z" - re = fatcat_client.ReleaseEntity( + re = FatcatRelease( work_id=None, title=obj['title'][0], contribs=contribs, diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py new file mode 100644 index 00000000..a584c00b --- /dev/null +++ b/python/fatcat/release_model.py @@ -0,0 +1,85 @@ + +from fatcat_client.models import ReleaseEntity + +class FatcatRelease(ReleaseEntity): + """ + This is a wrapper class that extends the code-generated `ReleaseEntity` + class with extra methods. + """ + + def to_elastic_dict(self): + """ + Converts from an entity model/schema to elasticsearch oriented schema. + + Returns: dict + """ + + if self.state != 'active': + raise ValueError("Entity is not 'active'") + + # First, the easy ones (direct copy) + t = dict( + ident = self.ident, + revision = self.revision, + title = self.title, + release_date = self.release_date, + release_type = self.release_type, + release_status = self.release_status, + language = self.language, + doi = self.doi, + pmid = self.pmid, + pmcid = self.pmcid, + isbn13 = self.isbn13, + core_id = self.core_id, + wikidata_qid = self.wikidata_qid + ) + + container = self.container + container_is_kept = False + if container: + t['publisher'] = container.publisher + t['container_name'] = container.name + t['container_issnl'] = container.issnl + container_extra = container.extra + if container_extra: + t['container_is_oa'] = container_extra.get('is_oa') + container_is_kept = container_extra.get('is_kept', False) + t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + else: + t['publisher'] = self.publisher + + files = self.files or [] + t['file_count'] = len(files) + in_wa = False + in_ia = False + t['file_pdf_url'] = None + for f in files: + is_pdf = 'pdf' in f.get('mimetype', '') + for url in f.get('urls', []): + if url.get('rel', '') == 'webarchive': + in_wa = True + if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: + in_ia = True + if is_pdf: + t['file_pdf_url'] = url['url'] + if not t['file_pdf_url'] and is_pdf: + t['file_pdf_url'] = url['url'] + t['file_in_webarchive'] = in_wa + t['file_in_ia'] = in_ia + + extra = self.extra or dict() + if extra: + t['in_shadow'] = extra.get('in_shadow') + if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): + t['container_is_longtail_oa'] = True + t['any_abstract'] = bool(self.abstracts) + t['is_kept'] = container_is_kept or extra.get('is_kept', False) + + t['ref_count'] = len(self.refs or []) + t['contrib_count'] = len(self.contribs or []) + contrib_names = [] + for c in (self.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + return t diff --git a/python/tests/release_model.py b/python/tests/release_model.py new file mode 100644 index 00000000..4b9dddba --- /dev/null +++ b/python/tests/release_model.py @@ -0,0 +1,15 @@ + +import json +import pytest +from fatcat.crossref_importer import FatcatCrossrefImporter +from fatcat.release_model import FatcatRelease + +from crossref import crossref_importer + +def test_elastic_convert(crossref_importer): + with open('tests/files/crossref-works.single.json', 'r') as f: + # not a single line + raw = json.loads(f.read()) + (r, c) = crossref_importer.parse_crossref_dict(raw) + r.state = 'active' + r.to_elastic_dict() -- cgit v1.2.3