diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat/__init__.py | 1 |
| -rw-r--r-- | python/fatcat/crossref_importer.py | 5 |
| -rw-r--r-- | python/fatcat/release_model.py | 85 |
| -rw-r--r-- | python/tests/release_model.py | 15 |
4 files changed, 104 insertions, 2 deletions
| diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py index aa12f972..b0492684 100644 --- a/python/fatcat/__init__.py +++ b/python/fatcat/__init__.py @@ -4,6 +4,7 @@ from flask_uuid import FlaskUUID  from flask_debugtoolbar import DebugToolbarExtension  from config import Config  import fatcat_client +from fatcat.release_model import FatcatRelease  toolbar = DebugToolbarExtension()  app = Flask(__name__) diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index 37005965..fbf666a3 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -6,6 +6,7 @@ import datetime  import itertools  import fatcat_client  from fatcat.importer_common import FatcatImporter +from fatcat import FatcatRelease  class FatcatCrossrefImporter(FatcatImporter): @@ -38,7 +39,7 @@ class FatcatCrossrefImporter(FatcatImporter):      def parse_crossref_dict(self, obj):          """          obj is a python dict (parsed from json). -        returns a ReleaseEntity +        returns a FatcatRelease          """          # This work is out of scope if it doesn't have authors and a title @@ -212,7 +213,7 @@ class FatcatCrossrefImporter(FatcatImporter):          if release_date:              release_date = release_date.isoformat() + "Z" -        re = fatcat_client.ReleaseEntity( +        re = FatcatRelease(              work_id=None,              title=obj['title'][0],              contribs=contribs, diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py new file mode 100644 index 00000000..a584c00b --- /dev/null +++ b/python/fatcat/release_model.py @@ -0,0 +1,85 @@ + +from fatcat_client.models import ReleaseEntity + +class FatcatRelease(ReleaseEntity): +    """ +    This is a wrapper class that extends the code-generated `ReleaseEntity` +    class with extra methods. +    """ + +    def to_elastic_dict(self): +        """ +        Converts from an entity model/schema to elasticsearch oriented schema. 
+ +        Returns: dict +        """ + +        if self.state != 'active': +            raise ValueError("Entity is not 'active'") + +        # First, the easy ones (direct copy) +        t = dict( +            ident = self.ident, +            revision = self.revision, +            title = self.title, +            release_date = self.release_date, +            release_type = self.release_type, +            release_status = self.release_status, +            language = self.language, +            doi = self.doi, +            pmid = self.pmid, +            pmcid = self.pmcid, +            isbn13 = self.isbn13, +            core_id = self.core_id, +            wikidata_qid = self.wikidata_qid +        ) + +        container = self.container +        container_is_kept = False +        if container: +            t['publisher'] = container.publisher +            t['container_name'] = container.name +            t['container_issnl'] = container.issnl +            container_extra = container.extra +            if container_extra: +                t['container_is_oa'] = container_extra.get('is_oa') +                container_is_kept = container_extra.get('is_kept', False) +                t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') +        else: +            t['publisher'] = self.publisher + +        files = self.files or [] +        t['file_count'] = len(files) +        in_wa = False +        in_ia = False +        t['file_pdf_url'] = None +        for f in files: +            is_pdf = 'pdf' in f.get('mimetype', '') +            for url in f.get('urls', []): +                if url.get('rel', '') == 'webarchive': +                    in_wa = True +                if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: +                    in_ia = True +                    if is_pdf: +                        t['file_pdf_url'] = url['url'] +                if not t['file_pdf_url'] and is_pdf: +                    t['file_pdf_url'] 
= url['url'] +        t['file_in_webarchive'] = in_wa +        t['file_in_ia'] = in_ia + +        extra = self.extra or dict() +        if extra: +            t['in_shadow'] = extra.get('in_shadow') +            if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): +                t['container_is_longtail_oa'] = True +        t['any_abstract'] = bool(self.abstracts) +        t['is_kept'] = container_is_kept or extra.get('is_kept', False) + +        t['ref_count'] = len(self.refs or []) +        t['contrib_count'] = len(self.contribs or []) +        contrib_names = [] +        for c in (self.contribs or []): +            if c.raw_name: +                contrib_names.append(c.raw_name) +        t['contrib_names'] = contrib_names +        return t diff --git a/python/tests/release_model.py b/python/tests/release_model.py new file mode 100644 index 00000000..4b9dddba --- /dev/null +++ b/python/tests/release_model.py @@ -0,0 +1,15 @@ + +import json +import pytest +from fatcat.crossref_importer import FatcatCrossrefImporter +from fatcat.release_model import FatcatRelease + +from crossref import crossref_importer + +def test_elastic_convert(crossref_importer): +    with open('tests/files/crossref-works.single.json', 'r') as f: +        # not a single line +        raw = json.loads(f.read()) +        (r, c) = crossref_importer.parse_crossref_dict(raw) +    r.state = 'active' +    r.to_elastic_dict() | 
