summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat/__init__.py 1
-rw-r--r-- python/fatcat/crossref_importer.py 5
-rw-r--r-- python/fatcat/release_model.py 85
-rw-r--r-- python/tests/release_model.py 15
4 files changed, 104 insertions, 2 deletions
diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py
index aa12f972..b0492684 100644
--- a/python/fatcat/__init__.py
+++ b/python/fatcat/__init__.py
@@ -4,6 +4,7 @@ from flask_uuid import FlaskUUID
from flask_debugtoolbar import DebugToolbarExtension
from config import Config
import fatcat_client
+from fatcat.release_model import FatcatRelease
toolbar = DebugToolbarExtension()
app = Flask(__name__)
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index 37005965..fbf666a3 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -6,6 +6,7 @@ import datetime
import itertools
import fatcat_client
from fatcat.importer_common import FatcatImporter
+from fatcat import FatcatRelease
class FatcatCrossrefImporter(FatcatImporter):
@@ -38,7 +39,7 @@ class FatcatCrossrefImporter(FatcatImporter):
def parse_crossref_dict(self, obj):
"""
obj is a python dict (parsed from json).
- returns a ReleaseEntity
+ returns a FatcatRelease
"""
# This work is out of scope if it doesn't have authors and a title
@@ -212,7 +213,7 @@ class FatcatCrossrefImporter(FatcatImporter):
if release_date:
release_date = release_date.isoformat() + "Z"
- re = fatcat_client.ReleaseEntity(
+ re = FatcatRelease(
work_id=None,
title=obj['title'][0],
contribs=contribs,
diff --git a/python/fatcat/release_model.py b/python/fatcat/release_model.py
new file mode 100644
index 00000000..a584c00b
--- /dev/null
+++ b/python/fatcat/release_model.py
@@ -0,0 +1,85 @@
+
+from fatcat_client.models import ReleaseEntity
+
+class FatcatRelease(ReleaseEntity):  # subclass of the imported ReleaseEntity; the only addition is to_elastic_dict()
+ """
+ This is a wrapper class that extends the code-generated `ReleaseEntity`
+ class with extra methods.
+ """
+
+ def to_elastic_dict(self):  # builds the flat dict `t` from this entity's attributes and returns it
+ """
+ Converts from an entity model/schema to elasticsearch oriented schema.
+
+ Returns: dict
+ """
+
+ if self.state != 'active':  # every other state value, including None, is rejected
+ raise ValueError("Entity is not 'active'")
+
+ # First, the easy ones: attributes copied verbatim from self into the output dict
+ t = dict(
+ ident = self.ident,
+ revision = self.revision,
+ title = self.title,
+ release_date = self.release_date,
+ release_type = self.release_type,
+ release_status = self.release_status,
+ language = self.language,
+ doi = self.doi,
+ pmid = self.pmid,
+ pmcid = self.pmcid,
+ isbn13 = self.isbn13,
+ core_id = self.core_id,
+ wikidata_qid = self.wikidata_qid
+ )
+
+ container = self.container
+ container_is_kept = False  # default; only overwritten from container.extra below
+ if container:  # container-derived keys are only written when a container is attached
+ t['publisher'] = container.publisher
+ t['container_name'] = container.name
+ t['container_issnl'] = container.issnl
+ container_extra = container.extra
+ if container_extra:  # container.extra is read dict-style; skipped when None or empty
+ t['container_is_oa'] = container_extra.get('is_oa')  # None when the key is absent
+ container_is_kept = container_extra.get('is_kept', False)
+ t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ else:  # NOTE(review): indentation is lost in this rendering; presumably pairs with `if container:` -- confirm
+ t['publisher'] = self.publisher
+
+ files = self.files or []  # a None/empty file list is treated as no files
+ t['file_count'] = len(files)
+ in_wa = False  # set once any URL has rel == 'webarchive'
+ in_ia = False  # set once any URL points at web.archive.org or archive.org
+ t['file_pdf_url'] = None  # stays None unless one of the assignments below fires
+ for f in files:
+ is_pdf = 'pdf' in f.get('mimetype', '')  # files are read dict-style; NOTE(review): a 'mimetype' key holding None would make `in` raise TypeError
+ for url in f.get('urls', []):
+ if url.get('rel', '') == 'webarchive':
+ in_wa = True
+ if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:  # plain substring match on the URL text
+ in_ia = True
+ if is_pdf:  # NOTE(review): presumably nested under the archive.org check above -- nesting not visible here, confirm
+ t['file_pdf_url'] = url['url']
+ if not t['file_pdf_url'] and is_pdf:  # fallback: only fills file_pdf_url when nothing has set it yet
+ t['file_pdf_url'] = url['url']  # NOTE(review): if this runs after the url loop, `url` is the last URL iterated, or unbound when no URL was ever iterated -- confirm
+ t['file_in_webarchive'] = in_wa
+ t['file_in_ia'] = in_ia
+
+ extra = self.extra or dict()  # normalise None to an empty dict so .get() can be used on it
+ if extra:
+ t['in_shadow'] = extra.get('in_shadow')
+ if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):  # truthy grobid flag overrides any container-derived value
+ t['container_is_longtail_oa'] = True
+ t['any_abstract'] = bool(self.abstracts)  # True when abstracts is non-empty
+ t['is_kept'] = container_is_kept or extra.get('is_kept', False)  # kept if either the container extra or the release extra says so
+
+ t['ref_count'] = len(self.refs or [])  # None counts as zero
+ t['contrib_count'] = len(self.contribs or [])  # counts every contrib, including those without a raw_name
+ contrib_names = []
+ for c in (self.contribs or []):
+ if c.raw_name:  # contribs with a None/empty raw_name are left out of the name list
+ contrib_names.append(c.raw_name)
+ t['contrib_names'] = contrib_names
+ return t
diff --git a/python/tests/release_model.py b/python/tests/release_model.py
new file mode 100644
index 00000000..4b9dddba
--- /dev/null
+++ b/python/tests/release_model.py
@@ -0,0 +1,15 @@
+
+import json
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+from fatcat.release_model import FatcatRelease
+
+from crossref import crossref_importer
+
+def test_elastic_convert(crossref_importer):  # `crossref_importer` matches the name imported from `crossref` above; presumably a pytest fixture -- confirm
+ with open('tests/files/crossref-works.single.json', 'r') as f:
+ # not a single line: the whole file is read and parsed as one JSON document
+ raw = json.loads(f.read())
+ (r, c) = crossref_importer.parse_crossref_dict(raw)  # result is unpacked as a 2-tuple; only `r` is used afterwards
+ r.state = 'active'  # to_elastic_dict() raises ValueError unless state is 'active'
+ r.to_elastic_dict()  # smoke test only: the returned dict is not inspected