aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-14 23:10:41 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-14 23:10:41 -0700
commitc4852afdec87712e09b9cdba5b5db7e1ad1a0701 (patch)
tree3d50a3641245d0d30a0f7f4e0704ffa9d19f03a1 /fatcat_scholar
parent4df616706146fce16dbc1fdc3b5502abd13144df (diff)
downloadfatcat-scholar-c4852afdec87712e09b9cdba5b5db7e1ad1a0701.tar.gz
fatcat-scholar-c4852afdec87712e09b9cdba5b5db7e1ad1a0701.zip
start implementing ES transform helpers
Diffstat (limited to 'fatcat_scholar')
-rw-r--r--fatcat_scholar/api_entities.py34
-rw-r--r--fatcat_scholar/es_transform.py202
2 files changed, 236 insertions, 0 deletions
diff --git a/fatcat_scholar/api_entities.py b/fatcat_scholar/api_entities.py
new file mode 100644
index 0000000..53455e8
--- /dev/null
+++ b/fatcat_scholar/api_entities.py
@@ -0,0 +1,34 @@
+
+import json
+import collections
+from fatcat_openapi_client import ApiClient
+
+def entity_to_dict(entity, api_client=None):
+ """
+ Hack to take advantage of the code-generated serialization code.
+
+ Initializing/destroying ApiClient objects is surprisingly expensive
+ (because it involves a threadpool), so we allow passing an existing
+ instance. If you already have a full-on API connection `api`, you can
+ access the ApiClient object as `api.api_client`. This is such a speed-up
+ that this argument may become mandatory.
+ """
+ if not api_client:
+ api_client = ApiClient()
+ return api_client.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type, api_client=None):
+ """
+ Hack to take advantage of the code-generated deserialization code
+
+ See note on `entity_to_dict()` about api_client argument.
+ """
+ if not api_client:
+ api_client = ApiClient()
+ thing = collections.namedtuple('Thing', ['data'])
+ thing.data = json_str
+ return api_client.deserialize(thing, entity_type)
+
+def entity_from_dict(obj, entity_type, api_client=None):
+ json_str = json.dumps(obj)
+ return entity_from_json(json_str, entity_type, api_client=api_client)
diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/es_transform.py
new file mode 100644
index 0000000..1f47e2c
--- /dev/null
+++ b/fatcat_scholar/es_transform.py
@@ -0,0 +1,202 @@
+
+"""
+Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
+get serialization for free with those. This is useful for things like
+auto-conversion of datetime objects.
+"""
+
+import typing
+import datetime
+from enum import Enum
+from typing import Optional, List
+from pydantic import BaseModel
+from fatcat_openapi_client import ReleaseEntity
+
+
+class DocType(str, Enum):
+ work = "work"
+ sim_page = "sim_page"
+
+class AccessType(str, Enum):
+ ia_sim = "ia_sim"
+ ia_file = "ia_file"
+ wayback = "wayback"
+ repository = "repository"
+ paywall = "paywall"
+ loginwall = "loginwall"
+ shadow = "shadow"
+
+class ScholarBiblio(BaseModel):
+ release_ident: Optional[str]
+ title: str
+ subtitle: Optional[str]
+ original_title: Optional[str]
+ release_date: Optional[datetime.date]
+ release_year: Optional[int]
+ release_type: Optional[str]
+ release_stage: Optional[str]
+ withdrawn_status: Optional[str]
+ lang_code: Optional[str]
+ country_code: Optional[str]
+ volume: Optional[str]
+ volume_int: Optional[str] # TODO: needed?
+ issue: Optional[str]
+ issue_int: Optional[str] # TODO: needed?
+ pages: Optional[str]
+ first_page: Optional[str]
+ first_page_int: Optional[str] # TODO: needed?
+ number: Optional[str]
+
+ doi: Optional[str]
+ doi_prefix: Optional[str]
+ doi_registrar: Optional[str]
+ pmid: Optional[str]
+ pmcid: Optional[str]
+ isbn13: Optional[str]
+ wikidata_qid: Optional[str]
+ arxiv_id: Optional[str]
+ jstor_id: Optional[str]
+ mag_id: Optional[str]
+
+ license_slug: Optional[str]
+ publisher: Optional[str]
+ publisher_type: Optional[str]
+ container_name: Optional[str]
+ container_original_name: Optional[str]
+ container_ident: Optional[str]
+ container_issnl: Optional[str]
+ issns: List[str]
+ container_type: Optional[str]
+ contrib_count: Optional[int]
+ contrib_names: List[str]
+ affiliations: List[str]
+
+class ScholarFulltext(BaseModel):
+ lang_code: Optional[str]
+ body: str
+ acknowledgement: Optional[str]
+ annex: Optional[str]
+ release_ident: Optional[str]
+ file_ident: Optional[str]
+ file_sha1: Optional[str]
+ file_mimetype: Optional[str]
+ thumbnail_url: Optional[str]
+ access_url: Optional[str]
+ access_type: Optional[AccessType]
+
+class ScholarRelease(BaseModel):
+ ident: Optional[str]
+ revision: Optional[str]
+ title: str
+ release_date: Optional[datetime.date]
+ release_year: Optional[int]
+ release_type: Optional[str]
+ release_stage: Optional[str]
+ withdrawn_status: Optional[str]
+
+ doi: Optional[str]
+ doi_prefix: Optional[str]
+ doi_registrar: Optional[str]
+ pmid: Optional[str]
+ pmcid: Optional[str]
+ isbn13: Optional[str]
+ wikidata_qid: Optional[str]
+ arxiv_id: Optional[str]
+ jstor_id: Optional[str]
+ mag_id: Optional[str]
+
+ license_slug: Optional[str]
+ container_name: Optional[str]
+ container_ident: Optional[str]
+ container_issnl: Optional[str]
+ container_type: Optional[str]
+
+class ScholarSim(BaseModel):
+ ia_item: str
+ ia_collection: str
+ first_page: Optional[str]
+ pub_id: str
+
+class ScholarAbstract(BaseModel):
+ body: str
+ lang_code: Optional[str]
+
+class ScholarAccess(BaseModel):
+ access_type: AccessType
+ access_url: str
+ mimetype: str
+ file_ident: Optional[str]
+ release_ident: Optional[str]
+
+class ScholarDoc(BaseModel):
+ key: str
+ doc_type: str # enum: work or page
+ doc_index_ts: datetime.datetime
+ work_ident: Optional[str]
+ tags: List[str] = []
+
+ biblio: ScholarBiblio
+ fulltext: ScholarFulltext
+ ia_sim: ScholarSim
+ abstracts: List[ScholarAbstract]
+ releases: List[ScholarRelease]
+ access: List[ScholarAccess]
+
+# TODO:
+# es_biblio_from_release
+# es_release_from_release
+# es_abstracts_from_release
+
+def doi_split_prefix(doi: str) -> str:
+ return doi.split('/')[0]
+
+def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
+ if not release.ext_ids.doi or not release.extra:
+ return None
+ for registrar in ('crossref', 'datacite', 'jalc'):
+ if registrar in release.extra:
+ return registrar
+ # TODO: should we default to Crossref?
+ return None
+
+#def es_biblio_from_release(release: Release) -> ScholarBiblio:
+
+def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
+
+ if release.container:
+ container_name = release.container.name
+ container_ident = release.container.ident
+ container_issnl = release.container.issnl
+ container_type = release.container.container_type
+ else:
+ container_name = release.extra.get('container_name')
+ container_ident = None
+ container_issnl = None
+ container_type = None
+
+ ret = ScholarRelease(
+ ident=release.ident,
+ revision=release.revision,
+ title=release.title,
+ release_date=release.release_date,
+ release_year=release.release_year,
+ release_type=release.release_type,
+ release_stage=release.release_stage,
+ withdrawn_status=release.withdrawn_status,
+ doi=release.ext_ids.doi,
+ doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
+ doi_registrar=release_doi_registrar(release),
+ pmid=release.ext_ids.pmid,
+ pmcid=release.ext_ids.pmcid,
+ isbn13=release.ext_ids.isbn13,
+ wikidata_qid=release.ext_ids.wikidata_qid,
+ arxiv_id=release.ext_ids.arxiv,
+ jstor_id=release.ext_ids.jstor,
+ mag_id=release.ext_ids.mag,
+ license_slug=release.license_slug,
+ container_name=container_name,
+ container_ident=container_ident,
+ container_issnl=container_issnl,
+ container_type=container_type,
+ )
+ return ret