aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/es_transform.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/es_transform.py')
-rw-r--r-- fatcat_scholar/es_transform.py 316
1 files changed, 0 insertions, 316 deletions
diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/es_transform.py
deleted file mode 100644
index 464b082..0000000
--- a/fatcat_scholar/es_transform.py
+++ /dev/null
@@ -1,316 +0,0 @@
-
-"""
-These schemas were originally written as dataclasses using
-pydantic.dataclasses, but those do not provide serialization for free, so
-they are pydantic BaseModel classes instead. Built-in serialization is
-useful for things like auto-conversion of datetime objects.
-"""
-
-import ftfy
-import typing
-import datetime
-from enum import Enum
-from typing import Optional, List
-from xml.etree import cElementTree as ET
-from pydantic import BaseModel
-from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-
-
-class DocType(str, Enum):
- """
- String-valued enum of document types.
-
- NOTE(review): presumably the allowed values for ScholarDoc.doc_type, which
- is currently declared as a plain `str` -- confirm.
- """
- work = "work"
- sim_page = "sim_page"
-
-class AccessType(str, Enum):
- """
- String-valued enum used for the `access_type` field of ScholarFulltext and
- ScholarAccess.
-
- NOTE(review): member meanings are inferred from their names only (e.g.
- `ia_*` presumably Internet Archive, `wayback` the Wayback Machine) --
- confirm before relying on them.
- """
- ia_sim = "ia_sim"
- ia_file = "ia_file"
- wayback = "wayback"
- repository = "repository"
- paywall = "paywall"
- loginwall = "loginwall"
- shadow = "shadow"
-
-class ScholarBiblio(BaseModel):
- """
- Bibliographic metadata sub-document.
-
- Instances are built from a fatcat ReleaseEntity by
- es_biblio_from_release(). `title`, `issns`, `contrib_names` and
- `affiliations` are the only fields not declared Optional.
- """
- release_ident: Optional[str]
- title: str
- subtitle: Optional[str]
- original_title: Optional[str]
- release_date: Optional[datetime.date]
- release_year: Optional[int]
- release_type: Optional[str]
- release_stage: Optional[str]
- withdrawn_status: Optional[str]
- lang_code: Optional[str]
- country_code: Optional[str]
- volume: Optional[str]
- # NOTE(review): the three `*_int` fields below are declared Optional[str]
- # despite their names, and es_biblio_from_release() always passes None for
- # them -- confirm the intended type before populating them.
- volume_int: Optional[str] # TODO: needed?
- issue: Optional[str]
- issue_int: Optional[str] # TODO: needed?
- pages: Optional[str]
- # leading portion of `pages`, up to the first '-'
- first_page: Optional[str]
- first_page_int: Optional[str] # TODO: needed?
- number: Optional[str]
-
- # external identifiers, copied from the release's `ext_ids`
- doi: Optional[str]
- # portion of the DOI before the first '/'
- doi_prefix: Optional[str]
- # one of 'crossref', 'datacite', 'jalc', or None (see release_doi_registrar)
- doi_registrar: Optional[str]
- pmid: Optional[str]
- pmcid: Optional[str]
- isbn13: Optional[str]
- wikidata_qid: Optional[str]
- arxiv_id: Optional[str]
- jstor_id: Optional[str]
- mag_id: Optional[str]
-
- license_slug: Optional[str]
- publisher: Optional[str]
- # not set by es_biblio_from_release() in this file
- publisher_type: Optional[str]
- container_name: Optional[str]
- container_original_name: Optional[str]
- container_ident: Optional[str]
- container_issnl: Optional[str]
- # not set by es_biblio_from_release() in this file
- container_wikidata_qid: Optional[str]
- issns: List[str]
- container_type: Optional[str]
- contrib_count: Optional[int]
- contrib_names: List[str]
- affiliations: List[str]
-
-class ScholarFulltext(BaseModel):
- """
- Fulltext sub-document: the text `body` (required) plus optional extra text
- sections and identifiers/URLs describing the file it is associated with.
-
- Nothing in this file constructs this model.
- """
- lang_code: Optional[str]
- body: str
- acknowledgement: Optional[str]
- annex: Optional[str]
- release_ident: Optional[str]
- file_ident: Optional[str]
- file_sha1: Optional[str]
- file_mimetype: Optional[str]
- thumbnail_url: Optional[str]
- access_url: Optional[str]
- access_type: Optional[AccessType]
-
-class ScholarRelease(BaseModel):
- """
- Summary metadata for a single release.
-
- Instances are built from a fatcat ReleaseEntity by
- es_release_from_release(). Only `title` is not declared Optional.
- """
- ident: Optional[str]
- revision: Optional[str]
- title: str
- release_date: Optional[datetime.date]
- release_year: Optional[int]
- release_type: Optional[str]
- release_stage: Optional[str]
- withdrawn_status: Optional[str]
-
- # external identifiers, copied from the release's `ext_ids`
- doi: Optional[str]
- # portion of the DOI before the first '/'
- doi_prefix: Optional[str]
- # one of 'crossref', 'datacite', 'jalc', or None (see release_doi_registrar)
- doi_registrar: Optional[str]
- pmid: Optional[str]
- pmcid: Optional[str]
- isbn13: Optional[str]
- wikidata_qid: Optional[str]
- arxiv_id: Optional[str]
- jstor_id: Optional[str]
- mag_id: Optional[str]
-
- license_slug: Optional[str]
- container_name: Optional[str]
- container_ident: Optional[str]
- container_issnl: Optional[str]
- container_type: Optional[str]
-
-class ScholarSim(BaseModel):
- """
- Sub-document stored under ScholarDoc.ia_sim.
-
- NOTE(review): "sim" presumably names an Internet Archive collection (cf.
- AccessType.ia_sim) and the fields identify an issue/publication within
- it -- confirm; nothing in this file constructs this model.
- """
- issue_item: str
- pub_collection: str
- sim_pubid: str
- first_page: Optional[str]
-
-class ScholarAbstract(BaseModel):
- """
- A single abstract. es_abstracts_from_release() fills `body` with the
- output of scrub_text() and `lang_code` with the abstract's `lang`.
- """
- body: str
- lang_code: Optional[str]
-
-class ScholarAccess(BaseModel):
- """
- One access option for a document: an access type, a URL and a mimetype
- (all required), plus optional file/release identifiers.
-
- Nothing in this file constructs this model.
- """
- access_type: AccessType
- access_url: str
- mimetype: str
- file_ident: Optional[str]
- release_ident: Optional[str]
-
-class ScholarDoc(BaseModel):
- """
- Top-level document schema, combining the bibliographic, fulltext, SIM,
- abstract, release and access sub-documents defined in this module.
-
- Only `work_ident` is Optional and only `tags` has a default; every other
- field, including `fulltext` and `ia_sim`, is declared as required.
- """
- key: str
- # NOTE(review): declared as plain `str`; the DocType enum in this module
- # defines "work" and "sim_page" (not "page") -- confirm which applies.
- doc_type: str # enum: work or page
- doc_index_ts: datetime.datetime
- work_ident: Optional[str]
- tags: List[str] = []
-
- biblio: ScholarBiblio
- fulltext: ScholarFulltext
- ia_sim: ScholarSim
- abstracts: List[ScholarAbstract]
- releases: List[ScholarRelease]
- access: List[ScholarAccess]
-
-def doi_split_prefix(doi: str) -> str:
-    """
-    Returns the part of a DOI that precedes the first '/' character. If the
-    string contains no '/', it is returned unchanged.
-    """
-    prefix, _sep, _suffix = doi.partition('/')
-    return prefix
-
-def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
-    """
-    Guesses the DOI registrar for a release by checking which
-    registrar-named key is present in the release's `extra` metadata.
-
-    Returns the first of 'crossref', 'datacite', 'jalc' found as a key of
-    `release.extra`. Returns None if the release has no DOI, has no (or
-    empty) `extra`, or none of those keys is present.
-    """
-    if release.ext_ids.doi and release.extra:
-        known_registrars = ('crossref', 'datacite', 'jalc')
-        # TODO: should we default to Crossref?
-        return next(
-            (name for name in known_registrars if name in release.extra),
-            None,
-        )
-    return None
-
-def scrub_text(raw: str, mimetype: Optional[str] = None) -> str:
-    """
-    This function takes a mimetype-hinted string and tries to reduce it to a
-    simple token-and-punctuation scheme with any and all markup removed. Eg,
-    HTML tags, JATS XML tags, LaTeX, whatever.
-
-    The output should be clean and "HTML safe" (though should still be escaped
-    in HTML to get entity encoding correct).
-
-    Currently only two things happen: if the text contains "<jats" or the
-    mimetype hint contains "application/xml", the string is parsed as XML and
-    replaced by its text nodes joined with single spaces; then the result is
-    passed through ftfy.fix_text().
-
-    Raises AssertionError if the resulting text is empty. The XML parser's
-    own exception propagates if the input is treated as XML but does not
-    parse.
-
-    TODO: barely implemented yet
-    """
-    if "<jats" in raw or (mimetype and "application/xml" in mimetype):
-        # NOTE(review): this requires `raw` to be a single well-formed XML
-        # document; fragments (several top-level elements, or a `jats:`
-        # prefix with no namespace declaration) would presumably fail to
-        # parse -- confirm against real abstract data.
-        root = ET.fromstring(raw)
-        raw = " ".join(root.itertext())
-    raw = ftfy.fix_text(raw)
-    # Raised explicitly instead of via `assert` so the check is not stripped
-    # when running under `python -O`; the exception type is kept as-is for
-    # any existing callers.
-    if not raw:
-        raise AssertionError("Empty abstract")
-    return raw
-
-def contrib_name(contrib: ReleaseContrib) -> str:
-    """
-    Picks a display name for a contributor, in order of preference:
-    "given family" when both parts are set, then `raw_name`, then
-    `family_name`, and finally `given_name`.
-
-    The final fallback is returned as-is, so the result is empty/None when
-    the contrib has none of the name fields set.
-    """
-    # TODO: support more cultural normals for name presentation
-    given, family = contrib.given_name, contrib.family_name
-    if given and family:
-        return f"{given} {family}"
-    if contrib.raw_name:
-        return contrib.raw_name
-    if family:
-        return family
-    return given
-
-def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
- """
- Placeholder: always returns None; `contrib` is not inspected.
-
- NOTE(review): presumably meant to extract an affiliation string from the
- contrib eventually -- confirm.
- """
- return None
-
-def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
-    """
-    Converts a release's abstracts into ScholarAbstract objects, keeping at
-    most one abstract per `lang` value (the first one encountered wins).
-
-    Abstract text is passed through scrub_text(), so any exception it raises
-    (eg, for empty text) propagates. Returns an empty list when the release
-    has no abstracts.
-    """
-
-    by_lang = {}
-    # `or []`: tolerate a release whose `abstracts` attribute is unset (None)
-    for abst in release.abstracts or []:
-        if abst.lang not in by_lang:
-            by_lang[abst.lang] = ScholarAbstract(
-                lang_code=abst.lang,
-                body=scrub_text(abst.content),
-            )
-    return list(by_lang.values())
-
-def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
-    """
-    Builds the ScholarBiblio sub-document for a single fatcat release.
-
-    Container fields come from `release.container` when it is set; otherwise
-    the publisher and container name fall back to the release's `extra`
-    metadata and the remaining container fields are left empty.
-
-    `issns` holds the distinct, non-empty ISSN values found (container
-    ISSN-L first). Contributor fields only consider contribs that have an
-    `index`. The `volume_int`, `issue_int` and `first_page_int` fields are
-    always left as None.
-    """
-
-    # `extra` may be unset (None) on a release; normalize it once so none of
-    # the lookups below raise AttributeError. Using `.get()` on the
-    # normalized dict also avoids passing an empty dict through to a string
-    # field, which `extra and extra.get(...)` does when `extra == {}`.
-    extra = release.extra or {}
-
-    if release.container:
-        container = release.container
-        publisher = release.publisher
-        container_name = container.name
-        container_original_name = (container.extra or {}).get('original_name')
-        container_ident = container.ident
-        container_type = container.container_type
-        container_issnl = container.issnl
-        # NOTE(review): 'issne'/'issnp' are read from the *release* extra
-        # here (as before); they may have been intended to come from the
-        # container's extra -- confirm.
-        candidates = [container_issnl, extra.get('issne'), extra.get('issnp')]
-        # drop missing values (the field is List[str]) and de-duplicate,
-        # keeping a stable order
-        issns = list(dict.fromkeys(issn for issn in candidates if issn))
-    else:
-        publisher = extra.get('publisher')
-        container_name = extra.get('container_name')
-        container_original_name = None
-        container_ident = None
-        container_type = None
-        container_issnl = None
-        issns = []
-
-    first_page: Optional[str] = None
-    if release.pages:
-        first_page = release.pages.split('-')[0]
-
-    # Only contribs with an index are counted. Compare against None rather
-    # than relying on truthiness so that a contrib with index 0 is kept
-    # (fatcat documents `index` as zero-based -- NOTE(review): confirm).
-    contribs = [c for c in (release.contribs or []) if c.index is not None]
-    # contrib_name() can come back empty when no name fields are set; skip
-    # those, since the schema requires strings
-    contrib_names = [name for name in map(contrib_name, contribs) if name]
-    affiliations = [aff for aff in map(contrib_affiliation, contribs) if aff]
-
-    doi = release.ext_ids.doi
-
-    ret = ScholarBiblio(
-        release_ident=release.ident,
-        title=release.title,
-        subtitle=release.subtitle,
-        original_title=release.original_title,
-        release_date=release.release_date,
-        release_year=release.release_year,
-        release_type=release.release_type,
-        release_stage=release.release_stage,
-        withdrawn_status=release.withdrawn_status,
-        lang_code=release.language,
-        country_code=extra.get('country'),
-        volume=release.volume,
-        volume_int=None,
-        issue=release.issue,
-        issue_int=None,
-        pages=release.pages,
-        first_page=first_page,
-        # TODO: the schema types this field as a string and it has never
-        # been populated; decide on the type before filling it in
-        first_page_int=None,
-        number=release.number,
-
-        doi=doi,
-        doi_prefix=doi and doi_split_prefix(doi),
-        doi_registrar=release_doi_registrar(release),
-        pmid=release.ext_ids.pmid,
-        pmcid=release.ext_ids.pmcid,
-        isbn13=release.ext_ids.isbn13,
-        wikidata_qid=release.ext_ids.wikidata_qid,
-        arxiv_id=release.ext_ids.arxiv,
-        jstor_id=release.ext_ids.jstor,
-        mag_id=release.ext_ids.mag,
-
-        license_slug=release.license_slug,
-        publisher=publisher,
-        container_name=container_name,
-        container_original_name=container_original_name,
-        container_ident=container_ident,
-        container_type=container_type,
-        container_issnl=container_issnl,
-        issns=issns,
-
-        contrib_names=contrib_names,
-        contrib_count=len(contribs),
-        affiliations=affiliations,
-    )
-    return ret
-
-def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
-    """
-    Builds the ScholarRelease summary for a single fatcat release.
-
-    Container fields come from `release.container` when it is set; otherwise
-    only the container name is filled in, from the release's `extra`
-    metadata (if any), and the other container fields are None.
-    """
-
-    container = release.container
-    if container:
-        container_name = container.name
-        container_ident = container.ident
-        container_issnl = container.issnl
-        container_type = container.container_type
-    else:
-        # `extra` may be unset (None) on a release; don't crash on it
-        container_name = (release.extra or {}).get('container_name')
-        container_ident = None
-        container_issnl = None
-        container_type = None
-
-    doi = release.ext_ids.doi
-
-    ret = ScholarRelease(
-        ident=release.ident,
-        revision=release.revision,
-        title=release.title,
-        release_date=release.release_date,
-        release_year=release.release_year,
-        release_type=release.release_type,
-        release_stage=release.release_stage,
-        withdrawn_status=release.withdrawn_status,
-        doi=doi,
-        doi_prefix=doi and doi_split_prefix(doi),
-        doi_registrar=release_doi_registrar(release),
-        pmid=release.ext_ids.pmid,
-        pmcid=release.ext_ids.pmcid,
-        isbn13=release.ext_ids.isbn13,
-        wikidata_qid=release.ext_ids.wikidata_qid,
-        arxiv_id=release.ext_ids.arxiv,
-        jstor_id=release.ext_ids.jstor,
-        mag_id=release.ext_ids.mag,
-        license_slug=release.license_slug,
-        container_name=container_name,
-        container_ident=container_ident,
-        container_issnl=container_issnl,
-        container_type=container_type,
-    )
-    return ret