summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 102
1 files changed, 64 insertions, 38 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 10742fb..110991d 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -1,4 +1,3 @@
-
"""
Originally wrote these as dataclasses using pydantic.dataclasses, but we don't
get serialization for free with those. This is useful for things like
@@ -22,6 +21,7 @@ class DocType(str, Enum):
work = "work"
sim_page = "sim_page"
+
class IntermediateBundle(BaseModel):
doc_type: DocType
releases: List[ReleaseEntity]
@@ -47,6 +47,7 @@ class AccessType(str, Enum):
loginwall = "loginwall"
shadow = "shadow"
+
class ScholarBiblio(BaseModel):
release_ident: Optional[str]
title: Optional[str]
@@ -60,12 +61,12 @@ class ScholarBiblio(BaseModel):
lang_code: Optional[str]
country_code: Optional[str]
volume: Optional[str]
- volume_int: Optional[str] # TODO: needed?
+ volume_int: Optional[str] # TODO: needed?
issue: Optional[str]
- issue_int: Optional[str] # TODO: needed?
+ issue_int: Optional[str] # TODO: needed?
pages: Optional[str]
first_page: Optional[str]
- first_page_int: Optional[str] # TODO: needed?
+ first_page_int: Optional[str] # TODO: needed?
number: Optional[str]
doi: Optional[str]
@@ -93,6 +94,7 @@ class ScholarBiblio(BaseModel):
contrib_names: List[str]
affiliations: List[str]
+
class ScholarFulltext(BaseModel):
lang_code: Optional[str]
body: str
@@ -106,6 +108,7 @@ class ScholarFulltext(BaseModel):
access_url: Optional[str]
access_type: Optional[AccessType]
+
class ScholarRelease(BaseModel):
ident: Optional[str]
revision: Optional[str]
@@ -133,16 +136,19 @@ class ScholarRelease(BaseModel):
container_issnl: Optional[str]
container_type: Optional[str]
+
class ScholarSim(BaseModel):
issue_item: str
pub_collection: str
sim_pubid: str
first_page: Optional[str]
+
class ScholarAbstract(BaseModel):
body: str
lang_code: Optional[str]
+
class ScholarAccess(BaseModel):
access_type: AccessType
access_url: str
@@ -150,9 +156,10 @@ class ScholarAccess(BaseModel):
file_ident: Optional[str]
release_ident: Optional[str]
+
class ScholarDoc(BaseModel):
key: str
- doc_type: str # enum: work or page
+ doc_type: str # enum: work or page
doc_index_ts: datetime.datetime
work_ident: Optional[str]
tags: List[str] = []
@@ -164,29 +171,33 @@ class ScholarDoc(BaseModel):
releases: List[ScholarRelease]
access: List[ScholarAccess]
+
def doi_split_prefix(doi: str) -> str:
- return doi.split('/')[0]
+ return doi.split("/")[0]
+
def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
if not release.ext_ids.doi or not release.extra:
return None
- for registrar in ('crossref', 'datacite', 'jalc'):
+ for registrar in ("crossref", "datacite", "jalc"):
if registrar in release.extra:
return registrar
# TODO: should we default to Crossref?
return None
+
UNWANTED_ABSTRACT_PREFIXES = [
# roughly sort this long to short
- 'Abstract No Abstract ',
- 'Publisher Summary ',
- 'Abstract ',
- 'ABSTRACT ',
- 'Summary ',
- 'Background: ',
- 'Background ',
+ "Abstract No Abstract ",
+ "Publisher Summary ",
+ "Abstract ",
+ "ABSTRACT ",
+ "Summary ",
+ "Background: ",
+ "Background ",
]
+
def scrub_text(raw: str, mimetype: str = None) -> str:
"""
This function takes a mimetype-hinted string and tries to reduce it to a
@@ -201,25 +212,26 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
text = ftfy.fix_text(raw)
# remove HTML
- text = BeautifulSoup(text, 'html.parser').get_text()
+ text = BeautifulSoup(text, "html.parser").get_text()
# TODO: for performance, compile these as globals?
# Three regexes below adapted from Blendle cleaner.py
# https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
- text = re.sub(r'…', '...', text)
- text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
- text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
- text = re.sub(r'\s+', ' ', text).strip()
+ text = re.sub(r"…", "...", text)
+ text = re.sub(r"[`‘’‛⸂⸃⸌⸍⸜⸝]", "'", text)
+ text = re.sub(r"[„“]|(\'\')|(,,)", '"', text)
+ text = re.sub(r"\s+", " ", text).strip()
# hack to remove abstract prefixes
for prefix in UNWANTED_ABSTRACT_PREFIXES:
if text.startswith(prefix):
- text = text[len(prefix):]
+ text = text[len(prefix) :]
break
assert text, "Empty abstract"
return text
+
def contrib_name(contrib: ReleaseContrib) -> str:
# TODO: support more cultural normals for name presentation
if contrib.raw_name:
@@ -231,36 +243,45 @@ def contrib_name(contrib: ReleaseContrib) -> str:
else:
return contrib.given_name
+
def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
# TODO
return None
+
def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
d = dict()
for abst in release.abstracts:
if not abst.lang in d:
- d[abst.lang] = ScholarAbstract(lang_code=abst.lang, body=scrub_text(abst.content))
+ d[abst.lang] = ScholarAbstract(
+ lang_code=abst.lang, body=scrub_text(abst.content)
+ )
return list(d.values())
+
def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
if release.container:
publisher = release.publisher
container_name = release.container.name
- container_original_name = release.container.extra and release.container.extra.get('original_name')
+ container_original_name = (
+ release.container.extra and release.container.extra.get("original_name")
+ )
container_ident = release.container.ident
container_type = release.container.container_type
container_issnl = release.container.issnl
- issns = [container_issnl,]
- if release.extra.get('issne'):
- issns.append(release.extra['issne'])
- if release.extra.get('issnp'):
- issns.append(release.extra['issnp'])
+ issns = [
+ container_issnl,
+ ]
+ if release.extra.get("issne"):
+ issns.append(release.extra["issne"])
+ if release.extra.get("issnp"):
+ issns.append(release.extra["issnp"])
issns = list(set(issns))
else:
- publisher = release.extra.get('publisher')
- container_name = release.extra.get('container_name')
+ publisher = release.extra.get("publisher")
+ container_name = release.extra.get("container_name")
container_original_name = None
container_ident = None
container_type = None
@@ -269,7 +290,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
first_page: Optional[str] = None
if release.pages:
- first_page = release.pages.split('-')[0]
+ first_page = release.pages.split("-")[0]
first_page_int: Optional[int] = None
if first_page and first_page.isdigit():
first_page_int = int(first_page)
@@ -285,7 +306,7 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
release_stage=release.release_stage,
withdrawn_status=release.withdrawn_status,
lang_code=release.language,
- country_code=release.extra and release.extra.get('country'),
+ country_code=release.extra and release.extra.get("country"),
volume=release.volume,
volume_int=None,
issue=release.issue,
@@ -294,7 +315,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
first_page=first_page,
first_page_int=None,
number=release.number,
-
doi=release.ext_ids.doi,
doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
doi_registrar=release_doi_registrar(release),
@@ -305,7 +325,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
arxiv_id=release.ext_ids.arxiv,
jstor_id=release.ext_ids.jstor,
mag_id=release.ext_ids.mag,
-
license_slug=release.license_slug,
publisher=publisher,
container_name=container_name,
@@ -314,14 +333,21 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
container_type=container_type,
container_issnl=container_issnl,
issns=issns,
-
# TODO; these filters sort of meh. refactor to be above?
- contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])),
- contrib_count = len([c for c in release.contribs if c.index]),
- affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
+ contrib_names=list(
+ filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs])
+ ),
+ contrib_count=len([c for c in release.contribs if c.index]),
+ affiliations=list(
+ filter(
+ lambda x: bool(x),
+ [contrib_affiliation(c) for c in release.contribs if c.index],
+ )
+ ),
)
return ret
+
def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
if release.container:
@@ -330,7 +356,7 @@ def es_release_from_release(release: ReleaseEntity) -> ScholarRelease:
container_issnl = release.container.issnl
container_type = release.container.container_type
else:
- container_name = release.extra.get('container_name')
+ container_name = release.extra.get("container_name")
container_ident = None
container_issnl = None
container_type = None