summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-29 21:22:21 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-29 21:22:21 -0700
commit: 8c75ce9e78dfe4295188d8a57632d1630f987f8e (patch)
tree: 1d6816cbfdd73c640c578e969070eadb19b8b14c /fatcat_scholar/schema.py
parent: 06adf39b0ab38e08d1977127606b784d1a9aca4f (diff)
download: fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.tar.gz
fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.zip
fixes to schema parsing from prod
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 22
1 files changed, 13 insertions, 9 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 35cf9a1..d3cc1fe 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -71,7 +71,7 @@ class ScholarBiblio(BaseModel):
issue_int: Optional[str] # TODO: needed?
pages: Optional[str]
first_page: Optional[str]
- first_page_int: Optional[str] # TODO: needed?
+ first_page_int: Optional[int] # TODO: needed?
number: Optional[str]
doi: Optional[str]
@@ -204,7 +204,7 @@ UNWANTED_ABSTRACT_PREFIXES = [
]
-def scrub_text(raw: str, mimetype: str = None) -> str:
+def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
"""
This function takes a mimetype-hinted string and tries to reduce it to a
simple token-and-punctuation scheme with any and all markup removed. Eg,
@@ -234,7 +234,8 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
text = text[len(prefix) :]
break
- assert text, "Empty abstract"
+ if not text:
+ return None
return text
@@ -258,9 +259,10 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
if tei_dict.get("abstract"):
- return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=scrub_text(tei_dict["abstract"]))]
- else:
- return []
+ body = scrub_text(tei_dict["abstract"])
+ if body:
+ return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)]
+ return []
def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
@@ -268,9 +270,11 @@ def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
d = dict()
for abst in release.abstracts:
if abst.lang not in d:
- d[abst.lang] = ScholarAbstract(
- lang_code=abst.lang, body=scrub_text(abst.content)
- )
+ body = scrub_text(abst.content)
+ if body:
+ d[abst.lang] = ScholarAbstract(
+ lang_code=abst.lang, body=scrub_text(abst.content)
+ )
return list(d.values())