diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 21:22:21 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 21:22:21 -0700 |
commit | 8c75ce9e78dfe4295188d8a57632d1630f987f8e (patch) | |
tree | 1d6816cbfdd73c640c578e969070eadb19b8b14c | |
parent | 06adf39b0ab38e08d1977127606b784d1a9aca4f (diff) | |
download | fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.tar.gz fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.zip |
fixes to schema parsing from prod
-rw-r--r-- | fatcat_scholar/schema.py | 22 |
1 file changed, 13 insertions, 9 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 35cf9a1..d3cc1fe 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -71,7 +71,7 @@ class ScholarBiblio(BaseModel): issue_int: Optional[str] # TODO: needed? pages: Optional[str] first_page: Optional[str] - first_page_int: Optional[str] # TODO: needed? + first_page_int: Optional[int] # TODO: needed? number: Optional[str] doi: Optional[str] @@ -204,7 +204,7 @@ UNWANTED_ABSTRACT_PREFIXES = [ ] -def scrub_text(raw: str, mimetype: str = None) -> str: +def scrub_text(raw: str, mimetype: str = None) -> Optional[str]: """ This function takes a mimetype-hinted string and tries to reduce it to a simple token-and-punctuation scheme with any and all markup removed. Eg, @@ -234,7 +234,8 @@ def scrub_text(raw: str, mimetype: str = None) -> str: text = text[len(prefix) :] break - assert text, "Empty abstract" + if not text: + return None return text @@ -258,9 +259,10 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]: def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]: if tei_dict.get("abstract"): - return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=scrub_text(tei_dict["abstract"]))] - else: - return [] + body = scrub_text(tei_dict["abstract"]) + if body: + return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)] + return [] def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]: @@ -268,9 +270,11 @@ def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]: d = dict() for abst in release.abstracts: if abst.lang not in d: - d[abst.lang] = ScholarAbstract( - lang_code=abst.lang, body=scrub_text(abst.content) - ) + body = scrub_text(abst.content) + if body: + d[abst.lang] = ScholarAbstract( + lang_code=abst.lang, body=scrub_text(abst.content) + ) return list(d.values()) |