diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 21:22:21 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 21:22:21 -0700 | 
| commit | 8c75ce9e78dfe4295188d8a57632d1630f987f8e (patch) | |
| tree | 1d6816cbfdd73c640c578e969070eadb19b8b14c | |
| parent | 06adf39b0ab38e08d1977127606b784d1a9aca4f (diff) | |
| download | fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.tar.gz fatcat-scholar-8c75ce9e78dfe4295188d8a57632d1630f987f8e.zip | |
fixes to schema parsing from prod
| -rw-r--r-- | fatcat_scholar/schema.py | 22 | 
1 files changed, 13 insertions, 9 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 35cf9a1..d3cc1fe 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -71,7 +71,7 @@ class ScholarBiblio(BaseModel):
     issue_int: Optional[str]  # TODO: needed?
     pages: Optional[str]
     first_page: Optional[str]
-    first_page_int: Optional[str]  # TODO: needed?
+    first_page_int: Optional[int]  # TODO: needed?
     number: Optional[str]
     doi: Optional[str]
@@ -204,7 +204,7 @@ UNWANTED_ABSTRACT_PREFIXES = [
 ]
-def scrub_text(raw: str, mimetype: str = None) -> str:
+def scrub_text(raw: str, mimetype: str = None) -> Optional[str]:
     """
     This function takes a mimetype-hinted string and tries to reduce it to a
     simple token-and-punctuation scheme with any and all markup removed. Eg,
@@ -234,7 +234,8 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
             text = text[len(prefix) :]
             break
-    assert text, "Empty abstract"
+    if not text:
+        return None
     return text
@@ -258,9 +259,10 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
 def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
     if tei_dict.get("abstract"):
-        return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=scrub_text(tei_dict["abstract"]))]
-    else:
-        return []
+        body = scrub_text(tei_dict["abstract"])
+        if body:
+            return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)]
+    return []
 def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
@@ -268,9 +270,11 @@ def es_abstracts_from_release(release: ReleaseEntity) -> List[ScholarAbstract]:
     d = dict()
     for abst in release.abstracts:
         if abst.lang not in d:
-            d[abst.lang] = ScholarAbstract(
-                lang_code=abst.lang, body=scrub_text(abst.content)
-            )
+            body = scrub_text(abst.content)
+            if body:
+                d[abst.lang] = ScholarAbstract(
+                    lang_code=abst.lang, body=scrub_text(abst.content)
+                )
     return list(d.values())
```
