From bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 6 Aug 2020 16:09:49 -0700 Subject: handle integer conversion and bounding for ES schema --- fatcat_scholar/schema.py | 31 ++++++++++++++++++++++--------- fatcat_scholar/transform.py | 23 +++++++++++++---------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 8b09ab3..fb3de10 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -178,6 +178,25 @@ class ScholarDoc(BaseModel): access: List[ScholarAccess] +def clean_small_int(raw: Optional[str]) -> Optional[int]: + if not raw or not raw.isdigit(): + return None + val = int(raw) + if abs(val) > 50000: + return None + return val + + +def test_clean_small_int() -> None: + assert clean_small_int("") == None + assert clean_small_int(None) == None + assert clean_small_int("asdf") == None + assert clean_small_int("iiv") == None + assert clean_small_int("123") == 123 + assert clean_small_int("1200003") == None + assert clean_small_int("-123") == None + + def doi_split_prefix(doi: str) -> str: return doi.split("/")[0] @@ -320,12 +339,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: first_page: Optional[str] = None if release.pages: first_page = release.pages.split("-")[0] - first_page_int: Optional[int] = None - if first_page and first_page.isdigit(): - first_page_int = int(first_page) - # catch metadata errors which result in ES indexing errors - if abs(first_page_int) > 1000000: - first_page_int = None ret = ScholarBiblio( release_ident=release.ident, @@ -340,12 +353,12 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio: lang_code=release.language, country_code=release.extra and release.extra.get("country"), volume=release.volume, - volume_int=None, + volume_int=clean_small_int(release.volume), issue=release.issue, - issue_int=None, + issue_int=clean_small_int(release.issue), pages=release.pages, first_page=first_page, - first_page_int=first_page_int, + first_page_int=clean_small_int(first_page), number=release.number, doi=release.ext_ids.doi, doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi), diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 7afbce5..e02043b 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -69,8 +69,12 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: first_page = None if sim["page_texts"]: - first_page = sim["page_texts"][0]["page_num"] + first_page = sim["page_texts"][0]["page_num"] or None + first_page_int = clean_small_int(first_page) + container_name = sim["pub_item_metadata"]["metadata"]["title"] + + # can't remember what this hack is for... last_word = container_name.split()[-1] if len(last_word) == 9 and last_word[4] == "-": container_name = container_name[:-10] @@ -81,13 +85,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: issns.append(raw_issn) volume = issue_meta.get("volume") - volume_int = None - if volume and volume.isdigit(): - volume_int = int(volume) + volume_int = clean_small_int(volume) issue = issue_meta.get("issue") - issue_int = None - if issue and issue.isdigit(): - issue_int = int(issue) + issue_int = clean_small_int(issue) date = issue_meta.get("date") release_year = None @@ -102,6 +102,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: except ValueError: pass + if release_year and abs(release_year) > 2050: + release_year = None + return ScholarBiblio( # release_ident=release.ident, title=None, @@ -122,16 +125,16 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: issue_int=issue_int, pages=sim.get("pages"), first_page=first_page, - first_page_int=None, + first_page_int=first_page_int, # number=None, # no external identifiers # license_slug=release.license_slug, publisher=issue_meta.get("publisher") or pub_meta.get("publisher"), container_name=container_name, container_original_name=None, - container_ident=None, + container_ident=None, # TODO container_type=None, # TODO - container_issnl=None, + container_issnl=None, # TODO issns=issns, # no contrib/affiliation info contrib_names=[], -- cgit v1.2.3