aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-06 16:09:49 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-06 19:01:12 -0700
commit bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e (patch)
tree d7a8e125edfc89fbbffb98298a2d91e7c5356caf
parent 03f8d72d0601a93bf1181a9f469166cf4f26761f (diff)
download fatcat-scholar-bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e.tar.gz
fatcat-scholar-bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e.zip
handle integer conversion and bounding for ES schema
-rw-r--r-- fatcat_scholar/schema.py 31
-rw-r--r-- fatcat_scholar/transform.py 23
2 files changed, 35 insertions, 19 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 8b09ab3..fb3de10 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -178,6 +178,25 @@ class ScholarDoc(BaseModel):
access: List[ScholarAccess]
def clean_small_int(raw: Optional[str]) -> Optional[int]:
    """Parse a small non-negative integer from a string, or return None.

    Args:
        raw: candidate string (for example a volume, issue or page number),
            or None.

    Returns:
        The parsed integer when ``raw`` consists solely of decimal digits and
        the value does not exceed 50000; otherwise None (empty or None input,
        non-numeric text, a leading sign, or an out-of-range value).
    """
    # str.isdigit() is also true for characters such as superscript digits
    # ("²"), which int() rejects with ValueError. str.isdecimal() only accepts
    # characters that int() can actually convert.
    if not raw or not raw.isdecimal():
        return None
    val = int(raw)
    # A digits-only string can never be negative, so only the upper bound
    # needs checking.
    if val > 50000:
        return None
    return val
+
+
def test_clean_small_int() -> None:
    """Exercise clean_small_int with empty, non-numeric, valid, oversized and signed input."""
    # Compare against the None singleton by identity, not equality.
    assert clean_small_int("") is None
    assert clean_small_int(None) is None
    assert clean_small_int("asdf") is None
    assert clean_small_int("iiv") is None
    assert clean_small_int("123") == 123
    assert clean_small_int("1200003") is None
    assert clean_small_int("-123") is None
+
+
def doi_split_prefix(doi: str) -> str:
    """Return the portion of ``doi`` before its first "/".

    When ``doi`` contains no "/", the whole string is returned unchanged.
    """
    prefix, _sep, _suffix = doi.partition("/")
    return prefix
@@ -320,12 +339,6 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
first_page: Optional[str] = None
if release.pages:
first_page = release.pages.split("-")[0]
- first_page_int: Optional[int] = None
- if first_page and first_page.isdigit():
- first_page_int = int(first_page)
- # catch metadata errors which result in ES indexing errors
- if abs(first_page_int) > 1000000:
- first_page_int = None
ret = ScholarBiblio(
release_ident=release.ident,
@@ -340,12 +353,12 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
lang_code=release.language,
country_code=release.extra and release.extra.get("country"),
volume=release.volume,
- volume_int=None,
+ volume_int=clean_small_int(release.volume),
issue=release.issue,
- issue_int=None,
+ issue_int=clean_small_int(release.issue),
pages=release.pages,
first_page=first_page,
- first_page_int=first_page_int,
+ first_page_int=clean_small_int(first_page),
number=release.number,
doi=release.ext_ids.doi,
doi_prefix=release.ext_ids.doi and doi_split_prefix(release.ext_ids.doi),
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7afbce5..e02043b 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -69,8 +69,12 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
first_page = None
if sim["page_texts"]:
- first_page = sim["page_texts"][0]["page_num"]
+ first_page = sim["page_texts"][0]["page_num"] or None
+ first_page_int = clean_small_int(first_page)
+
container_name = sim["pub_item_metadata"]["metadata"]["title"]
+
+ # can't remember what this hack is for...
last_word = container_name.split()[-1]
if len(last_word) == 9 and last_word[4] == "-":
container_name = container_name[:-10]
@@ -81,13 +85,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
issns.append(raw_issn)
volume = issue_meta.get("volume")
- volume_int = None
- if volume and volume.isdigit():
- volume_int = int(volume)
+ volume_int = clean_small_int(volume)
issue = issue_meta.get("issue")
- issue_int = None
- if issue and issue.isdigit():
- issue_int = int(issue)
+ issue_int = clean_small_int(issue)
date = issue_meta.get("date")
release_year = None
@@ -102,6 +102,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
except ValueError:
pass
+ if release_year and abs(release_year) > 2050:
+ release_year = None
+
return ScholarBiblio(
# release_ident=release.ident,
title=None,
@@ -122,16 +125,16 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
issue_int=issue_int,
pages=sim.get("pages"),
first_page=first_page,
- first_page_int=None,
+ first_page_int=first_page_int,
# number=None,
# no external identifiers
# license_slug=release.license_slug,
publisher=issue_meta.get("publisher") or pub_meta.get("publisher"),
container_name=container_name,
container_original_name=None,
- container_ident=None,
+ container_ident=None, # TODO
container_type=None, # TODO
- container_issnl=None,
+ container_issnl=None, # TODO
issns=issns,
# no contrib/affiliation info
contrib_names=[],