about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-26 15:11:44 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-26 15:11:44 -0700
commit: eeb456c16d016d8523023f787597efae7a6317b9 (patch)
tree: 6b79a50b0d4d2c150f3d3a3411ba831fd46358ec
parent: 51a36f5e6069efedef0fbcd0ba319ced8f28eba4 (diff)
download: fatcat-scholar-eeb456c16d016d8523023f787597efae7a6317b9.tar.gz
download: fatcat-scholar-eeb456c16d016d8523023f787597efae7a6317b9.zip
better parsing of year as integer in refs pipeline
-rw-r--r-- fatcat_scholar/schema.py 8
-rw-r--r-- fatcat_scholar/transform.py 4
2 files changed, 8 insertions, 4 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index bc6b016..0fcf56e 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -301,9 +301,12 @@ class RefTarget(BaseModel):
def clean_small_int(raw: Optional[str]) -> Optional[int]:
- if not raw or not raw.isdigit():
+ if not raw or not raw.strip().isdigit():
+ return None
+ try:
+ val = int(raw.strip())
+ except ValueError:
return None
- val = int(raw)
if abs(val) > 30000:
return None
return val
@@ -318,6 +321,7 @@ def test_clean_small_int() -> None:
assert clean_small_int("1200003") == None
assert clean_small_int("-123") == None
assert clean_small_int("48844") == None
+ assert clean_small_int("1990²") == None
def doi_split_prefix(doi: str) -> str:
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 13bedb9..3a7102a 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -798,8 +798,8 @@ def refs_from_crossref(
ref_container_name = series_title
year = ref.get("year")
- if year and year.isdigit():
- year = int(year)
+ if year:
+ year = clean_small_int(year)
else:
year = None
date = ref.get("date")