diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-26 15:11:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-26 15:11:44 -0700 |
commit | eeb456c16d016d8523023f787597efae7a6317b9 (patch) | |
tree | 6b79a50b0d4d2c150f3d3a3411ba831fd46358ec | |
parent | 51a36f5e6069efedef0fbcd0ba319ced8f28eba4 (diff) | |
download | fatcat-scholar-eeb456c16d016d8523023f787597efae7a6317b9.tar.gz fatcat-scholar-eeb456c16d016d8523023f787597efae7a6317b9.zip |
better parsing of year as integer in refs pipeline
-rw-r--r-- | fatcat_scholar/schema.py | 8 ++++++--
-rw-r--r-- | fatcat_scholar/transform.py | 4 ++--
2 files changed, 8 insertions, 4 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index bc6b016..0fcf56e 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -301,9 +301,12 @@ class RefTarget(BaseModel):
 
 
 def clean_small_int(raw: Optional[str]) -> Optional[int]:
-    if not raw or not raw.isdigit():
+    if not raw or not raw.strip().isdigit():
+        return None
+    try:
+        val = int(raw.strip())
+    except ValueError:
         return None
-    val = int(raw)
     if abs(val) > 30000:
         return None
     return val
@@ -318,6 +321,7 @@ def test_clean_small_int() -> None:
     assert clean_small_int("1200003") == None
     assert clean_small_int("-123") == None
     assert clean_small_int("48844") == None
+    assert clean_small_int("1990²") == None
 
 
 def doi_split_prefix(doi: str) -> str:
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 13bedb9..3a7102a 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -798,8 +798,8 @@ def refs_from_crossref(
             ref_container_name = series_title
 
         year = ref.get("year")
-        if year and year.isdigit():
-            year = int(year)
+        if year:
+            year = clean_small_int(year)
         else:
             year = None
         date = ref.get("date")
```