about | summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-08-06 16:09:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-06 19:01:12 -0700
commit: bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e (patch)
tree: d7a8e125edfc89fbbffb98298a2d91e7c5356caf /fatcat_scholar/transform.py
parent: 03f8d72d0601a93bf1181a9f469166cf4f26761f (diff)
download: fatcat-scholar-bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e.tar.gz
fatcat-scholar-bada18b7cc6a46b1f18fba9e6550c30c1f8ac00e.zip
handle integer conversion and bounding for ES schema
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- fatcat_scholar/transform.py 23
1 files changed, 13 insertions, 10 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7afbce5..e02043b 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -69,8 +69,12 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
first_page = None
if sim["page_texts"]:
- first_page = sim["page_texts"][0]["page_num"]
+ first_page = sim["page_texts"][0]["page_num"] or None
+ first_page_int = clean_small_int(first_page)
+
container_name = sim["pub_item_metadata"]["metadata"]["title"]
+
+ # can't remember what this hack is for...
last_word = container_name.split()[-1]
if len(last_word) == 9 and last_word[4] == "-":
container_name = container_name[:-10]
@@ -81,13 +85,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
issns.append(raw_issn)
volume = issue_meta.get("volume")
- volume_int = None
- if volume and volume.isdigit():
- volume_int = int(volume)
+ volume_int = clean_small_int(volume)
issue = issue_meta.get("issue")
- issue_int = None
- if issue and issue.isdigit():
- issue_int = int(issue)
+ issue_int = clean_small_int(issue)
date = issue_meta.get("date")
release_year = None
@@ -102,6 +102,9 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
except ValueError:
pass
+ if release_year and abs(release_year) > 2050:
+ release_year = None
+
return ScholarBiblio(
# release_ident=release.ident,
title=None,
@@ -122,16 +125,16 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
issue_int=issue_int,
pages=sim.get("pages"),
first_page=first_page,
- first_page_int=None,
+ first_page_int=first_page_int,
# number=None,
# no external identifiers
# license_slug=release.license_slug,
publisher=issue_meta.get("publisher") or pub_meta.get("publisher"),
container_name=container_name,
container_original_name=None,
- container_ident=None,
+ container_ident=None, # TODO
container_type=None, # TODO
- container_issnl=None,
+ container_issnl=None, # TODO
issns=issns,
# no contrib/affiliation info
contrib_names=[],