summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-08-06 16:21:30 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-06 19:01:12 -0700
commit: c56a5361e3c7ec19e8c461c48c4619e3b62e10fd (patch)
tree: e98456af5198c6edcf4257913bd534ab16bda373
parent: 45a33f49d111d929d4b24321af94060b1942f7ba (diff)
download: fatcat-scholar-c56a5361e3c7ec19e8c461c48c4619e3b62e10fd.tar.gz
fatcat-scholar-c56a5361e3c7ec19e8c461c48c4619e3b62e10fd.zip
don't index sim_page without issue_item and first_page
-rw-r--r-- fatcat_scholar/transform.py | 3
1 files changed, 3 insertions, 0 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index e02043b..a238d81 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -216,6 +216,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if heavy.doc_type == DocType.sim_page:
assert ia_sim is not None
assert heavy.sim_fulltext is not None
+ if not ia_sim.first_page or not ia_sim.issue_item:
+ # can't create a valid key if we don't have these fields, so shouldn't index
+ return None
key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
sim_issue = ia_sim.issue_item
biblio = es_biblio_from_sim(heavy.sim_fulltext)