about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-04-03 18:33:17 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2020-04-03 18:33:17 -0700
commit: 3b0bb20ad0b9ce58992a45e1fcf863069119f560 (patch)
tree: 245bcbb274513791b6764d79971286a604c1294f
parent: c5f018e734def3ee274f64cd8bbe2f8974810ffb (diff)
download: fatcat-covid19-3b0bb20ad0b9ce58992a45e1fcf863069119f560.tar.gz
fatcat-covid19-3b0bb20ad0b9ce58992a45e1fcf863069119f560.zip
include ia_pdf_url when available
-rw-r--r-- fatcat_covid19/transform.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 16774ab..3f942ba 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -152,6 +152,10 @@ def fulltext_to_elasticsearch(row, force_bool=True):
t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
if full.get('grobid_xml_path'):
t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
+ for url in full.get('urls', []):
+ if url.get('rel') in ('webarchive', 'archive') and 'archive.org/' in url['url']:
+ t['fulltext']['ia_pdf_url'] = url['url']
+ break
if 'fulltext_grobid' in row:
grobid = row['fulltext_grobid']