From 3b0bb20ad0b9ce58992a45e1fcf863069119f560 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 18:33:17 -0700 Subject: include ia_pdf_url when available --- fatcat_covid19/transform.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 16774ab..3f942ba 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -152,6 +152,10 @@ def fulltext_to_elasticsearch(row, force_bool=True): t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path'] if full.get('grobid_xml_path'): t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path'] + for url in full.get('urls', []): + if url.get('rel') in ('webarchive', 'archive') and 'archive.org/' in url['url']: + t['fulltext']['ia_pdf_url'] = url['url'] + break if 'fulltext_grobid' in row: grobid = row['fulltext_grobid'] -- cgit v1.2.3