diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 18:33:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 18:33:17 -0700 |
commit | 3b0bb20ad0b9ce58992a45e1fcf863069119f560 (patch) | |
tree | 245bcbb274513791b6764d79971286a604c1294f | |
parent | c5f018e734def3ee274f64cd8bbe2f8974810ffb (diff) | |
download | fatcat-covid19-3b0bb20ad0b9ce58992a45e1fcf863069119f560.tar.gz fatcat-covid19-3b0bb20ad0b9ce58992a45e1fcf863069119f560.zip |
include ia_pdf_url when available
-rw-r--r-- | fatcat_covid19/transform.py | 4 |
1 file changed, 4 insertions, 0 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 16774ab..3f942ba 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -152,6 +152,10 @@ def fulltext_to_elasticsearch(row, force_bool=True): t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path'] if full.get('grobid_xml_path'): t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path'] + for url in full.get('urls', []): + if url.get('rel') in ('webarchive', 'archive') and 'archive.org/' in url['url']: + t['fulltext']['ia_pdf_url'] = url['url'] + break if 'fulltext_grobid' in row: grobid = row['fulltext_grobid'] |