From 767018eae6c628e0add27a0f187327b25d8569dc Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 26 Nov 2018 19:26:02 -0800
Subject: fix file extraction (and transforms)

---
 python/fatcat_tools/transforms.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 87facd57..516b68ae 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -69,16 +69,16 @@ def release_to_elasticsearch(release):
     in_ia = False
     t['file_pdf_url'] = None
     for f in files:
-        is_pdf = 'pdf' in f.get('mimetype', '')
-        for url in f.get('urls', []):
-            if url.get('rel', '') == 'webarchive':
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for url in (f.urls or []):
+            if url.rel == 'webarchive':
                 in_wa = True
-            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+            if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
                 in_ia = True
                 if is_pdf:
-                    t['file_pdf_url'] = url['url']
+                    t['file_pdf_url'] = url.url
             if not t['file_pdf_url'] and is_pdf:
-                t['file_pdf_url'] = url['url']
+                t['file_pdf_url'] = url.url
     t['file_in_webarchive'] = in_wa
     t['file_in_ia'] = in_ia
 
--
cgit v1.2.3