diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-07-27 16:06:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-07-27 16:06:12 -0700 |
commit | c0e6ab2d70a56374aefb93643a76b833e22188c4 (patch) | |
tree | 8478f4fda3432b6ff57a4475374a96781c765578 /fatcat_scholar/work_pipeline.py | |
parent | 622ae627ac39c872103dd837efcc5baec5291e9f (diff) | |
download | fatcat-scholar-c0e6ab2d70a56374aefb93643a76b833e22188c4.tar.gz fatcat-scholar-c0e6ab2d70a56374aefb93643a76b833e22188c4.zip |
pipeline: skip grobid/pdftext lookups when no URL; prefer GROBID to pdftext
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 4 |
1 file changed, 3 insertions, 1 deletion
```diff
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b2ceaf8..720c696 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -283,10 +283,12 @@ class WorkPipeline:
             for fe in release.files:
                 if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
                     continue
+                if not fe.urls:
+                    continue
                 grobid_fulltext = self.fetch_file_grobid(fe, ident)
                 pdf_meta = self.fetch_pdf_meta(fe, ident)
                 pdftotext_fulltext = None
-                if pdf_meta:
+                if pdf_meta and not grobid_fulltext:
                     pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
                 if grobid_fulltext or pdftotext_fulltext:
                     break
```