summary refs log tree commit diff stats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-07-27 16:06:12 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-07-27 16:06:12 -0700
commit c0e6ab2d70a56374aefb93643a76b833e22188c4 (patch)
tree 8478f4fda3432b6ff57a4475374a96781c765578 /fatcat_scholar/work_pipeline.py
parent 622ae627ac39c872103dd837efcc5baec5291e9f (diff)
download fatcat-scholar-c0e6ab2d70a56374aefb93643a76b833e22188c4.tar.gz
fatcat-scholar-c0e6ab2d70a56374aefb93643a76b833e22188c4.zip
pipeline: skip grobid/pdftext lookups when no URL; prefer GROBID to pdftext
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b2ceaf8..720c696 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -283,10 +283,12 @@ class WorkPipeline:
for fe in release.files:
if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
continue
+ if not fe.urls:
+ continue
grobid_fulltext = self.fetch_file_grobid(fe, ident)
pdf_meta = self.fetch_pdf_meta(fe, ident)
pdftotext_fulltext = None
- if pdf_meta:
+ if pdf_meta and not grobid_fulltext:
pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
if grobid_fulltext or pdftotext_fulltext:
break