From c0e6ab2d70a56374aefb93643a76b833e22188c4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 27 Jul 2020 16:06:12 -0700 Subject: pipeline: skip grobid/pdftext lookups when no URL; prefer GROBID to pdftext --- fatcat_scholar/work_pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index b2ceaf8..720c696 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -283,10 +283,12 @@ class WorkPipeline: for fe in release.files: if not fe.sha1 or fe.mimetype not in (None, "application/pdf"): continue + if not fe.urls: + continue grobid_fulltext = self.fetch_file_grobid(fe, ident) pdf_meta = self.fetch_pdf_meta(fe, ident) pdftotext_fulltext = None - if pdf_meta: + if pdf_meta and not grobid_fulltext: pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident) if grobid_fulltext or pdftotext_fulltext: break -- cgit v1.2.3