From a68aadc4107fc68dc2748c52dab8a4bd92cca022 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 21:54:24 -0800 Subject: move some PDF URL extraction into declarative format --- python/sandcrawler/ingest.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'python/sandcrawler/ingest.py') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 2f6be05..602f9c5 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker): if ingest_type == "pdf" and html_ish_resource: - fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) - # the new style of URL extraction (already computed) - # we aren't quite ready to adopt this for the PDF path (which - # has more complex logic to avoid loops, etc) - #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url: - # fulltext_url = dict( - # pdf_url=html_biblio.pdf_fulltext_url, - # technique="html_biblio", - # ) + if html_biblio and html_biblio.pdf_fulltext_url: + fulltext_url = dict( + pdf_url=html_biblio.pdf_fulltext_url, + technique="html_biblio", + ) + else: + fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) result['extract_next_hop'] = fulltext_url if not fulltext_url: -- cgit v1.2.3