From a68aadc4107fc68dc2748c52dab8a4bd92cca022 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Sun, 8 Nov 2020 21:54:24 -0800
Subject: move some PDF URL extraction into declarative format

---
 python/sandcrawler/ingest.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'python/sandcrawler/ingest.py')

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
 
             if ingest_type == "pdf" and html_ish_resource:
 
-                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
                 # the new style of URL extraction (already computed)
-                # we aren't quite ready to adopt this for the PDF path (which
-                # has more complex logic to avoid loops, etc)
-                #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
-                #    fulltext_url = dict(
-                #        pdf_url=html_biblio.pdf_fulltext_url,
-                #        technique="html_biblio",
-                #    )
+                if html_biblio and html_biblio.pdf_fulltext_url:
+                    fulltext_url = dict(
+                        pdf_url=html_biblio.pdf_fulltext_url,
+                        technique="html_biblio",
+                    )
+                else:
+                    fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
 
                 result['extract_next_hop'] = fulltext_url
                 if not fulltext_url:
-- 
cgit v1.2.3