move some PDF URL extraction into declarative format

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
commit: a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree: da3da0a847d5c10dee873e8bce8198a39c12ce1f /python/sandcrawler/ingest.py
parent: 6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download: sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip
1 file changed, 7 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
 
             if ingest_type == "pdf" and html_ish_resource:
 
-                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
                 # the new style of URL extraction (already computed)
-                # we aren't quite ready to adopt this for the PDF path (which
-                # has more complex logic to avoid loops, etc)
-                #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
-                #    fulltext_url = dict(
-                #        pdf_url=html_biblio.pdf_fulltext_url,
-                #        technique="html_biblio",
-                #    )
+                if html_biblio and html_biblio.pdf_fulltext_url:
+                    fulltext_url = dict(
+                        pdf_url=html_biblio.pdf_fulltext_url,
+                        technique="html_biblio",
+                    )
+                else:
+                    fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
 
                 result['extract_next_hop'] = fulltext_url
                 if not fulltext_url:
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:54:24 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:54:24 -0800
commit	a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree	da3da0a847d5c10dee873e8bce8198a39c12ce1f /python/sandcrawler/ingest.py
parent	6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download	sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip