aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
commit a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree da3da0a847d5c10dee873e8bce8198a39c12ce1f /python/sandcrawler/ingest.py
parent 6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz
sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip
move some PDF URL extraction into declarative format
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 16
1 files changed, 7 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
if ingest_type == "pdf" and html_ish_resource:
- fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
# the new style of URL extraction (already computed)
- # we aren't quite ready to adopt this for the PDF path (which
- # has more complex logic to avoid loops, etc)
- #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
- # fulltext_url = dict(
- # pdf_url=html_biblio.pdf_fulltext_url,
- # technique="html_biblio",
- # )
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['extract_next_hop'] = fulltext_url
if not fulltext_url: