about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 16
1 files changed, 7 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
if ingest_type == "pdf" and html_ish_resource:
- fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
# the new style of URL extraction (already computed)
- # we aren't quite ready to adopt this for the PDF path (which
- # has more complex logic to avoid loops, etc)
- #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
- # fulltext_url = dict(
- # pdf_url=html_biblio.pdf_fulltext_url,
- # technique="html_biblio",
- # )
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['extract_next_hop'] = fulltext_url
if not fulltext_url: