diff options
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 16 |
1 file changed, 7 insertions, 9 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 2f6be05..602f9c5 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker): if ingest_type == "pdf" and html_ish_resource: - fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) - # the new style of URL extraction (already computed) - # we aren't quite ready to adopt this for the PDF path (which - # has more complex logic to avoid loops, etc) - #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url: - # fulltext_url = dict( - # pdf_url=html_biblio.pdf_fulltext_url, - # technique="html_biblio", - # ) + if html_biblio and html_biblio.pdf_fulltext_url: + fulltext_url = dict( + pdf_url=html_biblio.pdf_fulltext_url, + technique="html_biblio", + ) + else: + fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) result['extract_next_hop'] = fulltext_url if not fulltext_url: |