diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:54:24 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:54:24 -0800 |
commit | a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch) | |
tree | da3da0a847d5c10dee873e8bce8198a39c12ce1f /python/sandcrawler/ingest.py | |
parent | 6a701f966b8bc760bf904c0569562b0159e13559 (diff) | |
download | sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip |
move some PDF URL extraction into declarative format
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 16 |
1 files changed, 7 insertions, 9 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
             if ingest_type == "pdf" and html_ish_resource:
-                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-                # the new style of URL extraction (already computed)
-                # we aren't quite ready to adopt this for the PDF path (which
-                # has more complex logic to avoid loops, etc)
-                #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
-                #    fulltext_url = dict(
-                #        pdf_url=html_biblio.pdf_fulltext_url,
-                #        technique="html_biblio",
-                #    )
+                if html_biblio and html_biblio.pdf_fulltext_url:
+                    fulltext_url = dict(
+                        pdf_url=html_biblio.pdf_fulltext_url,
+                        technique="html_biblio",
+                    )
+                else:
+                    fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
                 result['extract_next_hop'] = fulltext_url
                 if not fulltext_url:
```