support forwarding url types other than pdf_url

author: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:54:33 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:54:33 -0800
commit: 5ba7c9556d9c671184818476b9deb2506a47ef42 (patch)
tree: 548187942db1a31207999fae7b50164f03ed707e
parent: 2e112935a59993cf930558278362835056897c49 (diff)
download: sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.tar.gz
sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.zip
1 file changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 591c971..fe07a89 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -188,7 +188,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
                 
                 result['html'] = fulltext_url
-                if not fulltext_url or not 'pdf_url' in fulltext_url:
+                if not fulltext_url:
                     result['status'] = 'no-pdf-link'
                     if resource.terminal_dt:
                         result['terminal'] = {
@@ -197,12 +197,13 @@ class IngestFileWorker(SandcrawlerWorker):
                             "terminal_status_code": resource.terminal_status_code,
                         }
                     return result
-                print("\tlanding page URL extracted ({}): {}".format(
+                next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
+                assert next_url
+                print("\tnext hop extracted ({}): {}".format(
                         fulltext_url.get('technique'),
-                        fulltext_url['pdf_url'],
+                        next_url,
                     ),
                     file=sys.stderr)
-                next_url = fulltext_url['pdf_url']
                 if next_url in hops:
                     result['status'] = 'link-loop'
                     result['error_message'] = "repeated: {}".format(next_url)
author	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:54:33 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:54:33 -0800
commit	5ba7c9556d9c671184818476b9deb2506a47ef42 (patch)
tree	548187942db1a31207999fae7b50164f03ed707e
parent	2e112935a59993cf930558278362835056897c49 (diff)
download	sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.tar.gz sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.zip