diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 17:54:33 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 17:54:33 -0800 |
commit | 5ba7c9556d9c671184818476b9deb2506a47ef42 (patch) | |
tree | 548187942db1a31207999fae7b50164f03ed707e | |
parent | 2e112935a59993cf930558278362835056897c49 (diff) | |
download | sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.tar.gz sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.zip |
support forwarding url types other than pdf_url
-rw-r--r-- | python/sandcrawler/ingest.py | 9 |
1 file changed, 5 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 591c971..fe07a89 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -188,7 +188,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
 
                 result['html'] = fulltext_url
-                if not fulltext_url or not 'pdf_url' in fulltext_url:
+                if not fulltext_url:
                     result['status'] = 'no-pdf-link'
                     if resource.terminal_dt:
                         result['terminal'] = {
@@ -197,12 +197,13 @@ class IngestFileWorker(SandcrawlerWorker):
                             "terminal_status_code": resource.terminal_status_code,
                         }
                     return result
-                print("\tlanding page URL extracted ({}): {}".format(
+                next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
+                assert next_url
+                print("\tnext hop extracted ({}): {}".format(
                         fulltext_url.get('technique'),
-                        fulltext_url['pdf_url'],
+                        next_url,
                     ),
                     file=sys.stderr)
-                next_url = fulltext_url['pdf_url']
                 if next_url in hops:
                     result['status'] = 'link-loop'
                     result['error_message'] = "repeated: {}".format(next_url)
```