aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:54:33 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:54:33 -0800
commit 5ba7c9556d9c671184818476b9deb2506a47ef42 (patch)
tree 548187942db1a31207999fae7b50164f03ed707e
parent 2e112935a59993cf930558278362835056897c49 (diff)
download sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.tar.gz
sandcrawler-5ba7c9556d9c671184818476b9deb2506a47ef42.zip
support forwarding url types other than pdf_url
-rw-r--r-- python/sandcrawler/ingest.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 591c971..fe07a89 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -188,7 +188,7 @@ class IngestFileWorker(SandcrawlerWorker):
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['html'] = fulltext_url
- if not fulltext_url or not 'pdf_url' in fulltext_url:
+ if not fulltext_url:
result['status'] = 'no-pdf-link'
if resource.terminal_dt:
result['terminal'] = {
@@ -197,12 +197,13 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_status_code": resource.terminal_status_code,
}
return result
- print("\tlanding page URL extracted ({}): {}".format(
+ next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
+ assert next_url
+ print("\tnext hop extracted ({}): {}".format(
fulltext_url.get('technique'),
- fulltext_url['pdf_url'],
+ next_url,
),
file=sys.stderr)
- next_url = fulltext_url['pdf_url']
if next_url in hops:
result['status'] = 'link-loop'
result['error_message'] = "repeated: {}".format(next_url)