about summary refs log tree commit diff stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-12 13:06:31 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-12 13:06:31 -0700
commit f5286affa62c0caf95ff4848626e85aff476737e (patch)
tree 93e2dcb550fa8a057bc02b38e34d32ce0ee0e561 /python/sandcrawler
parent 2469d483326e5e81e774e46fd100888710b9bbc3 (diff)
download sandcrawler-f5286affa62c0caf95ff4848626e85aff476737e.tar.gz
sandcrawler-f5286affa62c0caf95ff4848626e85aff476737e.zip
store no-capture URLs in terminal_url
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/ia.py 2
-rw-r--r-- python/sandcrawler/ingest.py 4
2 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7b623bc..2bc52ce 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -589,7 +589,7 @@ class WaybackClient:
start_url=start_url,
hit=False,
status="no-capture",
- terminal_url=None,
+ terminal_url=next_url,
terminal_dt=None,
terminal_status_code=None,
body=None,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e8e517a..5ab7e13 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -387,7 +387,7 @@ class IngestFileWorker(SandcrawlerWorker):
if not resource.hit:
result['status'] = resource.status
- if resource.terminal_dt and resource.terminal_status_code:
+ if resource.terminal_url:
result['terminal'] = {
"terminal_url": resource.terminal_url,
"terminal_dt": resource.terminal_dt,
@@ -465,7 +465,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "max-hops-exceeded"
return result
- if resource.terminal_dt:
+ if resource.terminal_url:
result['terminal'] = {
"terminal_url": resource.terminal_url,
"terminal_dt": resource.terminal_dt,