about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-04-12 12:14:29 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-04-12 12:14:29 -0700
commit 4f40ea7d0cb19ceac15c28b61c479a66895cea2d (patch)
tree d375de3b20859d77ea1504e78e7ab2687be9d2e1
parent 5df87216af1b6feb741791f33814cd82acad1d79 (diff)
download arabesque-4f40ea7d0cb19ceac15c28b61c479a66895cea2d.tar.gz
arabesque-4f40ea7d0cb19ceac15c28b61c479a66895cea2d.zip
add loop detection in backward processing
-rwxr-xr-x arabesque.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/arabesque.py b/arabesque.py
index 429042d..07c05b5 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -427,12 +427,17 @@ def backward(log_file, map_db, output_db, hit_mimetypes=FULLTEXT_MIMETYPES):
counts['skip-map-scope'] += 1
continue
row = final_row
+ loop_stack = []
while row and row.referrer_url != None:
next_row = lookup_referrer_row(m, row.referrer_url)
if next_row:
row = next_row
else:
break
+ if row.referrer_url in loop_stack:
+ counts['map-url-redirect-loop'] += 1
+ break
+ loop_stack.append(row.referrer_url)
initial_domain = urllib3.util.parse_url(row.url).host
final_domain = urllib3.util.parse_url(final_row.url).host