aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-24 10:41:03 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-24 10:41:03 -0800
commit: 15fbaa45003937db9414be729fda9615b960dbe1 (patch)
tree: 8498aea7c09d6d6baa22ffb85d132319ecbace3f
parent: 43d9cc3d87654e6e3bd199fbb01972cc47df863e (diff)
download: sandcrawler-15fbaa45003937db9414be729fda9615b960dbe1.tar.gz
sandcrawler-15fbaa45003937db9414be729fda9615b960dbe1.zip
allow fuzzy revisit matches
-rw-r--r-- python/sandcrawler/ia.py 27
1 files changed, 26 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 945c136..cf99e83 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -98,6 +98,30 @@ def cdx_to_dict(cdx):
d['warc_path'] = cdx.warc_path
return d
+def fuzzy_match_url(left, right):
+ """
+ Matches URLs agnostic of http/https (and maybe other normalizations in the
+ future).
+
+ Returns True if the two strings are equal, or if both contain '://' and
+ are equal after the first '://'. Note that this ignores any scheme, not
+ only http/https. Returns False otherwise.
+ """
+ if left == right:
+ return True
+ if '://' in left and '://' in right:
+ # drop the piece before the first '://' (the scheme) and compare the rest
+ if left.split('://')[1:] == right.split('://')[1:]:
+ return True
+ return False
+
+def test_fuzzy_match_url():
+ """Pins the current behavior of fuzzy_match_url(), including known gaps."""
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
+ # the scheme is ignored entirely, so a non-HTTP scheme matches as well
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+
+ # should probably handle these? (trailing-slash and "www" host variants
+ # are currently treated as different URLs)
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+
class CdxApiError(Exception):
pass
@@ -186,7 +210,8 @@ class CdxApiClient:
return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
- if not (row.url == url and row.datetime == datetime):
+ # allow fuzzy http/https match
+ if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
if retry_sleep:
print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
time.sleep(retry_sleep)