about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-10-30 15:17:44 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-30 15:17:44 -0700
commit: eee590e67b80915d2b72d3b213384fd193875242 (patch)
tree: c89a2489a84c637c9c9663e8aa7d1aa3f7d17b3a
parent: 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (diff)
download: sandcrawler-eee590e67b80915d2b72d3b213384fd193875242.tar.gz
sandcrawler-eee590e67b80915d2b72d3b213384fd193875242.zip
cdx: add support for 'closest' time parameter
-rw-r--r-- python/sandcrawler/ia.py 12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e6c6295..664bd20 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -232,7 +232,7 @@ class CdxApiClient:
assert row.status_code == filter_status_code
return row
- def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+ def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -270,7 +270,13 @@ class CdxApiClient:
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+ if closest:
+ params['closest'] = closest
+ params['sort'] = "closest"
+ print(params)
rows = self._query_api(params)
+ for r in rows:
+ print(f" {r.datetime}")
if not rows:
return None
@@ -568,7 +574,7 @@ class WaybackClient:
else:
return None
- def lookup_resource(self, start_url, best_mimetype=None):
+ def lookup_resource(self, start_url, best_mimetype=None, closest=None):
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -596,7 +602,7 @@ class WaybackClient:
urls_seen = [start_url]
for i in range(self.max_redirects):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
+ cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
#print(cdx_row, file=sys.stderr)
if not cdx_row:
return ResourceResult(