diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:44 -0700 |
commit | eee590e67b80915d2b72d3b213384fd193875242 (patch) | |
tree | c89a2489a84c637c9c9663e8aa7d1aa3f7d17b3a | |
parent | 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (diff) | |
download | sandcrawler-eee590e67b80915d2b72d3b213384fd193875242.tar.gz sandcrawler-eee590e67b80915d2b72d3b213384fd193875242.zip |
cdx: add support for 'closest' time parameter
-rw-r--r-- | python/sandcrawler/ia.py | 12 |
1 file changed, 9 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e6c6295..664bd20 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -232,7 +232,7 @@ class CdxApiClient:
             assert row.status_code == filter_status_code
         return row
 
-    def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+    def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
         """
         Fetches multiple CDX rows for the given URL, tries to find the most recent.
 
@@ -270,7 +270,13 @@ class CdxApiClient:
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
             params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+        if closest:
+            params['closest'] = closest
+            params['sort'] = "closest"
+        print(params)
         rows = self._query_api(params)
+        for r in rows:
+            print(f" {r.datetime}")
         if not rows:
             return None
 
@@ -568,7 +574,7 @@ class WaybackClient:
             else:
                 return None
 
-    def lookup_resource(self, start_url, best_mimetype=None):
+    def lookup_resource(self, start_url, best_mimetype=None, closest=None):
         """
         Looks in wayback for a resource starting at the URL, following any
         redirects. Returns a ResourceResult object, which may indicate a
@@ -596,7 +602,7 @@ class WaybackClient:
         urls_seen = [start_url]
         for i in range(self.max_redirects):
             print(" URL: {}".format(next_url), file=sys.stderr)
-            cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
+            cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
             #print(cdx_row, file=sys.stderr)
             if not cdx_row:
                 return ResourceResult(
```