diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-16 13:08:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-16 13:08:41 -0700 |
commit | 50893d02d205622668098890b0795f9168198caa (patch) | |
tree | 341fec400d86874168ae1a18e1b6b6237bcc961a /python | |
parent | da12c99e0d9cdbdc8868a94f8d78b6cd3b2653fa (diff) | |
download | sandcrawler-50893d02d205622668098890b0795f9168198caa.tar.gz sandcrawler-50893d02d205622668098890b0795f9168198caa.zip |
cdx: tweak CDX lookups and resolution (sort)
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 11 |
1 file changed, 7 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e08031e..65f30e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -325,7 +325,7 @@ class CdxApiClient:
         params: Dict[str, str] = {
             "url": url,
             "matchType": "exact",
-            "limit": "-25",
+            "limit": "-40",
             "output": "json",
             # Collapsing seems efficient, but is complex; would need to include
             # other filters and status code in filter
@@ -336,11 +336,14 @@ class CdxApiClient:
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
             params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+        closest_dt = "00000000"
         if closest:
             if isinstance(closest, datetime.datetime):
-                params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                params["closest"] = closest_dt
             else:
-                params["closest"] = closest
+                closest_dt = closest
+                params["closest"] = closest_dt
             params["sort"] = "closest"
         # print(params, file=sys.stderr)
         rows = self._query_api(params)
@@ -359,7 +362,7 @@ class CdxApiClient:
                 int(0 - (r.status_code or 999)),
                 int(r.mimetype == best_mimetype),
                 int(r.mimetype != "warc/revisit"),
-                int(r.datetime[:6]),
+                r.datetime[:4] == closest_dt[:4],
                 int("/" in r.warc_path),
                 int(r.datetime),
             )
```