From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 9 Nov 2021 14:17:31 -0800
Subject: cleanups: tweaks to wayback CDX cleanup scripts

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6c6817ab..6f67c7e1 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -157,7 +157,14 @@ def process_file(fe, session) -> dict:
         if short in full_urls:
             continue
 
-        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        cdx_record = None
+        try:
+            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 403:
+                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
+            else:
+                raise
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
--
cgit v1.2.3