summary | refs | log | tree | commit | diff | stats
path: root/notes/cleanups
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:31 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:35 -0800
commit: 86e6850e70617e1609b79e0ee4bfe2a26f7f992e (patch)
tree: deb5724e1c611e9b6aa0a020d5c0ead34e74853b /notes/cleanups
parent: ad050445ac4f3e218ec101790bbf187731646361 (diff)
download: fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.tar.gz
fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.zip
cleanups: tweaks to wayback CDX cleanup scripts
Diffstat (limited to 'notes/cleanups')
-rw-r--r-- notes/cleanups/scripts/fetch_full_cdx_ts.py | 9
1 files changed, 8 insertions, 1 deletions
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6c6817ab..6f67c7e1 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -157,7 +157,14 @@ def process_file(fe, session) -> dict:
if short in full_urls:
continue
- cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+ cdx_record = None
+ try:
+ cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
+ else:
+ raise
if cdx_record:
if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()