From cd09c6d6bd4deef0627de4f8a8a301725db01e14 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 9 Nov 2021 22:55:58 -0800
Subject: wayback ts cleanup: one more filter tweak

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index d5b0c476..ebcf0d62 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -115,7 +115,8 @@ def process_file(fe, session) -> dict:
         seg = u.split('/')
         assert seg[2] == "web.archive.org"
         assert seg[3] == "web"
-        assert seg[4].isdigit()
+        if not seg[4].isdigit():
+            continue
         original_url = "/".join(seg[5:])
         if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)
-- 
cgit v1.2.3