about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 22:55:58 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 22:55:58 -0800
commit cd09c6d6bd4deef0627de4f8a8a301725db01e14 (patch)
tree b1e1778a6e9e3580003b61d7c33325fbb7c40fa7
parent ca3aedcefd5c3835b6fa9b685200ef12d635d22c (diff)
download fatcat-cd09c6d6bd4deef0627de4f8a8a301725db01e14.tar.gz
fatcat-cd09c6d6bd4deef0627de4f8a8a301725db01e14.zip
wayback ts cleanup: one more filter tweak
-rw-r--r-- notes/cleanups/scripts/fetch_full_cdx_ts.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index d5b0c476..ebcf0d62 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -115,7 +115,8 @@ def process_file(fe, session) -> dict:
seg = u.split('/')
assert seg[2] == "web.archive.org"
assert seg[3] == "web"
- assert seg[4].isdigit()
+ if not seg[4].isdigit():
+ continue
original_url = "/".join(seg[5:])
if len(seg[4]) == 12 or len(seg[4]) == 4:
short_urls.append(u)