diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 22:55:58 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 22:55:58 -0800 |
commit | cd09c6d6bd4deef0627de4f8a8a301725db01e14 (patch) | |
tree | b1e1778a6e9e3580003b61d7c33325fbb7c40fa7 /notes/cleanups/scripts/fetch_full_cdx_ts.py | |
parent | ca3aedcefd5c3835b6fa9b685200ef12d635d22c (diff) | |
download | fatcat-cd09c6d6bd4deef0627de4f8a8a301725db01e14.tar.gz fatcat-cd09c6d6bd4deef0627de4f8a8a301725db01e14.zip |
wayback ts cleanup: one more filter tweak
Diffstat (limited to 'notes/cleanups/scripts/fetch_full_cdx_ts.py')
-rw-r--r-- | notes/cleanups/scripts/fetch_full_cdx_ts.py | 3 |
1 file changed, 2 insertions, 1 deletion
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index d5b0c476..ebcf0d62 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -115,7 +115,8 @@ def process_file(fe, session) -> dict:
         seg = u.split('/')
         assert seg[2] == "web.archive.org"
         assert seg[3] == "web"
-        assert seg[4].isdigit()
+        if not seg[4].isdigit():
+            continue
         original_url = "/".join(seg[5:])
         if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)