From edfcf4b0d56e4ee9a7a77345a49d18fb698e1533 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 29 Nov 2021 15:07:41 -0800 Subject: update to file short wayback timestamp cleanup --- extra/cleanups/scripts/fetch_full_cdx_ts.py | 2 +- extra/cleanups/wayback_timestamps.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/extra/cleanups/scripts/fetch_full_cdx_ts.py b/extra/cleanups/scripts/fetch_full_cdx_ts.py index ebcf0d62..93ebbcab 100644 --- a/extra/cleanups/scripts/fetch_full_cdx_ts.py +++ b/extra/cleanups/scripts/fetch_full_cdx_ts.py @@ -171,7 +171,7 @@ def process_file(fe, session) -> dict: assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" status = "success-api" - break + continue else: print(f"cdx API found, but no match", file=sys.stderr) else: diff --git a/extra/cleanups/wayback_timestamps.md b/extra/cleanups/wayback_timestamps.md index 9db77058..04c4e555 100644 --- a/extra/cleanups/wayback_timestamps.md +++ b/extra/cleanups/wayback_timestamps.md @@ -302,3 +302,32 @@ Looks like the last small tweak was successful! This was with git commit 7583 "fail-not-found" 87 "fail-cdx-403" +## Follow-up (2021-11-16) + +Re-fetched with an updated file export, and also fixed a small one-line bug +in `fetch_full_cdx_ts.py` which caused most multi-URL file cleanups to be missed. 
+ + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211127_moreshortts.json.gz + # 112M 0:09:38 [ 193k/s] + + zcat files_20211127_moreshortts.json.gz | wc -l + # 29,494 + + zcat files_20211127_moreshortts.json.gz \ + | parallel -j6 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211127_moreshortts.fetched.json.gz + # 29.5k 0:14:33 [33.8 /s] + + zcat files_20211127_moreshortts.fetched.json.gz | jq .status | sort | uniq -c | sort -nr + 21376 "success-api" + 7576 "fail-not-found" + 439 "success-self" + 87 "fail-cdx-403" + 16 "success-db" + -- cgit v1.2.3