From edfcf4b0d56e4ee9a7a77345a49d18fb698e1533 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 29 Nov 2021 15:07:41 -0800 Subject: update to file short wayback timestamp cleanup --- extra/cleanups/wayback_timestamps.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'extra/cleanups/wayback_timestamps.md') diff --git a/extra/cleanups/wayback_timestamps.md b/extra/cleanups/wayback_timestamps.md index 9db77058..04c4e555 100644 --- a/extra/cleanups/wayback_timestamps.md +++ b/extra/cleanups/wayback_timestamps.md @@ -302,3 +302,32 @@ Looks like the last small tweak was successful! This was with git commit 7583 "fail-not-found" 87 "fail-cdx-403" +## Follow-up (2021-11-16) + +Re-fetched with an updated file export, and also fixed a small one-line bug +in `fetch_full_cdx_ts.py` that caused it to miss most multi-URL file cleanups. + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211127_moreshortts.json.gz + # 112M 0:09:38 [ 193k/s] + + zcat files_20211127_moreshortts.json.gz | wc -l + # 29,494 + + zcat files_20211127_moreshortts.json.gz \ + | parallel -j6 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211127_moreshortts.fetched.json.gz + # 29.5k 0:14:33 [33.8 /s] + + zcat files_20211127_moreshortts.fetched.json.gz | jq .status | sort | uniq -c | sort -nr + 21376 "success-api" + 7576 "fail-not-found" + 439 "success-self" + 87 "fail-cdx-403" + 16 "success-db" + -- cgit v1.2.3