aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/cleanups/scripts/fetch_full_cdx_ts.py2
-rw-r--r--extra/cleanups/wayback_timestamps.md29
2 files changed, 30 insertions, 1 deletions
diff --git a/extra/cleanups/scripts/fetch_full_cdx_ts.py b/extra/cleanups/scripts/fetch_full_cdx_ts.py
index ebcf0d62..93ebbcab 100644
--- a/extra/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/extra/cleanups/scripts/fetch_full_cdx_ts.py
@@ -171,7 +171,7 @@ def process_file(fe, session) -> dict:
assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
status = "success-api"
- break
+ continue
else:
print(f"cdx API found, but no match", file=sys.stderr)
else:
diff --git a/extra/cleanups/wayback_timestamps.md b/extra/cleanups/wayback_timestamps.md
index 9db77058..04c4e555 100644
--- a/extra/cleanups/wayback_timestamps.md
+++ b/extra/cleanups/wayback_timestamps.md
@@ -302,3 +302,32 @@ Looks like the last small tweak was successful! This was with git commit
7583 "fail-not-found"
87 "fail-cdx-403"
+## Follow-up (2021-11-16)
+
+Re-fetched with an updated file export, and also fixed a small one-line bug in
+`fetch_full_cdx_ts.py` which caused it to miss most multi-URL file cleanups.
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | rg 'web.archive.org/web/\d{4,12}/' \
+ | gzip \
+ > files_20211127_moreshortts.json.gz
+ # 112M 0:09:38 [ 193k/s]
+
+ zcat files_20211127_moreshortts.json.gz | wc -l
+ # 29,494
+
+ zcat files_20211127_moreshortts.json.gz \
+ | parallel -j6 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+ | pv -l \
+ | gzip \
+ > files_20211127_moreshortts.fetched.json.gz
+ # 29.5k 0:14:33 [33.8 /s]
+
+ zcat files_20211127_moreshortts.fetched.json.gz | jq .status | sort | uniq -c | sort -nr
+ 21376 "success-api"
+ 7576 "fail-not-found"
+ 439 "success-self"
+ 87 "fail-cdx-403"
+ 16 "success-db"
+