aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-29 15:07:41 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-29 15:07:41 -0800
commitedfcf4b0d56e4ee9a7a77345a49d18fb698e1533 (patch)
tree8a0f1b5c8205b7c175a94b7b33bd4f1d9f9ae920
parentc0f170879b32be98bc0c454a7a7a99dd24125822 (diff)
downloadfatcat-edfcf4b0d56e4ee9a7a77345a49d18fb698e1533.tar.gz
fatcat-edfcf4b0d56e4ee9a7a77345a49d18fb698e1533.zip
update to file short wayback timestamp cleanup
-rw-r--r--extra/cleanups/scripts/fetch_full_cdx_ts.py2
-rw-r--r--extra/cleanups/wayback_timestamps.md29
2 files changed, 30 insertions, 1 deletions
diff --git a/extra/cleanups/scripts/fetch_full_cdx_ts.py b/extra/cleanups/scripts/fetch_full_cdx_ts.py
index ebcf0d62..93ebbcab 100644
--- a/extra/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/extra/cleanups/scripts/fetch_full_cdx_ts.py
@@ -171,7 +171,7 @@ def process_file(fe, session) -> dict:
assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
status = "success-api"
- break
+ continue
else:
print(f"cdx API found, but no match", file=sys.stderr)
else:
diff --git a/extra/cleanups/wayback_timestamps.md b/extra/cleanups/wayback_timestamps.md
index 9db77058..04c4e555 100644
--- a/extra/cleanups/wayback_timestamps.md
+++ b/extra/cleanups/wayback_timestamps.md
@@ -302,3 +302,32 @@ Looks like the last small tweak was successful! This was with git commit
7583 "fail-not-found"
87 "fail-cdx-403"
+## Follow-up (2021-11-16)
+
+Re-fetched using an updated file export, and also fixed a small one-line bug in
+`fetch_full_cdx_ts.py` which caused most multi-URL file cleanups to be missed.
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | rg 'web.archive.org/web/\d{4,12}/' \
+ | gzip \
+ > files_20211127_moreshortts.json.gz
+ # 112M 0:09:38 [ 193k/s]
+
+ zcat files_20211127_moreshortts.json.gz | wc -l
+ # 29,494
+
+ zcat files_20211127_moreshortts.json.gz \
+ | parallel -j6 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+ | pv -l \
+ | gzip \
+ > files_20211127_moreshortts.fetched.json.gz
+ # 29.5k 0:14:33 [33.8 /s]
+
+ zcat files_20211127_moreshortts.fetched.json.gz | jq .status | sort | uniq -c | sort -nr
+ 21376 "success-api"
+ 7576 "fail-not-found"
+ 439 "success-self"
+ 87 "fail-cdx-403"
+ 16 "success-db"
+