From c2cdf60d509e380029f6e2566fc4f98eff4b9f1a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 17:07:54 -0700 Subject: wayback timestamps: updates to handle 4-digit case --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 16 ++--- notes/cleanups/wayback_timestamps.md | 103 +++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 5ffd11cb..6c6817ab 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -117,7 +117,7 @@ def process_file(fe, session) -> dict: assert seg[3] == "web" assert seg[4].isdigit() original_url = "/".join(seg[5:]) - if len(seg[4]) == 12: + if len(seg[4]) == 12 or len(seg[4]) == 4: short_urls.append(u) elif len(seg[4]) == 14: self_urls[original_url] = u @@ -131,14 +131,14 @@ def process_file(fe, session) -> dict: for short in list(set(short_urls)): seg = short.split('/') ts = seg[4] - assert len(ts) == 12 and ts.isdigit() + assert len(ts) in [12,4] and ts.isdigit() original_url = '/'.join(seg[5:]) - if original_url in full_urls: + if short in full_urls: continue if original_url in self_urls: - full_urls[original_url] = self_urls[original_url] + full_urls[short] = self_urls[original_url] status = "success-self" continue @@ -146,7 +146,7 @@ def process_file(fe, session) -> dict: for cdx_row in cdx_row_list: if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" status = "success-db" break else: @@ -154,14 +154,14 @@ def process_file(fe, session) -> dict: pass cdx_row = None - if original_url in full_urls: + if short in full_urls: continue 
cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" status = "success-api" break else: @@ -169,7 +169,7 @@ def process_file(fe, session) -> dict: else: print(f"no CDX API record found: {original_url}", file=sys.stderr) - if original_url not in full_urls: + if short not in full_urls: return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found") return dict( diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index c70ec5b2..81785992 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -26,14 +26,53 @@ Filter to files with problem of interest: Wow, this is a lot more than I thought! 
+There might also be some other short URL patterns, check for those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{1,11}/' \ + | gzip \ + > files_20211007_veryshortts.json.gz + # skipped, merging with below + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/None/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:06 [0.00 /s] + # whew, that pattern has been fixed it seems + + zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null + # 2.00 0:10:01 [3.33m/s] + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/\d{13}/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:09 [0.00 /s] + +Yes, 4-digit is a popular pattern as well, need to handle those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211007_moreshortts.json.gz + # 111M 0:13:22 [ 139k/s] + + zcat files_20211007_moreshortts.json.gz | wc -l + + zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json + # 9,958,854 + ## Fetch Complete URL Want to export JSON like: file_entity [existing file entity] - full_urls[] - : + full_urls[]: list of Dicts[str,str] + : status: str Status one of: @@ -41,5 +80,63 @@ Status one of: - 'success-self': the file already has a fixed URL internally - 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched - 'success-cdx': CDX API lookup succeeded, and SHA1 matched -- 'fail-hash': found a CDX record, but wrong hash - 'fail-not-found': no matching CDX record found + +Ran over a sample: + + cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 5 "fail-not-found" + 576 "success-api" + 7212 "success-db" + 2207 "success-self" + + head -n1000 | ./fetch_full_cdx_ts.py > sample_out.json + + zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c + 2 "fail-not-found" + 168 "success-api" + 208 
"success-db" + 622 "success-self" + +Investigating the "fail-not-found", they look like http/https URL +not-exact-matches. Going to put off handling these for now because it is a +small fraction and more delicate. + +Again with the broader set: + + cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 9 "fail-not-found" + 781 "success-api" + 6175 "success-db" + 3035 "success-self" + + +## Cleanup Process + +Other possible cleanups to run at the same time, which would not require +external requests or other context: + +- URL has ://archive.org/ link with rel=repository => rel=archive +- mimetype is bogus => clean mimetype +- bogus file => set some new extra field, like scope=stub or scope=partial (?) + +It looks like the rel swap is already implemented in `generic_file_cleanups()`. +From sampling it seems like the mimetype issue is pretty small, so not going to +bite that off now. The "bogus file" issue requires thought, so also skipping. + +## Commands + +Running with 8x parallelism to not break things; expecting some errors along +the way, may need to add handlers for connection errors etc: + + zcat files_20211007_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211007_moreshortts.fetched.json.gz + +At 300 records/sec, this should take around 9-10 hours to process. -- cgit v1.2.3