wayback timestamps: updates to handle 4-digit case

author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-04 17:07:54 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:35 -0800
commit: c2cdf60d509e380029f6e2566fc4f98eff4b9f1a (patch)
tree: 8feeaf39b540c6a224b6d26814a49f987dfb993a /notes/cleanups
parent: 1927a7da466164010f0a6467f4df0c887ba00ad3 (diff)
download: fatcat-c2cdf60d509e380029f6e2566fc4f98eff4b9f1a.tar.gz
fatcat-c2cdf60d509e380029f6e2566fc4f98eff4b9f1a.zip
2 files changed, 108 insertions, 11 deletions
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 5ffd11cb..6c6817ab 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -117,7 +117,7 @@ def process_file(fe, session) -> dict:
         assert seg[3] == "web"
         assert seg[4].isdigit()
         original_url = "/".join(seg[5:])
-        if len(seg[4]) == 12:
+        if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)
         elif len(seg[4]) == 14:
             self_urls[original_url] = u
@@ -131,14 +131,14 @@ def process_file(fe, session) -> dict:
     for short in list(set(short_urls)):
         seg = short.split('/')
         ts = seg[4]
-        assert len(ts) == 12 and ts.isdigit()
+        assert len(ts) in [12,4] and ts.isdigit()
         original_url = '/'.join(seg[5:])
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         if original_url in self_urls:
-            full_urls[original_url] = self_urls[original_url]
+            full_urls[short] = self_urls[original_url]
             status = "success-self"
             continue
 
@@ -146,7 +146,7 @@ def process_file(fe, session) -> dict:
         for cdx_row in cdx_row_list:
             if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
                 assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
                 status = "success-db"
                 break
             else:
@@ -154,14 +154,14 @@ def process_file(fe, session) -> dict:
                 pass
         cdx_row = None
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
                 status = "success-api"
                 break
             else:
@@ -169,7 +169,7 @@ def process_file(fe, session) -> dict:
         else:
             print(f"no CDX API record found: {original_url}", file=sys.stderr)
 
-        if original_url not in full_urls:
+        if short not in full_urls:
             return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
 
     return dict(
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index c70ec5b2..81785992 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -26,14 +26,53 @@ Filter to files with problem of interest:
 
 Wow, this is a lot more than I thought!
 
+There might also be some other short URL patterns, check for those:
+
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{1,11}/' \
+        | gzip \
+        > files_20211007_veryshortts.json.gz
+    # skipped, merging with below
+
+    zcat file_export.json.gz \
+        | rg 'web.archive.org/web/None/' \
+        | pv -l \
+        > /dev/null
+    # 0.00  0:10:06 [0.00 /s]
+    # whew, that pattern has been fixed it seems
+
+    zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null
+    # 2.00  0:10:01 [3.33m/s]
+
+    zcat file_export.json.gz \
+        | rg 'web.archive.org/web/\d{13}/' \
+        | pv -l \
+        > /dev/null
+    # 0.00  0:10:09 [0.00 /s]
+
+Yes, 4-digit is a popular pattern as well, need to handle those:
+
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{4,12}/' \
+        | gzip \
+        > files_20211007_moreshortts.json.gz
+    # 111M 0:13:22 [ 139k/s]
+
+    zcat files_20211007_moreshortts.json.gz | wc -l
+
+    zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json
+    # 9,958,854
+
 ## Fetch Complete URL
 
 Want to export JSON like:
 
     file_entity
         [existing file entity]
-    full_urls[]
-        <short>: <long>
+    full_urls: Dict[str,str]
+        <short_url>: <full_url>
     status: str
 
 Status one of:
@@ -41,5 +80,63 @@ Status one of:
 - 'success-self': the file already has a fixed URL internally
 - 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched
 - 'success-api': CDX API lookup succeeded, and SHA1 matched
-- 'fail-hash': found a CDX record, but wrong hash
 - 'fail-not-found': no matching CDX record found
+
+Ran over a sample:
+
+    cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json
+
+    cat sample_out.json | jq .status | sort | uniq -c
+          5 "fail-not-found"
+        576 "success-api"
+       7212 "success-db"
+       2207 "success-self"
+
+    head -n1000  | ./fetch_full_cdx_ts.py > sample_out.json
+
+    zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c
+          2 "fail-not-found"
+        168 "success-api"
+        208 "success-db"
+        622 "success-self"
+
+Investigating the "fail-not-found" cases: they look like http/https URL
+mismatches (not exact matches). Going to put off handling these for now
+because they are a small fraction and more delicate to fix.
+
+Again with the broader set:
+
+    cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json
+
+    cat sample_out.json | jq .status | sort | uniq -c
+          9 "fail-not-found"
+        781 "success-api"
+       6175 "success-db"
+       3035 "success-self"
+
+
+## Cleanup Process
+
+Other possible cleanups to run at the same time, which would not require
+external requests or other context:
+
+- URL has ://archive.org/ link with rel=repository => rel=archive
+- mimetype is bogus => clean mimetype
+- bogus file => set some new extra field, like scope=stub or scope=partial (?)
+
+It looks like the rel swap is already implemented in `generic_file_cleanups()`.
+From sampling it seems like the mimetype issue is pretty small, so not going to
+bite that off now. The "bogus file" issue requires thought, so also skipping.
+
+## Commands
+
+Running with 8x parallelism to not break things; expecting some errors along
+the way, may need to add handlers for connection errors etc:
+
+    zcat files_20211007_moreshortts.json.gz \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211007_moreshortts.fetched.json.gz
+
+At 300 records/sec, this should take around 9-10 hours to process.
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-04 17:07:54 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-09 14:17:35 -0800
commit	c2cdf60d509e380029f6e2566fc4f98eff4b9f1a (patch)
tree	8feeaf39b540c6a224b6d26814a49f987dfb993a /notes/cleanups
parent	1927a7da466164010f0a6467f4df0c887ba00ad3 (diff)
download	fatcat-c2cdf60d509e380029f6e2566fc4f98eff4b9f1a.tar.gz fatcat-c2cdf60d509e380029f6e2566fc4f98eff4b9f1a.zip