From 1927a7da466164010f0a6467f4df0c887ba00ad3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 14:00:56 -0700 Subject: start work on wayback short-timestamp cleanup --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 193 ++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 notes/cleanups/scripts/fetch_full_cdx_ts.py (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py new file mode 100644 index 00000000..5ffd11cb --- /dev/null +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +from typing import Optional, List + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], + session: requests.Session = None, +) -> requests.Session: + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + +def b32_hex(s: str) -> str: + """ + Converts a base32-encoded SHA-1 checksum into hex-encoded + + base32 checksums are used by, eg, heritrix and in wayback CDX files + """ + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + if len(s) == 40: + return s + raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + +def get_db_cdx(url: str, http_session) -> List[dict]: + resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." 
+ url)) + resp.raise_for_status() + rows = resp.json() + return rows or [] + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + +def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]: + + params = { + "url": url, + "from": partial_dt, + "to": partial_dt, + "matchType": "exact", + "output": "json", + "limit": 20, + # can't filter status because might be warc/revisit + #"filter": "statuscode:200", + } + resp = http_session.get(CDX_API_URL, params=params) + resp.raise_for_status() + rows = resp.json() + + if not rows: + return None + #print(rows, file=sys.stderr) + if len(rows) < 2: + return None + + for raw in rows[1:]: + record = dict( + surt=raw[0], + datetime=raw[1], + url=raw[2], + mimetype=raw[3], + status_code=raw[4], + sha1b32=raw[5], + sha1hex=b32_hex(raw[5]), + ) + if record['url'] != url: + # TODO: could allow HTTP/HTTPS fuzzy match + print("CDX API near match: URL", file=sys.stderr) + continue + if not record['datetime'].startswith(partial_dt): + print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr) + continue + if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'): + return record + else: + print(f"CDX API near match: status {record['status_code']}", file=sys.stderr) + return None + +def process_file(fe, session) -> dict: + short_urls = [] + self_urls = dict() + full_urls = dict() + status = "unknown" + + for pair in fe['urls']: + u = pair['url'] + if not '://web.archive.org/web/' in u: + continue + seg = u.split('/') + assert seg[2] == "web.archive.org" + assert seg[3] == "web" + assert seg[4].isdigit() + original_url = "/".join(seg[5:]) + if len(seg[4]) == 12: + short_urls.append(u) + elif len(seg[4]) == 14: + self_urls[original_url] = u + else: + print(f"other bogus ts: {seg[4]}", file=sys.stderr) + return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts") + + if len(short_urls) == 0: + return dict(file_entity=fe, full_urls=[], status="skip-no-shorts") + + for short in list(set(short_urls)): + seg = short.split('/') + ts = seg[4] + assert len(ts) == 12 and ts.isdigit() + original_url = '/'.join(seg[5:]) + + if original_url in full_urls: + continue + + if original_url in self_urls: + full_urls[original_url] = self_urls[original_url] + status = "success-self" + continue + + cdx_row_list = get_db_cdx(original_url, http_session=session) + for cdx_row in cdx_row_list: + if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): + assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() + full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + status = "success-db" + break + else: + #print(f"cdx DB found, but no match", file=sys.stderr) + pass + cdx_row = None + + if original_url in full_urls: + continue + + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + if cdx_record: + if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): + assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() + full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + status = "success-api" + break + else: + print(f"cdx API found, but no match", file=sys.stderr) + else: + print(f"no CDX API record found: {original_url}", file=sys.stderr) + + if original_url not in full_urls: + return dict(file_entity=fe, 
full_urls=full_urls, status="fail-not-found") + + return dict( + file_entity=fe, + full_urls=full_urls, + status=status, + ) + +def main(): + session = requests_retry_session() + session.headers.update({ + "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", + }) + for line in sys.stdin: + if not line.strip(): + continue + fe = json.loads(line) + print(json.dumps(process_file(fe, session=session))) + +if __name__=="__main__": + main() -- cgit v1.2.3 From c2cdf60d509e380029f6e2566fc4f98eff4b9f1a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 17:07:54 -0700 Subject: wayback timestamps: updates to handle 4-digit case --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 16 ++--- notes/cleanups/wayback_timestamps.md | 103 +++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 11 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 5ffd11cb..6c6817ab 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -117,7 +117,7 @@ def process_file(fe, session) -> dict: assert seg[3] == "web" assert seg[4].isdigit() original_url = "/".join(seg[5:]) - if len(seg[4]) == 12: + if len(seg[4]) == 12 or len(seg[4]) == 4: short_urls.append(u) elif len(seg[4]) == 14: self_urls[original_url] = u @@ -131,14 +131,14 @@ def process_file(fe, session) -> dict: for short in list(set(short_urls)): seg = short.split('/') ts = seg[4] - assert len(ts) == 12 and ts.isdigit() + assert len(ts) in [12,4] and ts.isdigit() original_url = '/'.join(seg[5:]) - if original_url in full_urls: + if short in full_urls: continue if original_url in self_urls: - full_urls[original_url] = self_urls[original_url] + full_urls[short] = self_urls[original_url] status = "success-self" continue @@ -146,7 +146,7 @@ def process_file(fe, session) -> dict: for cdx_row in cdx_row_list: if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" status = "success-db" break else: @@ -154,14 +154,14 @@ def process_file(fe, session) -> dict: pass cdx_row = None - if original_url in full_urls: + if short in full_urls: continue cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" status = "success-api" break else: @@ -169,7 +169,7 @@ def process_file(fe, session) -> dict: else: print(f"no CDX API record found: {original_url}", file=sys.stderr) - if original_url not in full_urls: + if short not in full_urls: return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found") return dict( diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index c70ec5b2..81785992 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -26,14 +26,53 @@ Filter to files with problem of interest: 
Wow, this is a lot more than I thought! +There might also be some other short URL patterns, check for those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{1,11}/' \ + | gzip \ + > files_20211007_veryshortts.json.gz + # skipped, mergine with below + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/None/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:06 [0.00 /s] + # whew, that pattern has been fixed it seems + + zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null + # 2.00 0:10:01 [3.33m/s] + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/\d{13}/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:09 [0.00 /s] + +Yes, 4-digit is a popular pattern as well, need to handle those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211007_moreshortts.json.gz + # 111M 0:13:22 [ 139k/s] + + zcat files_20211007_moreshortts.json.gz | wc -l + + zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json + # 9,958,854 + ## Fetch Complete URL Want to export JSON like: file_entity [existing file entity] - full_urls[] - : + full_urls[]: list of Dicts[str,str] + : status: str Status one of: @@ -41,5 +80,63 @@ Status one of: - 'success-self': the file already has a fixed URL internally - 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched - 'success-cdx': CDX API lookup succeeded, and SHA1 matched -- 'fail-hash': found a CDX record, but wrong hash - 'fail-not-found': no matching CDX record found + +Ran over a sample: + + cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 5 "fail-not-found" + 576 "success-api" + 7212 "success-db" + 2207 "success-self" + + head -n1000 | ./fetch_full_cdx_ts.py > sample_out.json + + zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c + 2 "fail-not-found" + 168 "success-api" + 208 "success-db" + 622 "success-self" + +Investigating the "fail-not-found", they look like http/https URL +not-exact-matches. Going to put off handling these for now because it is a +small fraction and more delicate. + +Again with the broader set: + + cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 9 "fail-not-found" + 781 "success-api" + 6175 "success-db" + 3035 "success-self" + + +## Cleanup Process + +Other possible cleanups to run at the same time, which would not require +external requests or other context: + +- URL has ://archive.org/ link with rel=repository => rel=archive +- mimetype is bogus => clean mimetype +- bogus file => set some new extra field, like scope=stub or scope=partial (?) + +It looks like the rel swap is already implemented in `generic_file_cleanups()`. +From sampling it seems like the mimetype issue is pretty small, so not going to +bite that off now. The "bogus file" issue requires thought, so also skipping. + +## Commands + +Running with 8x parallelism to not break things; expecting some errors along +the way, may need to add handlers for connection errors etc: + + zcat files_20211007_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211007_moreshortts.fetched.json.gz + +At 300 records/sec, this should take around 9-10 hours to process. 
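The 9-10 hour figure follows directly from the record count and the observed rate; as a quick back-of-the-envelope check (a minimal Python sketch, using the ~9.96M record count from the `wc -l` above and assuming the ~300 records/sec holds as the aggregate rate across all 8 workers):

    total_records = 9_958_854   # from: zcat files_20211007_moreshortts.json.gz | wc -l
    records_per_sec = 300       # rough aggregate rate observed with parallel -j8
    hours = total_records / records_per_sec / 3600
    print(f"estimated runtime: {hours:.1f} hours")   # ~9.2 hours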
-- cgit v1.2.3 From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 14:17:31 -0800 Subject: cleanups: tweaks to wayback CDX cleanup scripts --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 9 ++++++++- python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6c6817ab..6f67c7e1 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -157,7 +157,14 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + cdx_record = None + try: + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") + else: + raise if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index 56a5c80e..a9b19921 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; instead it has a __main__ function and is invoked like: - python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json + python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json """ def __init__(self, api: ApiClient, **kwargs): @@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter): if fe_url.url in url_expansions: fix_url = url_expansions[fe_url.url] # defensive checks - assert f"/web/{partial_ts}" in fix_url + if not ( + f"/web/{partial_ts}" in fix_url + and fe_url.url.endswith(original_url) + and fix_url.endswith(original_url) + ): + print( + f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", + file=sys.stderr, + ) + self.counts["skip-bad-replacement"] += 1 + return None assert "://" in fix_url - assert fe_url.url.endswith(original_url) - assert fix_url.endswith(original_url) fe_url.url = fix_url any_fixed = True @@ -305,7 +313,7 @@ def main() -> None: ) parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) parser.set_defaults( - auth_var="FATCAT_API_AUTH_TOKEN", + auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) parser.add_argument( "json_file", -- cgit v1.2.3 From 996b2e2084c1798126bd91dd950c063982398bec Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 15:46:20 -0800 Subject: more iteration on short wayback timestamp cleanup --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 2 +- notes/cleanups/wayback_timestamps.md | 129 ++++++++++++++++++++- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 2 +- 3 files changed, 129 insertions(+), 4 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6f67c7e1..d5b0c476 100644 --- 
a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -137,7 +137,7 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - if original_url in self_urls: + if original_url in self_urls and ts in self_urls[original_url]: full_urls[short] = self_urls[original_url] status = "success-self" continue diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 81785992..85e5f94f 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -61,9 +61,10 @@ Yes, 4-digit is a popular pattern as well, need to handle those: # 111M 0:13:22 [ 139k/s] zcat files_20211007_moreshortts.json.gz | wc -l + # 9,958,854 zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json - # 9,958,854 + ## Fetch Complete URL @@ -114,6 +115,14 @@ Again with the broader set: 6175 "success-db" 3035 "success-self" +While running a larger batch, got a CDX API error: + + requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20 + + org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error + +So maybe need to use credentials after all. + ## Cleanup Process @@ -128,11 +137,13 @@ It looks like the rel swap is already implemented in `generic_file_cleanups()`. From sampling it seems like the mimetype issue is pretty small, so not going to bite that off now. The "bogus file" issue requires thought, so also skipping. -## Commands + +## Commands (old) Running with 8x parallelism to not break things; expecting some errors along the way, may need to add handlers for connection errors etc: + # OLD SNAPSHOT zcat files_20211007_moreshortts.json.gz \ | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ | pv -l \ @@ -140,3 +151,117 @@ the way, may need to add handlers for connection errors etc: > files_20211007_moreshortts.fetched.json.gz At 300 records/sec, this should take around 9-10 hours to process. + + + +## Prep Again (2021-11-09) + +After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot). + +Filter again: + + # note: in the future use pigz instead of gzip here + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211105_moreshortts.json.gz + # 112M 0:13:27 [ 138k/s] + + zcat files_20211105_moreshortts.json.gz | wc -l + # 9,958,854 + # good, exact same number as previous snapshot + + zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json + # done + + cat files_20211105_moreshortts.10k_sample.json \ + | ./fetch_full_cdx_ts.py \ + | pv -l \ + > files_20211105_moreshortts.10k_sample.fetched.json + # 10.0k 0:03:36 [46.3 /s] + + cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c + 13 "fail-not-found" + 774 "success-api" + 6193 "success-db" + 3020 "success-self" + +After tweaking `success-self` logic: + + 13 "fail-not-found" + 859 "success-api" + 6229 "success-db" + 2899 "success-self" + + +## Testing in QA + +Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json` + + # OLD ATTEMPT + export FATCAT_API_AUTH_TOKEN=[...] 
+ head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + +Ran in to issues, iterated above. + +Trying again with updated script and sample file: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + +Manually inspected and these look good. Trying some repeats and larger batched: + + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + + [...] + bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf + bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf + bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf + [...] + + # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0}) + + +It looks like these "bad replacement URLs" are due to timestamp mismatches. Eg, the partial timestamp is not part of the final timestamp. + +Tweaked fetch script and re-ran: + + # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0}) + +Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`. + +Run the rest through: + + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0}) + +Should tweak batch size to 100 (vs. 50). 
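To make the earlier "bad replacement URL" failures concrete, here is a minimal sketch of the prefix check that rejects them, using values copied from one of the logged cases above (illustrative only; the real defensive check lives in `file_short_wayback_ts.py`):

    # partial (12-digit) timestamp from the file entity's short wayback URL
    partial_ts = "201807271139"
    # full URL returned by the fetch script for the same original URL
    fix_url = "https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf"

    # the partial timestamp is not a prefix of the fetched timestamp, so the
    # replacement is rejected and counted as skip-bad-replacement
    print(f"/web/{partial_ts}" in fix_url)   # False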
+ +How to parallelize import: + + # from within pipenv + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts - + + +## Full Batch Commands + +Running in bulk again: + + zcat files_20211105_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211105_moreshortts.fetched.json.gz + diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index a9b19921..e2595912 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -311,7 +311,7 @@ def main() -> None: parser.add_argument( "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" ) - parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int) parser.set_defaults( auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) -- cgit v1.2.3 From cd09c6d6bd4deef0627de4f8a8a301725db01e14 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 22:55:58 -0800 Subject: wayback ts cleanup: one more filter tweak --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index d5b0c476..ebcf0d62 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -115,7 +115,8 @@ def process_file(fe, session) -> dict: seg = u.split('/') assert seg[2] == "web.archive.org" assert seg[3] == "web" - assert seg[4].isdigit() + if not seg[4].isdigit(): + continue original_url = "/".join(seg[5:]) if len(seg[4]) == 12 or len(seg[4]) == 4: short_urls.append(u) -- cgit v1.2.3
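For reference, a minimal sketch of the URL-segment handling that this last filter tweak guards (the example.com URLs are hypothetical; the real logic is in `process_file()` in `fetch_full_cdx_ts.py`). After splitting on `/`, `seg[4]` holds the (possibly short) timestamp, and entries where it is not purely digits are now skipped instead of tripping the old assert:

    for u in [
        "https://web.archive.org/web/2017/https://example.com/paper.pdf",             # 4-digit short ts
        "https://web.archive.org/web/20170819080342/https://example.com/paper.pdf",   # full 14-digit ts
        "https://web.archive.org/web/None/https://example.com/paper.pdf",             # non-digit segment
    ]:
        seg = u.split("/")
        ts = seg[4]
        original_url = "/".join(seg[5:])
        if not ts.isdigit():
            print("skip (non-digit timestamp):", u)
            continue
        print(len(ts), original_url)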