From 1927a7da466164010f0a6467f4df0c887ba00ad3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Nov 2021 14:00:56 -0700
Subject: start work on wayback short-timestamp cleanup

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py | 193 ++++++++++++++++++++++++++++
 notes/cleanups/wayback_timestamps.md        |  45 +++++++
 2 files changed, 238 insertions(+)
 create mode 100644 notes/cleanups/scripts/fetch_full_cdx_ts.py
 create mode 100644 notes/cleanups/wayback_timestamps.md

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
new file mode 100644
index 00000000..5ffd11cb
--- /dev/null
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import base64
+from typing import Optional, List
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
+
+def requests_retry_session(
+    retries: int = 10,
+    backoff_factor: int = 3,
+    status_forcelist: List[int] = [500, 502, 504],
+    session: requests.Session = None,
+) -> requests.Session:
+    """
+    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+    """
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
+def b32_hex(s: str) -> str:
+    """
+    Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+    base32 checksums are used by, eg, heritrix and in wayback CDX files
+    """
+    s = s.strip().split()[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        if len(s) == 40:
+            return s
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+def get_db_cdx(url: str, http_session) -> List[dict]:
+    resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url))
+    resp.raise_for_status()
+    rows = resp.json()
+    return rows or []
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]:
+
+    params = {
+        "url": url,
+        "from": partial_dt,
+        "to": partial_dt,
+        "matchType": "exact",
+        "output": "json",
+        "limit": 20,
+        # can't filter status because might be warc/revisit
+        #"filter": "statuscode:200",
+    }
+    resp = http_session.get(CDX_API_URL, params=params)
+    resp.raise_for_status()
+    rows = resp.json()
+
+    if not rows:
+        return None
+    #print(rows, file=sys.stderr)
+    if len(rows) < 2:
+        return None
+
+    for raw in rows[1:]:
+        record = dict(
+            surt=raw[0],
+            datetime=raw[1],
+            url=raw[2],
+            mimetype=raw[3],
+            status_code=raw[4],
+            sha1b32=raw[5],
+            sha1hex=b32_hex(raw[5]),
+        )
+        if record['url'] != url:
+            # TODO: could allow HTTP/HTTPS fuzzy match
+            print("CDX API near match: URL", file=sys.stderr)
+            continue
+        if not record['datetime'].startswith(partial_dt):
+            print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr)
+            continue
+        if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'):
+            return record
+        else:
+            print(f"CDX API near match: status  {record['status_code']}", file=sys.stderr)
+    return None
+
+def process_file(fe, session) -> dict:
+    short_urls = []
+    self_urls = dict()
+    full_urls = dict()
+    status = "unknown"
+
+    for pair in fe['urls']:
+        u = pair['url']
+        if not '://web.archive.org/web/' in u:
+            continue
+        seg = u.split('/')
+        assert seg[2] == "web.archive.org"
+        assert seg[3] == "web"
+        assert seg[4].isdigit()
+        original_url = "/".join(seg[5:])
+        if len(seg[4]) == 12:
+            short_urls.append(u)
+        elif len(seg[4]) == 14:
+            self_urls[original_url] = u
+        else:
+            print(f"other bogus ts: {seg[4]}", file=sys.stderr)
+            return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts")
+
+    if len(short_urls) == 0:
+        return dict(file_entity=fe, full_urls=[], status="skip-no-shorts")
+
+    for short in list(set(short_urls)):
+        seg = short.split('/')
+        ts = seg[4]
+        assert len(ts) == 12 and ts.isdigit()
+        original_url = '/'.join(seg[5:])
+
+        if original_url in full_urls:
+            continue
+
+        if original_url in self_urls:
+            full_urls[original_url] = self_urls[original_url]
+            status = "success-self"
+            continue
+
+        cdx_row_list = get_db_cdx(original_url, http_session=session)
+        for cdx_row in cdx_row_list:
+            if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
+                assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
+                full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
+                status = "success-db"
+                break
+            else:
+                #print(f"cdx DB found, but no match", file=sys.stderr)
+                pass
+        cdx_row = None
+
+        if original_url in full_urls:
+            continue
+
+        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        if cdx_record:
+            if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
+                assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
+                full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
+                status = "success-api"
+                break
+            else:
+                print(f"cdx API found, but no match", file=sys.stderr)
+        else:
+            print(f"no CDX API record found: {original_url}", file=sys.stderr)
+
+        if original_url not in full_urls:
+            return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
+
+    return dict(
+        file_entity=fe,
+        full_urls=full_urls,
+        status=status,
+    )
+
+def main():
+    session = requests_retry_session()
+    session.headers.update({
+        "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+    })
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        fe = json.loads(line)
+        print(json.dumps(process_file(fe, session=session)))
+
+if __name__=="__main__":
+    main()
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
new file mode 100644
index 00000000..c70ec5b2
--- /dev/null
+++ b/notes/cleanups/wayback_timestamps.md
@@ -0,0 +1,45 @@
+
+At some point, using the arabesque importer (from targetted crawling), we
+accidentially imported a bunch of files with wayback URLs that have 12-digit
+timestamps, instead of the full canonical 14-digit timestamps.
+
+
+## Prep (2021-11-04)
+
+Download most recent file export:
+
+    wget https://archive.org/download/fatcat_bulk_exports_2021-10-07/file_export.json.gz
+
+Filter to files with problem of interest:
+
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{12}/' \
+        | gzip \
+        > files_20211007_shortts.json.gz
+    # 111M 0:12:35
+
+    zcat files_20211007_shortts.json.gz | wc -l
+    # 7,935,009
+
+    zcat files_20211007_shortts.json.gz | shuf -n10000 > files_20211007_shortts.10k_sample.json
+
+Wow, this is a lot more than I thought!
+
+## Fetch Complete URL
+
+Want to export JSON like:
+
+    file_entity
+        [existing file entity]
+    full_urls[]
+        <short>: <long>
+    status: str
+
+Status one of:
+
+- 'success-self': the file already has a fixed URL internally
+- 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched
+- 'success-cdx': CDX API lookup succeeded, and SHA1 matched
+- 'fail-hash': found a CDX record, but wrong hash
+- 'fail-not-found': no matching CDX record found
-- 
cgit v1.2.3


From c2cdf60d509e380029f6e2566fc4f98eff4b9f1a Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Nov 2021 17:07:54 -0700
Subject: wayback timestamps: updates to handle 4-digit case

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py |  16 ++---
 notes/cleanups/wayback_timestamps.md        | 103 +++++++++++++++++++++++++++-
 2 files changed, 108 insertions(+), 11 deletions(-)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 5ffd11cb..6c6817ab 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -117,7 +117,7 @@ def process_file(fe, session) -> dict:
         assert seg[3] == "web"
         assert seg[4].isdigit()
         original_url = "/".join(seg[5:])
-        if len(seg[4]) == 12:
+        if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)
         elif len(seg[4]) == 14:
             self_urls[original_url] = u
@@ -131,14 +131,14 @@ def process_file(fe, session) -> dict:
     for short in list(set(short_urls)):
         seg = short.split('/')
         ts = seg[4]
-        assert len(ts) == 12 and ts.isdigit()
+        assert len(ts) in [12,4] and ts.isdigit()
         original_url = '/'.join(seg[5:])
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         if original_url in self_urls:
-            full_urls[original_url] = self_urls[original_url]
+            full_urls[short] = self_urls[original_url]
             status = "success-self"
             continue
 
@@ -146,7 +146,7 @@ def process_file(fe, session) -> dict:
         for cdx_row in cdx_row_list:
             if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
                 assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
                 status = "success-db"
                 break
             else:
@@ -154,14 +154,14 @@ def process_file(fe, session) -> dict:
                 pass
         cdx_row = None
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
                 status = "success-api"
                 break
             else:
@@ -169,7 +169,7 @@ def process_file(fe, session) -> dict:
         else:
             print(f"no CDX API record found: {original_url}", file=sys.stderr)
 
-        if original_url not in full_urls:
+        if short not in full_urls:
             return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
 
     return dict(
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index c70ec5b2..81785992 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -26,14 +26,53 @@ Filter to files with problem of interest:
 
 Wow, this is a lot more than I thought!
 
+There might also be some other short URL patterns, check for those:
+
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{1,11}/' \
+        | gzip \
+        > files_20211007_veryshortts.json.gz
+    # skipped, mergine with below
+
+    zcat file_export.json.gz \
+        | rg 'web.archive.org/web/None/' \
+        | pv -l \
+        > /dev/null
+    # 0.00  0:10:06 [0.00 /s]
+    # whew, that pattern has been fixed it seems
+
+    zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null
+    # 2.00  0:10:01 [3.33m/s]
+
+    zcat file_export.json.gz \
+        | rg 'web.archive.org/web/\d{13}/' \
+        | pv -l \
+        > /dev/null
+    # 0.00  0:10:09 [0.00 /s]
+
+Yes, 4-digit is a popular pattern as well, need to handle those:
+
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{4,12}/' \
+        | gzip \
+        > files_20211007_moreshortts.json.gz
+    # 111M 0:13:22 [ 139k/s]
+
+    zcat files_20211007_moreshortts.json.gz | wc -l
+
+    zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json
+    # 9,958,854
+
 ## Fetch Complete URL
 
 Want to export JSON like:
 
     file_entity
         [existing file entity]
-    full_urls[]
-        <short>: <long>
+    full_urls[]: list of Dicts[str,str]
+        <short_url>: <full_url>
     status: str
 
 Status one of:
@@ -41,5 +80,63 @@ Status one of:
 - 'success-self': the file already has a fixed URL internally
 - 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched
 - 'success-cdx': CDX API lookup succeeded, and SHA1 matched
-- 'fail-hash': found a CDX record, but wrong hash
 - 'fail-not-found': no matching CDX record found
+
+Ran over a sample:
+
+    cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json
+
+    cat sample_out.json | jq .status | sort | uniq -c
+          5 "fail-not-found"
+        576 "success-api"
+       7212 "success-db"
+       2207 "success-self"
+
+    head -n1000  | ./fetch_full_cdx_ts.py > sample_out.json
+
+    zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c
+          2 "fail-not-found"
+        168 "success-api"
+        208 "success-db"
+        622 "success-self"
+
+Investigating the "fail-not-found", they look like http/https URL
+not-exact-matches. Going to put off handling these for now because it is a
+small fraction and more delicate.
+
+Again with the broader set:
+
+    cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json
+
+    cat sample_out.json | jq .status | sort | uniq -c
+          9 "fail-not-found"
+        781 "success-api"
+       6175 "success-db"
+       3035 "success-self"
+
+
+## Cleanup Process
+
+Other possible cleanups to run at the same time, which would not require
+external requests or other context:
+
+- URL has ://archive.org/ link with rel=repository => rel=archive
+- mimetype is bogus => clean mimetype
+- bogus file => set some new extra field, like scope=stub or scope=partial (?)
+
+It looks like the rel swap is already implemented in `generic_file_cleanups()`.
+From sampling it seems like the mimetype issue is pretty small, so not going to
+bite that off now. The "bogus file" issue requires thought, so also skipping.
+
+## Commands
+
+Running with 8x parallelism to not break things; expecting some errors along
+the way, may need to add handlers for connection errors etc:
+
+    zcat files_20211007_moreshortts.json.gz \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211007_moreshortts.fetched.json.gz
+
+At 300 records/sec, this should take around 9-10 hours to process.
-- 
cgit v1.2.3


From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 14:17:31 -0800
Subject: cleanups: tweaks to wayback CDX cleanup scripts

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py           |  9 ++++++++-
 python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 +++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6c6817ab..6f67c7e1 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -157,7 +157,14 @@ def process_file(fe, session) -> dict:
         if short in full_urls:
             continue
 
-        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        cdx_record = None
+        try:
+            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 403:
+                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
+            else:
+                raise
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 56a5c80e..a9b19921 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
     is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
     instead it has a __main__ function and is invoked like:
 
-        python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json
+        python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json
     """
 
     def __init__(self, api: ApiClient, **kwargs):
@@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
             if fe_url.url in url_expansions:
                 fix_url = url_expansions[fe_url.url]
                 # defensive checks
-                assert f"/web/{partial_ts}" in fix_url
+                if not (
+                    f"/web/{partial_ts}" in fix_url
+                    and fe_url.url.endswith(original_url)
+                    and fix_url.endswith(original_url)
+                ):
+                    print(
+                        f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
+                        file=sys.stderr,
+                    )
+                    self.counts["skip-bad-replacement"] += 1
+                    return None
                 assert "://" in fix_url
-                assert fe_url.url.endswith(original_url)
-                assert fix_url.endswith(original_url)
                 fe_url.url = fix_url
                 any_fixed = True
 
@@ -305,7 +313,7 @@ def main() -> None:
     )
     parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
     parser.set_defaults(
-        auth_var="FATCAT_API_AUTH_TOKEN",
+        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
     )
     parser.add_argument(
         "json_file",
-- 
cgit v1.2.3


From 996b2e2084c1798126bd91dd950c063982398bec Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 15:46:20 -0800
Subject: more iteration on short wayback timestamp cleanup

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py        |   2 +-
 notes/cleanups/wayback_timestamps.md               | 129 ++++++++++++++++++++-
 .../fatcat_tools/cleanups/file_short_wayback_ts.py |   2 +-
 3 files changed, 129 insertions(+), 4 deletions(-)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6f67c7e1..d5b0c476 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -137,7 +137,7 @@ def process_file(fe, session) -> dict:
         if short in full_urls:
             continue
 
-        if original_url in self_urls:
+        if original_url in self_urls and ts in self_urls[original_url]:
             full_urls[short] = self_urls[original_url]
             status = "success-self"
             continue
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index 81785992..85e5f94f 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -61,9 +61,10 @@ Yes, 4-digit is a popular pattern as well, need to handle those:
     # 111M 0:13:22 [ 139k/s]
 
     zcat files_20211007_moreshortts.json.gz | wc -l
+    # 9,958,854
 
     zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json
-    # 9,958,854
+
 
 ## Fetch Complete URL
 
@@ -114,6 +115,14 @@ Again with the broader set:
        6175 "success-db"
        3035 "success-self"
 
+While running a larger batch, got a CDX API error:
+
+    requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20
+
+    org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error
+
+So maybe need to use credentials after all.
+
 
 ## Cleanup Process
 
@@ -128,11 +137,13 @@ It looks like the rel swap is already implemented in `generic_file_cleanups()`.
 From sampling it seems like the mimetype issue is pretty small, so not going to
 bite that off now. The "bogus file" issue requires thought, so also skipping.
 
-## Commands
+
+## Commands (old)
 
 Running with 8x parallelism to not break things; expecting some errors along
 the way, may need to add handlers for connection errors etc:
 
+    # OLD SNAPSHOT
     zcat files_20211007_moreshortts.json.gz \
         | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
         | pv -l \
@@ -140,3 +151,117 @@ the way, may need to add handlers for connection errors etc:
         > files_20211007_moreshortts.fetched.json.gz
 
 At 300 records/sec, this should take around 9-10 hours to process.
+
+
+
+## Prep Again (2021-11-09)
+
+After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot).
+
+Filter again:
+
+    # note: in the future use pigz instead of gzip here
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{4,12}/' \
+        | gzip \
+        > files_20211105_moreshortts.json.gz
+    # 112M 0:13:27 [ 138k/s]
+
+    zcat files_20211105_moreshortts.json.gz | wc -l
+    # 9,958,854
+    # good, exact same number as previous snapshot
+
+    zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json
+    # done
+
+    cat files_20211105_moreshortts.10k_sample.json \
+        | ./fetch_full_cdx_ts.py \
+        | pv -l \
+        > files_20211105_moreshortts.10k_sample.fetched.json
+    # 10.0k 0:03:36 [46.3 /s]
+
+    cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c
+         13 "fail-not-found"
+        774 "success-api"
+       6193 "success-db"
+       3020 "success-self"
+
+After tweaking `success-self` logic:
+
+         13 "fail-not-found"
+        859 "success-api"
+       6229 "success-db"
+       2899 "success-self"
+
+
+## Testing in QA
+
+Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json`
+
+    # OLD ATTEMPT
+    export FATCAT_API_AUTH_TOKEN=[...]
+    head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+Ran in to issues, iterated above.
+
+Trying again with updated script and sample file:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Manually inspected and these look good. Trying some repeats and larger batched:
+
+    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+    head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+    [...]
+    bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf
+    bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf
+    bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf
+    [...]
+
+    # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0})
+
+
+It looks like these "bad replacement URLs" are due to timestamp mismatches. Eg, the partial timestamp is not part of the final timestamp.
+
+Tweaked fetch script and re-ran:
+
+    # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0})
+
+Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`.
+
+Run the rest through:
+
+    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0})
+
+Should tweak batch size to 100 (vs. 50).
+
+How to parallelize import:
+
+    # from within pipenv
+    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+
+## Full Batch Commands
+
+Running in bulk again:
+
+    zcat files_20211105_moreshortts.json.gz \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211105_moreshortts.fetched.json.gz
+
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index a9b19921..e2595912 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -311,7 +311,7 @@ def main() -> None:
     parser.add_argument(
         "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
     )
-    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+    parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int)
     parser.set_defaults(
         auth_var="FATCAT_AUTH_WORKER_CLEANUP",
     )
-- 
cgit v1.2.3


From d9284d421618742f5ecd76ba2c6d92dcefa5e5db Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 16:56:28 -0800
Subject: updates to lowercase DOI cleanup

---
 notes/cleanups/case_sensitive_dois.md              | 71 ++++++++++++++++++++++
 .../fatcat_tools/cleanups/release_lowercase_doi.py | 22 ++++---
 2 files changed, 86 insertions(+), 7 deletions(-)
 create mode 100644 notes/cleanups/case_sensitive_dois.md

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/case_sensitive_dois.md b/notes/cleanups/case_sensitive_dois.md
new file mode 100644
index 00000000..1bf1901e
--- /dev/null
+++ b/notes/cleanups/case_sensitive_dois.md
@@ -0,0 +1,71 @@
+
+Relevant github issue: https://github.com/internetarchive/fatcat/issues/83
+
+How many existing fatcat releases have a non-lowercase DOI? As of June 2021:
+
+    zcat release_extid.tsv.gz | cut -f3 | rg '[A-Z]' | pv -l | wc -l
+    139964
+
+## Prep
+
+    wget https://archive.org/download/fatcat_bulk_exports_2021-11-05/release_extid.tsv.gz
+
+    # scratch:bin/fcid.py is roughly the same as `fatcat_util.py uuid2fcid`
+
+    zcat release_extid.tsv.gz \
+        | cut -f1,3 \
+        | rg '[A-Z]' \
+        | /fast/scratch/bin/fcid.py \
+        | pv -l \
+        > nonlowercase_doi_releases.tsv
+    # 140k 0:03:54 [ 599 /s]
+
+    wc -l nonlowercase_doi_releases.tsv
+    140530 nonlowercase_doi_releases.tsv
+
+Uhoh, there are ~500 more than previously? Guess those are from after the fix?
+
+Create a sample for testing:
+
+    shuf -n10000 nonlowercase_doi_releases.tsv \
+        > nonlowercase_doi_releases.10k_sample.tsv
+
+## Test in QA
+
+In pipenv:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+    head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+        | python -m fatcat_tools.cleanups.release_lowercase_doi -
+    # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+    head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+        | python -m fatcat_tools.cleanups.release_lowercase_doi -
+    # Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+    head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+        | python -m fatcat_tools.cleanups.release_lowercase_doi -
+    # no such release_ident found: dcjsybvqanffhmu4dhzdnptave
+
+Presumably because this is being run in QA, and there are some newer prod releases in the snapshot.
+
+Did a quick update, and then:
+
+    head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+        | python -m fatcat_tools.cleanups.release_lowercase_doi -
+    # Counter({'total': 2000, 'skip-existing-doi-fine': 1100, 'update': 898, 'skip-existing-not-found': 2, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Did some spot checking in QA. Out of 20 DOIs checked, 15 were valid, 5 were not
+valid (doi.org 404). It seems like roughly 1/3 have a dupe DOI (the lower-case
+DOI exists); didn't count exact numbers.
+
+This cleanup is simple and looks good to go. Batch size of 50 is good for full
+releases.
+
+Example of parallelization:
+
+    cat /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi -
+
+Ready to go!
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py
index 4e5a6f43..5e3275db 100644
--- a/python/fatcat_tools/cleanups/release_lowercase_doi.py
+++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py
@@ -2,7 +2,7 @@ import argparse
 import os
 import sys
 
-from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds
+from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds
 
 from fatcat_tools import authenticated_api, public_api
 from fatcat_tools.importers.common import EntityImporter, LinePusher
@@ -45,7 +45,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
         self.testing_mode = False
 
     def want(self, row: str) -> bool:
-        row = row.strip().split()[0]
+        row = row.strip()
+        if not row:
+            return False
+        row = row.split()[0]
         if len(row) == 26:
             return True
         else:
@@ -66,15 +69,20 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
 
     def try_update(self, re: ReleaseEntity) -> bool:
 
-        # should always be existing
-        existing = self.api.get_release(re.ident)
+        # should always be existing, but sometimes not because of prod/QA flip
+        existing = None
+        try:
+            existing = self.api.get_release(re.ident)
+        except ApiException as err:
+            if err.status != 404:
+                raise err
 
         if not existing:
             self.counts["skip-existing-not-found"] += 1
             return False
 
-        if existing.status != "active":
-            self.counts["skip-existing-entity-status"] += 1
+        if existing.state != "active":
+            self.counts["skip-existing-entity-state"] += 1
             return False
 
         if not existing.ext_ids.doi:
@@ -91,7 +99,7 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
         # these corrections (entity dump) contains no dupes
 
         if not self.testing_mode:
-            self.api.update_release(self.get_editgroup_id(), re.ident, re)
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
         self.counts["update"] += 1
         return False
 
-- 
cgit v1.2.3


From fd71860047f84305a50772aa2e7494fd61cfff30 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 18:48:37 -0800
Subject: initial file/release bugfix cleanup worker and notes

---
 notes/cleanups/file_release_ingest_bug.md          | 144 +++++++++++++
 .../fatcat_tools/cleanups/file_release_bugfix.py   | 231 +++++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 notes/cleanups/file_release_ingest_bug.md
 create mode 100644 python/fatcat_tools/cleanups/file_release_bugfix.py

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md
new file mode 100644
index 00000000..8690157a
--- /dev/null
+++ b/notes/cleanups/file_release_ingest_bug.md
@@ -0,0 +1,144 @@
+
+We want to find cases where the `file_edit` metadata for a file entity does not match the DOI of the release it is linked to.
+
+## Background
+
+
+Seems to mostly happen when the `link_source_id` DOI is not found during a
+fatcat 'lookup', eg due to upstream DOI metadata not including a 'title' field
+(required by fatcat schema). Many of these seem to be 'journal-issue' or
+something like that. Presumably upstream (unpaywall) crawled and found some PDF
+for the DOI and indexes that.
+
+TODO: what was the specific code bug that caused this? time window of edits?
+https://github.com/internetarchive/fatcat/commit/d8cf02dbdc6848be4de2710e54f4463d702b6e7c
+
+TODO: how many empty-title Crossref DOIs are there? perhaps we should import these after all, in some situations? eg, pull `html_biblio`?
+TODO: examples of other mismatches? edits from the same timespan with mismatching identifiers
+
+## Queries
+
+    file_edit
+        ident_id
+        rev_id
+        extra_json->>'link_source_id'::text
+    file_rev_release
+        file_rev
+        target_release_ident_id
+    release_ident
+        id
+        rev_id
+    release_rev
+        id
+        doi
+
+Query some samples:
+
+    SELECT file_edit.ident_id as file_ident, release_ident.id as release_ident, file_edit.extra_json->>'link_source_id' as file_edit_doi, release_rev.doi as release_doi
+    FROM file_edit
+        LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev
+        LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id
+        LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id
+    WHERE
+        file_edit.extra_json->>'ingest_request_source' = 'unpaywall'
+        AND release_rev.doi != file_edit.extra_json->>'link_source_id'
+    LIMIT 20;
+
+                  file_ident              |            release_ident             |             file_edit_doi             |             release_doi             
+    --------------------------------------+--------------------------------------+---------------------------------------+-------------------------------------
+     8fe6eb2a-da3a-4e1c-b281-85fad3cae601 | 0a0cbabd-3f04-4296-94d5-b61c27fddee4 | 10.7167/2013/520930/dataset/3         | 10.7167/2013/520930
+     cbb2d4e7-a9a7-41df-be1d-faf9dd552ee1 | d7b356c7-de6b-4885-b37d-ea603849a30a | 10.1525/elementa.124.t1               | 10.1525/elementa.124
+     26242648-2a7b-42ea-b2ca-e4fba8f0a2c7 | d2647e07-fbb3-47f5-b06a-de2872bf84ec | 10.1023/a:1000301702170               | 10.1007/bfb0085374
+     f774854a-3939-4aa9-bf73-7e0573908b88 | e19ce792-422d-4feb-92ba-dd8536a60618 | 10.32835/2223-5752.2018.17.14-21      | 10.31652/2412-1142-2018-50-151-156
+     615ddf8c-2d91-4c1c-b04a-76b69f07b300 | e7af8377-9542-4711-9444-872f45521dc1 | 10.5644/sjm.10.1.00                   | 10.5644/sjm.10.1.01
+     43fa1b53-eddb-4f31-96af-bc9bd3bb31b6 | e1ec5d8a-ff88-49f4-8540-73ee4906e119 | 10.1136/bjo.81.11.934                 | 10.1038/sj.bjc.6690790
+     64b04cbc-ab3d-4cef-aff5-9d6b2532c47a | 43c387d1-e4c9-49f3-9995-552451a9a246 | 10.1023/a:1002015330987               | 10.1006/jnth.1998.2271
+     31e6e1a8-8457-4c93-82d3-12a51c0dc1bb | a2ec4305-63d2-4164-8470-02bf5c4ba74c | 10.1186/1742-4690-2-s1-s84            | 10.1038/nm0703-847
+     3461024a-1800-44a0-a7be-73f14bac99dd | 6849d39f-6bae-460b-8a9d-d6b86fbac2d3 | 10.17265/2328-2150/2014.11            | 10.1590/s0074-02762002000900009
+     c2707e43-6a4b-4f5d-8fb4-babe16b42b4d | 82127faf-d125-4cd4-88b9-9a8618392019 | 10.1055/s-005-28993                   | 10.1055/s-0034-1380711
+     0fe72304-1884-4dde-98c6-7cf3aec5bf31 | e3fe46cd-0205-42e4-8255-490e0eba38ea | 10.1787/5k4c1s0z2gs1-en               | 10.1787/9789282105931-2-en
+     686a1508-b6c5-4060-9035-1dd8a4b85be1 | dcecd03d-e0f6-40ce-a376-7e64bd90bdca | 10.15406/jlprr.2.3                    | 10.15406/jlprr.2015.02.00039
+     9403bc0e-2a50-4c8a-ab79-943bce20e583 | 3073dee2-c8f7-42ed-a727-12306d86fa35 | 10.1787/5jrtgzfl6g9w-en               | 10.1111/1468-2443.00004
+     9a080a6f-aaaf-4f4b-acab-3eda2bfacc22 | af28a167-2ddc-43db-98ca-1cc29b863f9f | 10.13040/ijpsr.0975-8232.4(6).2094-05 | 10.1002/chin.201418290
+     810f1a00-623f-4a5c-88e2-7240232ae8f9 | 639d4ddd-ef24-49dc-8d4e-ea626b0b85f8 | 10.13040/ijpsr.0975-8232.5(6).2216-24 | 10.1006/phrs.1997.0184
+                                                                                   10.13040/IJPSR.0975-8232.5(6).2216-24
+     72752c59-9532-467d-8b4f-fe4d994bcff5 | cc3db36c-27e9-45bf-96b0-56d1119b93d6 | 10.22201/facmed.2007865xe.2018.26     | 10.22201/facmed.2007865x.2018.26.01
+     a0740fc2-a1db-4bc8-ac98-177ccebde24f | 0a7136ac-86ad-4636-9c2a-b56a6d5a3a27 | 10.24966/cmph-1978/vol6iss1           | 10.1093/milmed/146.4.283
+     5b05df6a-7a0e-411b-a4e1-0081d7522673 | dc0cb585-c709-4990-a82e-c7c9b254fc74 | 10.15406/jig.3.1                      | 10.15406/jig.2016.03.00039
+     011964dd-24a6-4d25-b9dd-921077f1947e | deabab6d-7381-4567-bf56-5294ab081e20 | 10.17265/1537-1506/2013.01            | 10.1109/icsssm.2016.7538527
+     4d7efa59-30c7-4015-bea9-d7df171a97ed | a05449a0-34d7-4f45-9b92-bcf71db4918d | 10.14413/herj.2017.01.09              | 10.14413/herj.2017.01.09.
+    (20 rows)
+
+Total counts, broken DOIs, by source:
+
+    SELECT file_edit.extra_json->>'ingest_request_source' as source, COUNT(*) as broken_files
+    FROM file_edit
+        LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev
+        LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id
+        LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id
+    WHERE
+        file_edit.extra_json->>'link_source_id' IS NOT NULL
+        AND file_edit.extra_json->>'link_source_id' LIKE '10.%'
+        AND release_rev.doi != file_edit.extra_json->>'link_source_id'
+    GROUP BY file_edit.extra_json->>'ingest_request_source';
+
+
+          source      | broken_files
+    ------------------+--------------
+     fatcat-changelog |          872
+     unpaywall        |       227954
+    (2 rows)
+
+## Cleanup Pseudocode
+
+Dump file idents from above SQL queries. Transform from UUID to fatcat ident.
+
+For each file ident, fetch file entity, with releases expanded. Also fetch edit
+history. Filter by:
+
+- single release for file
+- single edit in history, matching broken agent
+- edit metadata indicates a DOI, which doesn't match release DOI (or release doesn't have DOI?)
+
+If all that is the case, do a lookup by the correct DOI. If there is a match,
+update the file entity to point to that release (and only that release).
+Operate in batches.
+
+Write cleanup bot, run in QA to test. Run SQL query again and iterate if some
+files were not updated. Merge to master, run full dump and update in prod.
+
+## Export Matches Needing Fix
+
+    COPY (
+        SELECT row_to_json(row.*) FROM (
+            SELECT file_edit.ident_id as file_ident, release_ident.id as wrong_release_ident, file_edit.extra_json as edit_extra
+            FROM file_edit
+                LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev
+                LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id
+                LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id
+            WHERE
+                file_edit.extra_json->>'link_source_id' IS NOT NULL
+                AND file_edit.extra_json->>'link_source_id' LIKE '10.%'
+                AND release_rev.doi != file_edit.extra_json->>'link_source_id'
+            -- LIMIT 20;
+        ) as row
+    ) TO '/srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json';
+    # COPY 228,827
+
+Then need to do a `sed` pass, and a `rg` pass, to filter out bad escapes:
+
+    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json \
+        | sed 's/\\"/\"/g' \
+        | rg -v "\\\\" \
+        | rg '^\{' \
+        | pv -l \
+        > /srv/fatcat/snapshots/file_release_bugfix_20211105.json
+    # 228k 0:00:00 [ 667k/s]
+
+    wc -l /srv/fatcat/snapshots/file_release_bugfix_20211105.json
+    # 228,826
+
+And create a sample file:
+
+    shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json
+
diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py
new file mode 100644
index 00000000..025c1370
--- /dev/null
+++ b/python/fatcat_tools/cleanups/file_release_bugfix.py
@@ -0,0 +1,231 @@
+import argparse
+import os
+import sys
+from typing import Any, Dict
+
+import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
+
+from fatcat_tools import authenticated_api, public_api, uuid2fcid
+from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
+from fatcat_tools.normal import clean_doi
+
+
+class FileReleaseBugfix(EntityImporter):
+    """
+    This is a one-off / one-time cleanup script for file entities which got
+    imported with incorrect release ident mappings, due to a bug in the file
+    ingest importer.
+
+    While this calls itself a cleanup, it is based on the import code path. It
+    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+    instead it has a __main__ function and is invoked like:
+
+        python -m fatcat_tools.cleans.file_release_bugfix - < blah.json
+    """
+
+    def __init__(self, api: ApiClient, **kwargs):
+
+        eg_desc = (
+            kwargs.pop("editgroup_description", None)
+            or "Correct bad file/release import mappings"
+        )
+        eg_extra = kwargs.pop("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix")
+        super().__init__(
+            api,
+            do_updates=True,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs,
+        )
+        self.testing_mode = False
+
+    def want(self, row: Dict[str, Any]) -> bool:
+        if not (
+            row.get("edit_extra")
+            and row["edit_extra"].get("link_source")
+            and row["edit_extra"].get("link_source_id")
+        ):
+            self.counts["skip-partial"] += 1
+            return False
+        if row["edit_extra"]["link_source"] not in ["unpaywall", "fatcat-changelog"]:
+            self.counts["skip-link-source"] += 1
+            return False
+        if not row["edit_extra"]["link_source_id"].startswith("10."):
+            self.counts["skip-source-id-not-doi"] += 1
+            return False
+        return True
+
+    def parse_record(self, row: Dict[str, Any]) -> FileEntity:
+
+        # bezerk mode doesn't make sense for this importer
+        assert self.bezerk_mode is False
+
+        file_ident = uuid2fcid(row["file_ident"])
+        wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
+        edit_extra = row["edit_extra"]
+        assert edit_extra["link_source"] in ["unpaywall", "fatcat-changelog"]
+        file_edit_doi = clean_doi(edit_extra["link_source_id"])
+
+        if not file_edit_doi:
+            self.counts["skip-bad-doi"] += 1
+            return False
+
+        # check that the "wrong" release exists and doesn't have the DOI
+        wrong_release = None
+        try:
+            wrong_release = self.api.get_release(wrong_release_ident)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not wrong_release:
+            self.counts["skip-wrong-release-missing"] += 1
+            return None
+
+        if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
+            self.counts["skip-wrong-release-is-ok"] += 1
+            return None
+
+        # fetch the "correct" release, if any
+        fixed_release_ids = []
+        correct_release = None
+        try:
+            correct_release = self.api.lookup_release(doi=file_edit_doi)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if correct_release:
+            fixed_release_ids.append(correct_release.ident)
+
+        fe = FileEntity(
+            ident=file_ident,
+            release_ids=fixed_release_ids,
+            edit_extra=edit_extra,
+        )
+        fe._wrong_release_ident = wrong_release_ident
+        return fe
+
+    def try_update(self, fe: FileEntity) -> bool:
+
+        wrong_release_ident = fe._wrong_release_ident
+        assert len(wrong_release_ident) == 26
+
+        # should always be existing... but in QA it might not be
+        existing = None
+        try:
+            existing = self.api.get_file(fe.ident)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            self.counts["skip-existing-not-found"] += 1
+            return False
+
+        if wrong_release_ident not in existing.release_ids:
+            self.counts["skip-existing-fixed"] += 1
+            return False
+
+        # fetch existing history to verify mismatch
+        history = self.api.get_file_history(existing.ident)
+
+        if len(history) != 1:
+            self.counts["skip-existing-history-updated"] += 1
+            return False
+
+        bad_edit = history[0].edit
+        if bad_edit.extra != fe.edit_extra:
+            self.counts["skip-existing-edit-history-extra-mismatch"] += 1
+            return False
+
+        bad_editgroup = history[0].editgroup
+
+        if not bad_editgroup.extra:
+            self.counts["skip-existing-editgroup-missing-extra"] += 1
+            return False
+
+        if (
+            bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa"
+            or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter"
+            or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3")
+            or bad_editgroup.created.year != 2020
+        ):
+            self.counts["skip-existing-edit-history-mismatch"] += 1
+            return False
+
+        existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident]
+
+        if len(fe.release_ids) == 1:
+            if fe.release_ids[0] not in existing.release_ids:
+                existing.release_ids.append(fe.release_ids[0])
+
+        existing.edit_extra = fe.edit_extra
+
+        # not doing a check for "in current editgroup", because the source of
+        # these corrections (entity dump) contains no dupes
+
+        if not self.testing_mode:
+            self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        self.counts["update"] += 1
+        return False
+
+
+def test_file_release_bugfix() -> None:
+    api = public_api("http://localhost:9411/v0")
+    frbc = FileReleaseBugfix(api=api)
+    frbc.testing_mode = True
+
+    assert frbc.want({"this": "asdf"}) is False
+
+    example_line: Dict[str, Any] = {
+        "file_ident": "00000000-0000-0000-3333-000000000002",
+        "wrong_release_ident": "00000000-0000-0000-4444-000000000002",
+        "edit_extra": {
+            "link_source": "unpaywall",
+            "link_source_id": "10.1371/journal.pmed.0020124",
+            "ingest_request_source": "unpaywall",
+        },
+    }
+
+    fe1 = frbc.parse_record(example_line)
+    print(frbc.counts)
+    frbc.try_update(fe1)
+
+    # NOTE: this test is pretty incompleted
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+    )
+    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+    parser.set_defaults(
+        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
+    )
+    parser.add_argument(
+        "json_file",
+        help="File with jsonlines with cleanup context",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
+
+    args = parser.parse_args()
+    api = authenticated_api(
+        args.host_url,
+        # token is an optional kwarg (can be empty string, None, etc)
+        token=os.environ.get(args.auth_var),
+    )
+
+    frbc = FileReleaseBugfix(
+        api,
+        edit_batch_size=args.batch_size,
+    )
+    JsonLinePusher(frbc, args.json_file).run()
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From ca3aedcefd5c3835b6fa9b685200ef12d635d22c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 21:12:54 -0800
Subject: update cleanups notes

---
 notes/cleanups/file_release_ingest_bug.md | 48 +++++++++++++++++++++++++++++++
 notes/cleanups/wayback_timestamps.md      | 24 ++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md
index 8690157a..d818905b 100644
--- a/notes/cleanups/file_release_ingest_bug.md
+++ b/notes/cleanups/file_release_ingest_bug.md
@@ -142,3 +142,51 @@ And create a sample file:
 
     shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json
 
+
+## Testing in QA
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+    head -n10 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0})
+
+    file_wmjzuybuorfrjdgnr2d32vg5va: was wrong, now correct, no other files
+    file_wgszcwehlffnzmoypr4l2yhvza: was correct, now correct, no other files. multiple articles in single PDF
+    file_p4r5sbminzgrhn4yaiqyr7ahwi: was correct, now wrong (!!!)
+        doi:10.1055/s-0036-1579844
+        PDF says: 10.4103/0028-3886.158210
+        unpaywall still has this incorrect linkage
+    file_n5jtvrnodbfdbccl5d6hshhvw4: now stub, was wrong
+        doi:10.19080/bboaj.2018.04.555640 release_suvtcm7hdbbr3fczcxxt4thgoi seems correct?
+        doi:10.19080/bboaj.4.4 is the bad DOI, not in fatcat. is registered, as an entire issue
+    file_kt5fv4d5lrbk7j3dxxgcm7hph4: was wrong, now correct
+    file_jq4juugnynhsdkh3whkcjod46q: was wrong, now correct
+    file_ef7w6y7k4jhjhgwmfai37yjjmm: was wrong, now correct
+    file_ca2svhd6knfnff4dktuonp45du: was correct, now correct. complicated, multiple DOIs/copies/files of same work
+    file_borst2aewvcwzhucnyth2vf3lm: was correct, now correct. complicated, multiple DOIs, single file of same work
+
+Overall, seems like this might not be as much of an obvious improvement as
+hoped! But still progress and more correct.
+
+    head -n1000 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 1000, 'update': 929, 'skip-existing-history-updated': 58, 'skip-existing-fixed': 10, 'skip': 3, 'skip-link-source': 3, 'insert': 0, 'exists': 0})
+
+Looking at `skip-link-source`, it is cases where `link_source` is 'doi' not
+'fatcat-changelog'. Will update filter behavior, 'fatcat-changelog' is a
+`ingest_request_source`.
+
+Checking another 10 examples. They all seem to end up as correct matches.
+
+Did a small update and running the whole batch:
+
+    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 10000, 'update': 8499, 'skip-existing-fixed': 939, 'skip-existing-history-updated': 560, 'skip': 2, 'skip-wrong-release-is-ok': 2, 'insert': 0, 'exists': 0})
+
+I think this is ready to go! Example with parallel:
+
+    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix -
+
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index 85e5f94f..38bc3f7d 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -265,3 +265,27 @@ Running in bulk again:
         | gzip \
         > files_20211105_moreshortts.fetched.json.gz
 
+Ran in to one: `requests.exceptions.HTTPError: 503 Server Error: Service
+Temporarily Unavailable for url: [...]`. Will try again, if there are more
+failures may need to split up in smaller chunks.
+
+Unexpected:
+
+    Traceback (most recent call last):
+      File "./fetch_full_cdx_ts.py", line 200, in <module>
+        main()
+      File "./fetch_full_cdx_ts.py", line 197, in main
+        print(json.dumps(process_file(fe, session=session)))
+      File "./fetch_full_cdx_ts.py", line 118, in process_file
+        assert seg[4].isdigit()
+    AssertionError
+    3.96M 3:04:46 [ 357 /s]
+
+Ugh.
+
+    zcat files_20211105_moreshortts.json.gz \
+        | tac \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211105_moreshortts.fetched.json.gz
-- 
cgit v1.2.3


From cd09c6d6bd4deef0627de4f8a8a301725db01e14 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 22:55:58 -0800
Subject: wayback ts cleanup: one more filter tweak

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index d5b0c476..ebcf0d62 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -115,7 +115,8 @@ def process_file(fe, session) -> dict:
         seg = u.split('/')
         assert seg[2] == "web.archive.org"
         assert seg[3] == "web"
-        assert seg[4].isdigit()
+        if not seg[4].isdigit():
+            continue
         original_url = "/".join(seg[5:])
         if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)
-- 
cgit v1.2.3