From 94ddfd7167994b4c0f7940317655d152aba302e6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Nov 2021 17:08:36 -0700
Subject: short wayback ts: initial cleanup script implementation

---
 .../fatcat_tools/cleanups/file_short_wayback_ts.py | 251 +++++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 python/fatcat_tools/cleanups/file_short_wayback_ts.py

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
new file mode 100644
index 00000000..2d893dbf
--- /dev/null
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -0,0 +1,251 @@
+import argparse
+import copy
+import os
+import sys
+from typing import Any, Dict
+
+import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
+
+from fatcat_tools import authenticated_api, entity_from_dict, public_api
+from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
+
+
+class FileShortWaybackTimestampCleanup(EntityImporter):
+    """
+    This is a one-off / one-time cleanup script for file entities, fixing short
+    timestamps in wayback URLs. These timestamps are supposed to have 14 digits
+    (datetime with year, hour, seconds, etc). Some legacy file imports ended up
+    with only 4 or 12 digits.
+
+    While this calls itself a cleanup, it is based on the import code path. It
+    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+    instead it has a __main__ function and is invoked like:
+
+        python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json
+    """
+
+    def __init__(self, api: ApiClient, **kwargs):
+
+        eg_desc = kwargs.pop("editgroup_description", None) or "Expand short wayback timestamps"
+        eg_extra = kwargs.pop("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get(
+            "agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
+        )
+        super().__init__(
+            api,
+            do_updates=True,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs
+        )
+        self.testing_mode = False
+
+    def want(self, row: Dict[str, Any]) -> bool:
+        if row["status"].startswith("success"):
+            return True
+        else:
+            self.counts["skip-status"] += 1
+            return False
+
+    def parse_record(self, row: Dict[str, Any]) -> FileEntity:
+
+        # bezerk mode doesn't make sense for this importer
+        assert self.bezerk_mode is False
+
+        fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity)
+        status: str = row["status"]
+        assert status.startswith("success")
+        url_expansions: Dict[str, str] = row["full_urls"]
+        assert len(url_expansions) >= 1
+
+        # actual cleanup happens here
+        any_fixed = False
+        for fe_url in fe.urls:
+            if "://web.archive.org/web/" not in fe_url.url:
+                continue
+            seq = fe_url.url.split("/")
+            partial_ts = seq[4]
+            original_url = "/".join(seq[5:])
+            if seq[2] != "web.archive.org":
+                continue
+            if len(partial_ts) not in [4, 12]:
+                continue
+            if fe_url.url in url_expansions:
+                fix_url = url_expansions[fe_url.url]
+                # defensive checks
+                assert partial_ts in fix_url
+                assert "://" in fix_url
+                assert fe_url.url.endswith(original_url)
+                assert fix_url.endswith(original_url)
+                fe_url.url = fix_url
+                any_fixed = True
+
+        if not any_fixed:
+            self.counts["skip-no-fixes"] += 1
+            return None
+
+        # do any other generic file entity cleanups
+        # this includes removing duplicates
+        fe = self.generic_file_cleanups(fe)
+
+        # verify that there are no exact duplicates
+        final_urls = [u.url for u in fe.urls]
+        assert len(final_urls) == len(list(set(final_urls)))
+
+        return fe
+
+    def try_update(self, fe: FileEntity) -> bool:
+
+        # should always be existing
+        try:
+            existing = self.api.get_file(fe.ident)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            self.counts["skip-existing-not-found"] += 1
+            return False
+
+        if existing.sha1 != fe.sha1:
+            self.counts["skip-existing-mismatch"] += 1
+            return False
+
+        assert fe.revision and existing.revision
+        if existing.revision != fe.revision:
+            self.counts["skip-revision-changed"] += 1
+            return False
+
+        # verify that at least one URL remains
+        if not fe.urls or len(fe.urls) < 1:
+            self.counts["skip-no-urls"] += 1
+            return False
+
+        # verify that all wayback urls have 14-digit timestamps, and are generally well-formed
+        for u in fe.urls:
+            if "://web.archive.org/web/" not in u.url:
+                continue
+            if u.rel != "webarchive":
+                self.counts["skip-bad-wayback-rel"] += 1
+                return False
+            seg = u.url.split("/")
+            if (
+                len(seg) < 6
+                or seg[0] != "https:"
+                or seg[2] != "web.archive.org"
+                or seg[3] != "web"
+            ):
+                self.counts["skip-bad-wayback"] += 1
+                return False
+            if len(seg[4]) != 14 or not seg[4].isdigit():
+                self.counts["skip-bad-wayback-timestamp"] += 1
+                return False
+
+        if existing == fe or existing.urls == fe.urls:
+            self.counts["skip-no-change"] += 1
+            return False
+
+        # not doing a check for "in current editgroup", because the source of
+        # these corrections (entity dump) contains no dupes
+
+        if not self.testing_mode:
+            self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        self.counts["update"] += 1
+        return False
+
+
+def test_short_wayback_ts() -> None:
+    api = public_api("http://localhost:9411/v0")
+    fswtc = FileShortWaybackTimestampCleanup(api=api)
+    fswtc.testing_mode = True
+
+    assert fswtc.want({"status": "fail"}) is False
+    assert fswtc.want({"status": "success-self"}) is True
+
+    example_line: Dict[str, Any] = {
+        "status": "success-db",
+        "file_entity": {
+            # note: doesn't match actual entity
+            "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"],
+            "mimetype": "application/pdf",
+            "urls": [
+                {
+                    "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
+                    "rel": "web",
+                },
+                {
+                    "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
+                    "rel": "webarchive",
+                },
+            ],
+            "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29",
+            "sha1": "be714299b9be21b5afdaa7affd7d710c58269433",
+            "md5": "9edb542be5b3446a1905e61a8a3abebd",
+            "size": 666242,
+            "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd",
+            "ident": "4ghpvs2t2rdtrdum2mkreh62me",
+            "state": "active",
+        },
+        "full_urls": {
+            "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
+        },
+    }
+    example_fe = entity_from_dict(example_line["file_entity"], FileEntity)
+
+    fe1 = copy.copy(example_fe)
+    fe1.urls[
+        1
+    ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
+    assert fswtc.parse_record(example_line) == fe1
+
+    # update code path; requires a known file ident and API running locally
+    assert fswtc.counts["update"] == 0
+    dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai")
+    fe1.ident = dummy_fe.ident
+
+    assert fswtc.try_update(fe1) is False
+    assert fswtc.counts["skip-existing-mismatch"] == 1
+
+    fe1.sha1 = dummy_fe.sha1
+    assert fswtc.try_update(fe1) is False
+    assert fswtc.counts["skip-revision-changed"] == 1
+
+    fe1.revision = dummy_fe.revision
+    assert fswtc.try_update(fe1) is False
+    print(fswtc.counts)
+    assert fswtc.counts["update"] == 1
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+    )
+    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+    parser.set_defaults(
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    parser.add_argument(
+        "json_file",
+        help="File with jsonlines from file_meta schema to import from",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
+
+    args = parser.parse_args()
+    api = authenticated_api(
+        args.host_url,
+        # token is an optional kwarg (can be empty string, None, etc)
+        token=os.environ.get(args.auth_var),
+    )
+
+    fswtc = FileShortWaybackTimestampCleanup(
+        api,
+        edit_batch_size=args.batch_size,
+    )
+    JsonLinePusher(fswtc, args.json_file).run()
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 1ad08edf2d3d06196119ec1eddc932e6423e3e7c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Nov 2021 17:56:16 -0700
Subject: wayback short ts: add regression test for dupe URLs

---
 .../fatcat_tools/cleanups/file_short_wayback_ts.py | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 2d893dbf..ab1b2a5f 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -216,6 +216,50 @@ def test_short_wayback_ts() -> None:
     print(fswtc.counts)
     assert fswtc.counts["update"] == 1
 
+    # another example, which failed with an assertion in prod due to duplicated URLs
+    example_line2: Dict[str, Any] = {
+        "file_entity": {
+            "release_ids": ["22jt7euq4fafhblzullmnesso4"],
+            "mimetype": "application/pdf",
+            "urls": [
+                {
+                    "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+                    "rel": "repository",
+                },
+                {
+                    "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+                    "rel": "webarchive",
+                },
+                {
+                    "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+                    "rel": "webarchive",
+                },
+            ],
+            "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19",
+            "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433",
+            "md5": "3d509743359649e34a27ae70c5cd3018",
+            "size": 430665,
+            "extra": {
+                "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"}
+            },
+            "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48",
+            "ident": "duymhmxk3fgtzk37yp2pvthtxq",
+            "state": "active",
+        },
+        "full_urls": {
+            "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+        },
+        "status": "success-self",
+    }
+
+    fe2 = fswtc.parse_record(example_line2)
+    assert len(fe2.urls) == 2
+    assert fe2.urls[0].rel == "repository"
+    assert (
+        fe2.urls[1].url
+        == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+    )
+
 
 def main() -> None:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-- 
cgit v1.2.3


From 766bc9c6ebe22a29c39c91941f12eed29b85a41d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Nov 2021 18:01:49 -0700
Subject: wayback cleanup: actually update entity

---
 python/fatcat_tools/cleanups/file_short_wayback_ts.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index ab1b2a5f..d8b8ab9d 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -27,7 +27,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
 
     def __init__(self, api: ApiClient, **kwargs):
 
-        eg_desc = kwargs.pop("editgroup_description", None) or "Expand short wayback timestamps"
+        eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs"
         eg_extra = kwargs.pop("editgroup_extra", dict())
         eg_extra["agent"] = eg_extra.get(
             "agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
@@ -150,7 +150,9 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
         # these corrections (entity dump) contains no dupes
 
         if not self.testing_mode:
-            self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+            # note: passing 'fe' instead of 'existing' here, which is not
+            # usually how it goes!
+            self.api.update_file(self.get_editgroup_id(), fe.ident, fe)
         self.counts["update"] += 1
         return False
 
-- 
cgit v1.2.3


From 2712b56580c7d5e2e07fdccd990e1918c43880f9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Fri, 5 Nov 2021 11:03:06 -0700
Subject: wayback short ts: another regression test, and some small fmt/tweaks

---
 .../fatcat_tools/cleanups/file_short_wayback_ts.py | 41 ++++++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index d8b8ab9d..56a5c80e 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -27,7 +27,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
 
     def __init__(self, api: ApiClient, **kwargs):
 
-        eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs"
+        eg_desc = (
+            kwargs.pop("editgroup_description", None)
+            or "Expand trunacted timestamps in wayback URLs"
+        )
         eg_extra = kwargs.pop("editgroup_extra", dict())
         eg_extra["agent"] = eg_extra.get(
             "agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
@@ -37,7 +40,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
             do_updates=True,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
-            **kwargs
+            **kwargs,
         )
         self.testing_mode = False
 
@@ -74,7 +77,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
             if fe_url.url in url_expansions:
                 fix_url = url_expansions[fe_url.url]
                 # defensive checks
-                assert partial_ts in fix_url
+                assert f"/web/{partial_ts}" in fix_url
                 assert "://" in fix_url
                 assert fe_url.url.endswith(original_url)
                 assert fix_url.endswith(original_url)
@@ -262,6 +265,38 @@ def test_short_wayback_ts() -> None:
         == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
     )
 
+    # ensure URL order is stable
+    example_line3: Dict[str, Any] = {
+        "file_entity": {
+            "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"],
+            "mimetype": "application/pdf",
+            "urls": [
+                {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"},
+                {
+                    "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf",
+                    "rel": "webarchive",
+                },
+            ],
+            "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0",
+            "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3",
+            "md5": "89b6e6cc4e0259317e26ddf1a9a336a0",
+            "size": 41265,
+            "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65",
+            "ident": "lvnz23nzijaapf5iti45zez6zu",
+            "state": "active",
+        },
+        "full_urls": {
+            "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf"
+        },
+        "status": "success-db",
+    }
+
+    fe3 = fswtc.parse_record(example_line3)
+    assert len(fe3.urls) == 2
+    assert fe3.urls[0].rel == "web"
+    assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf"
+    assert fe3.urls[1].rel == "webarchive"
+
 
 def main() -> None:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-- 
cgit v1.2.3


From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 14:17:31 -0800
Subject: cleanups: tweaks to wayback CDX cleanup scripts

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py           |  9 ++++++++-
 python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 +++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6c6817ab..6f67c7e1 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -157,7 +157,14 @@ def process_file(fe, session) -> dict:
         if short in full_urls:
             continue
 
-        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        cdx_record = None
+        try:
+            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 403:
+                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
+            else:
+                raise
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 56a5c80e..a9b19921 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
     is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
     instead it has a __main__ function and is invoked like:
 
-        python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json
+        python -m fatcat_tools.cleanups.file_short_wayback_ts - < blah.json
     """
 
     def __init__(self, api: ApiClient, **kwargs):
@@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
             if fe_url.url in url_expansions:
                 fix_url = url_expansions[fe_url.url]
                 # defensive checks
-                assert f"/web/{partial_ts}" in fix_url
+                if not (
+                    f"/web/{partial_ts}" in fix_url
+                    and fe_url.url.endswith(original_url)
+                    and fix_url.endswith(original_url)
+                ):
+                    print(
+                        f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
+                        file=sys.stderr,
+                    )
+                    self.counts["skip-bad-replacement"] += 1
+                    return None
                 assert "://" in fix_url
-                assert fe_url.url.endswith(original_url)
-                assert fix_url.endswith(original_url)
                 fe_url.url = fix_url
                 any_fixed = True
 
@@ -305,7 +313,7 @@ def main() -> None:
     )
     parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
     parser.set_defaults(
-        auth_var="FATCAT_API_AUTH_TOKEN",
+        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
     )
     parser.add_argument(
         "json_file",
-- 
cgit v1.2.3


From 996b2e2084c1798126bd91dd950c063982398bec Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 15:46:20 -0800
Subject: more iteration on short wayback timestamp cleanup

---
 notes/cleanups/scripts/fetch_full_cdx_ts.py        |   2 +-
 notes/cleanups/wayback_timestamps.md               | 129 ++++++++++++++++++++-
 .../fatcat_tools/cleanups/file_short_wayback_ts.py |   2 +-
 3 files changed, 129 insertions(+), 4 deletions(-)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 6f67c7e1..d5b0c476 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -137,7 +137,7 @@ def process_file(fe, session) -> dict:
         if short in full_urls:
             continue
 
-        if original_url in self_urls:
+        if original_url in self_urls and ts in self_urls[original_url]:
             full_urls[short] = self_urls[original_url]
             status = "success-self"
             continue
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index 81785992..85e5f94f 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -61,9 +61,10 @@ Yes, 4-digit is a popular pattern as well, need to handle those:
     # 111M 0:13:22 [ 139k/s]
 
     zcat files_20211007_moreshortts.json.gz | wc -l
+    # 9,958,854
 
     zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json
-    # 9,958,854
+
 
 ## Fetch Complete URL
 
@@ -114,6 +115,14 @@ Again with the broader set:
        6175 "success-db"
        3035 "success-self"
 
+While running a larger batch, got a CDX API error:
+
+    requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20
+
+    org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error
+
+So maybe need to use credentials after all.
+
 
 ## Cleanup Process
 
@@ -128,11 +137,13 @@ It looks like the rel swap is already implemented in `generic_file_cleanups()`.
 From sampling it seems like the mimetype issue is pretty small, so not going to
 bite that off now. The "bogus file" issue requires thought, so also skipping.
 
-## Commands
+
+## Commands (old)
 
 Running with 8x parallelism to not break things; expecting some errors along
 the way, may need to add handlers for connection errors etc:
 
+    # OLD SNAPSHOT
     zcat files_20211007_moreshortts.json.gz \
         | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
         | pv -l \
@@ -140,3 +151,117 @@ the way, may need to add handlers for connection errors etc:
         > files_20211007_moreshortts.fetched.json.gz
 
 At 300 records/sec, this should take around 9-10 hours to process.
+
+
+
+## Prep Again (2021-11-09)
+
+After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot).
+
+Filter again:
+
+    # note: in the future use pigz instead of gzip here
+    zcat file_export.json.gz \
+        | pv -l \
+        | rg 'web.archive.org/web/\d{4,12}/' \
+        | gzip \
+        > files_20211105_moreshortts.json.gz
+    # 112M 0:13:27 [ 138k/s]
+
+    zcat files_20211105_moreshortts.json.gz | wc -l
+    # 9,958,854
+    # good, exact same number as previous snapshot
+
+    zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json
+    # done
+
+    cat files_20211105_moreshortts.10k_sample.json \
+        | ./fetch_full_cdx_ts.py \
+        | pv -l \
+        > files_20211105_moreshortts.10k_sample.fetched.json
+    # 10.0k 0:03:36 [46.3 /s]
+
+    cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c
+         13 "fail-not-found"
+        774 "success-api"
+       6193 "success-db"
+       3020 "success-self"
+
+After tweaking `success-self` logic:
+
+         13 "fail-not-found"
+        859 "success-api"
+       6229 "success-db"
+       2899 "success-self"
+
+
+## Testing in QA
+
+Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json`
+
+    # OLD ATTEMPT
+    export FATCAT_API_AUTH_TOKEN=[...]
+    head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+Ran into issues, iterated above.
+
+Trying again with updated script and sample file:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Manually inspected and these look good. Trying some repeats and larger batches:
+
+    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+    head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+    [...]
+    bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf
+    bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf
+    bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf
+    [...]
+
+    # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0})
+
+
+It looks like these "bad replacement URLs" are due to timestamp mismatches. E.g., the partial timestamp is not a prefix of the full (14-digit) timestamp in the replacement URL.
+
+Tweaked fetch script and re-ran:
+
+    # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0})
+
+Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`.
+
+Run the rest through:
+
+    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+    # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0})
+
+Should tweak batch size to 100 (vs. 50).
+
+How to parallelize import:
+
+    # from within pipenv
+    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \
+        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+
+## Full Batch Commands
+
+Running in bulk again:
+
+    zcat files_20211105_moreshortts.json.gz \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211105_moreshortts.fetched.json.gz
+
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index a9b19921..e2595912 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -311,7 +311,7 @@ def main() -> None:
     parser.add_argument(
         "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
     )
-    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+    parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int)
     parser.set_defaults(
         auth_var="FATCAT_AUTH_WORKER_CLEANUP",
     )
-- 
cgit v1.2.3


From fd0cf6b55567e3081b9e57e2423d6d0e529ace41 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 19:46:55 -0800
Subject: cleanups: add more state=active checks

---
 python/fatcat_tools/cleanups/file_release_bugfix.py   | 4 ++++
 python/fatcat_tools/cleanups/file_short_wayback_ts.py | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'python/fatcat_tools/cleanups/file_short_wayback_ts.py')

diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py
index 6eb60205..5ac69d1a 100644
--- a/python/fatcat_tools/cleanups/file_release_bugfix.py
+++ b/python/fatcat_tools/cleanups/file_release_bugfix.py
@@ -131,6 +131,10 @@ class FileReleaseBugfix(EntityImporter):
             self.counts["skip-existing-not-found"] += 1
             return False
 
+        if existing.state != "active":
+            self.counts["skip-existing-entity-state"] += 1
+            return False
+
         if wrong_release_ident not in existing.release_ids:
             self.counts["skip-existing-fixed"] += 1
             return False
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index e2595912..bdd49f9b 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -119,6 +119,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
             self.counts["skip-existing-not-found"] += 1
             return False
 
+        if existing.state != "active":
+            self.counts["skip-existing-entity-state"] += 1
+            return False
+
         if existing.sha1 != fe.sha1:
             self.counts["skip-existing-mismatch"] += 1
             return False
-- 
cgit v1.2.3