Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/cleanups/file_release_bugfix.py | 241
-rw-r--r-- | python/fatcat_tools/cleanups/file_short_wayback_ts.py | 344
-rw-r--r-- | python/fatcat_tools/cleanups/release_lowercase_doi.py | 154
-rw-r--r-- | python/fatcat_tools/importers/common.py | 9
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 7
5 files changed, 754 insertions, 1 deletion
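
For orientation: this commit adds three one-off cleanup scripts (file/release mapping bugfix, short wayback timestamp expansion, DOI lower-casing) plus small supporting changes to the importer base class and the pubmed importer. The core check in file_short_wayback_ts.py can be illustrated in isolation; the sketch below is not part of the commit, the function name expand_wayback_url is made up for illustration, and the full_urls mapping is assumed to be supplied externally (as in the script's JSON-lines input):

# Hedged sketch, not from the commit: per-URL timestamp expansion as done in
# FileShortWaybackTimestampCleanup.parse_record(), given a mapping of short
# wayback URLs to their full 14-digit-timestamp equivalents.
from typing import Dict, Optional


def expand_wayback_url(url: str, full_urls: Dict[str, str]) -> Optional[str]:
    if "://web.archive.org/web/" not in url:
        return None
    seg = url.split("/")
    # seg: ['https:', '', 'web.archive.org', 'web', '<timestamp>', <original URL parts...>]
    if len(seg) < 6 or seg[2] != "web.archive.org":
        return None
    partial_ts = seg[4]
    original_url = "/".join(seg[5:])
    # the import bug produced 4- or 12-digit timestamps instead of 14 digits
    if len(partial_ts) not in (4, 12):
        return None
    fix_url = full_urls.get(url)
    if not fix_url:
        return None
    # defensive checks, mirroring the cleanup script
    if f"/web/{partial_ts}" not in fix_url or not fix_url.endswith(original_url):
        return None
    return fix_url

For example, with the mapping used in the test cases further down, a URL carrying the 12-digit timestamp 201904301022 expands to its 14-digit 20190430102239 form.
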
diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py new file mode 100644 index 00000000..dc27f9b5 --- /dev/null +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ -0,0 +1,241 @@ +import argparse +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, public_api, uuid2fcid +from fatcat_tools.importers.common import EntityImporter, JsonLinePusher +from fatcat_tools.normal import clean_doi + + +class FileReleaseBugfix(EntityImporter): + """ + This is a one-off / one-time cleanup script for file entities which got + imported with incorrect release ident mappings, due to a bug in the file + ingest importer. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.file_release_bugfix - < blah.json + """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Correct bad file/release import mappings" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix") + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) + self.testing_mode = False + + def want(self, row: Dict[str, Any]) -> bool: + if not ( + row.get("edit_extra") + and row["edit_extra"].get("link_source") + and row["edit_extra"].get("link_source_id") + ): + self.counts["skip-partial"] += 1 + return False + if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]: + self.counts["skip-link-source"] += 1 + return False + if row["edit_extra"].get("ingest_request_source") not in [ + "unpaywall", + "fatcat-changelog", + ]: + self.counts["skip-ingest-request-source"] += 1 + return False + if not row["edit_extra"]["link_source_id"].startswith("10."): + self.counts["skip-source-id-not-doi"] += 1 + return False + return True + + def parse_record(self, row: Dict[str, Any]) -> FileEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + file_ident = uuid2fcid(row["file_ident"]) + wrong_release_ident = uuid2fcid(row["wrong_release_ident"]) + edit_extra = row["edit_extra"] + assert edit_extra["link_source"] in ["unpaywall", "doi"] + file_edit_doi = clean_doi(edit_extra["link_source_id"]) + + if not file_edit_doi: + self.counts["skip-bad-doi"] += 1 + return False + + # check that the "wrong" release exists and doesn't have the DOI + wrong_release = None + try: + wrong_release = self.api.get_release(wrong_release_ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not wrong_release: + self.counts["skip-wrong-release-missing"] += 1 + return None + + if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi: + self.counts["skip-wrong-release-is-ok"] += 1 + return None + + # fetch the "correct" release, if any + fixed_release_ids = [] + correct_release = None + try: + correct_release = self.api.lookup_release(doi=file_edit_doi) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if correct_release: + fixed_release_ids.append(correct_release.ident) + + fe = FileEntity( + ident=file_ident, + 
release_ids=fixed_release_ids, + edit_extra=edit_extra, + ) + fe._wrong_release_ident = wrong_release_ident + return fe + + def try_update(self, fe: FileEntity) -> bool: + + wrong_release_ident = fe._wrong_release_ident + assert len(wrong_release_ident) == 26 + + # should always be existing... but in QA it might not be + existing = None + try: + existing = self.api.get_file(fe.ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 + return False + + if wrong_release_ident not in existing.release_ids: + self.counts["skip-existing-fixed"] += 1 + return False + + # fetch existing history to verify mismatch + history = self.api.get_file_history(existing.ident) + + for entry in history: + if entry.editgroup.editor.is_bot is not True: + self.counts["skip-existing-edit-history-human"] += 1 + return False + + bad_edit = history[-1].edit + if bad_edit.extra != fe.edit_extra: + self.counts["skip-existing-edit-history-extra-mismatch"] += 1 + return False + + bad_editgroup = history[-1].editgroup + if not bad_editgroup.extra: + self.counts["skip-existing-editgroup-missing-extra"] += 1 + return False + + if ( + bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa" + or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter" + or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3") + or bad_editgroup.created.year != 2020 + ): + self.counts["skip-existing-edit-history-mismatch"] += 1 + return False + + existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident] + + if len(fe.release_ids) == 1: + if fe.release_ids[0] not in existing.release_ids: + existing.release_ids.append(fe.release_ids[0]) + + existing.edit_extra = fe.edit_extra + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + self.counts["update"] += 1 + return False + + +def test_file_release_bugfix() -> None: + api = public_api("http://localhost:9411/v0") + frbc = FileReleaseBugfix(api=api) + frbc.testing_mode = True + + assert frbc.want({"this": "asdf"}) is False + + example_line: Dict[str, Any] = { + "file_ident": "00000000-0000-0000-3333-000000000002", + "wrong_release_ident": "00000000-0000-0000-4444-000000000002", + "edit_extra": { + "link_source": "unpaywall", + "link_source_id": "10.1371/journal.pmed.0020124", + "ingest_request_source": "unpaywall", + }, + } + + fe1 = frbc.parse_record(example_line) + print(frbc.counts) + frbc.try_update(fe1) + + # NOTE: this test is pretty incompleted + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.set_defaults( + auth_var="FATCAT_AUTH_WORKER_CLEANUP", + ) + parser.add_argument( + "json_file", + help="File with jsonlines with cleanup context", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + frbc = 
FileReleaseBugfix( + api, + edit_batch_size=args.batch_size, + ) + JsonLinePusher(frbc, args.json_file).run() + + +if __name__ == "__main__": + main() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py new file mode 100644 index 00000000..bdd49f9b --- /dev/null +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -0,0 +1,344 @@ +import argparse +import copy +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools.importers.common import EntityImporter, JsonLinePusher + + +class FileShortWaybackTimestampCleanup(EntityImporter): + """ + This is a one-off / one-time cleanup script for file entities, fix short + timestamps in wayback URLs. These timestamps are supposed to have 14 digits + (datetime with year, hour, seconds, etc). Some legacy file imports ended up + with only 4 or 12 digits. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json + """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Expand trunacted timestamps in wayback URLs" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get( + "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" + ) + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) + self.testing_mode = False + + def want(self, row: Dict[str, Any]) -> bool: + if row["status"].startswith("success"): + return True + else: + self.counts["skip-status"] += 1 + return False + + def parse_record(self, row: Dict[str, Any]) -> FileEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity) + status: str = row["status"] + assert status.startswith("success") + url_expansions: Dict[str, str] = row["full_urls"] + assert len(url_expansions) >= 1 + + # actual cleanup happens here + any_fixed = False + for fe_url in fe.urls: + if "://web.archive.org/web/" not in fe_url.url: + continue + seq = fe_url.url.split("/") + partial_ts = seq[4] + original_url = "/".join(seq[5:]) + if seq[2] != "web.archive.org": + continue + if len(partial_ts) not in [4, 12]: + continue + if fe_url.url in url_expansions: + fix_url = url_expansions[fe_url.url] + # defensive checks + if not ( + f"/web/{partial_ts}" in fix_url + and fe_url.url.endswith(original_url) + and fix_url.endswith(original_url) + ): + print( + f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", + file=sys.stderr, + ) + self.counts["skip-bad-replacement"] += 1 + return None + assert "://" in fix_url + fe_url.url = fix_url + any_fixed = True + + if not any_fixed: + self.counts["skip-no-fixes"] += 1 + return None + + # do any other generic file entity cleanups + # this includes removing duplicates + fe = self.generic_file_cleanups(fe) + + # verify that there are no exact duplicates + final_urls = [u.url for u in fe.urls] + assert len(final_urls) == len(list(set(final_urls))) + + return fe + + def try_update(self, fe: FileEntity) -> bool: + + 
# should always be existing + try: + existing = self.api.get_file(fe.ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 + return False + + if existing.sha1 != fe.sha1: + self.counts["skip-existing-mismatch"] += 1 + return False + + assert fe.revision and existing.revision + if existing.revision != fe.revision: + self.counts["skip-revision-changed"] += 1 + return False + + # verify that at least one URL remains + if not fe.urls or len(fe.urls) < 1: + self.counts["skip-no-urls"] += 1 + return False + + # verify that all wayback urls have 14-digit timestamps, and are generally well-formed + for u in fe.urls: + if "://web.archive.org/web/" not in u.url: + continue + if u.rel != "webarchive": + self.counts["skip-bad-wayback-rel"] += 1 + return False + seg = u.url.split("/") + if ( + len(seg) < 6 + or seg[0] != "https:" + or seg[2] != "web.archive.org" + or seg[3] != "web" + ): + self.counts["skip-bad-wayback"] += 1 + return False + if len(seg[4]) != 14 or not seg[4].isdigit(): + self.counts["skip-bad-wayback-timestamp"] += 1 + return False + + if existing == fe or existing.urls == fe.urls: + self.counts["skip-no-change"] += 1 + return False + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + # note: passing 'fe' instead of 'existing' here, which is not + # usually how it goes! + self.api.update_file(self.get_editgroup_id(), fe.ident, fe) + self.counts["update"] += 1 + return False + + +def test_short_wayback_ts() -> None: + api = public_api("http://localhost:9411/v0") + fswtc = FileShortWaybackTimestampCleanup(api=api) + fswtc.testing_mode = True + + assert fswtc.want({"status": "fail"}) is False + assert fswtc.want({"status": "success-self"}) is True + + example_line: Dict[str, Any] = { + "status": "success-db", + "file_entity": { + # note: doesn't match actual entity + "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", + "rel": "web", + }, + { + "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", + "rel": "webarchive", + }, + ], + "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29", + "sha1": "be714299b9be21b5afdaa7affd7d710c58269433", + "md5": "9edb542be5b3446a1905e61a8a3abebd", + "size": 666242, + "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd", + "ident": "4ghpvs2t2rdtrdum2mkreh62me", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" + }, + } + example_fe = entity_from_dict(example_line["file_entity"], FileEntity) + + fe1 = copy.copy(example_fe) + fe1.urls[ + 1 + ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" + assert fswtc.parse_record(example_line) == fe1 + + # update code path; requires a known file ident and API running locally + assert fswtc.counts["update"] == 0 + dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai") + fe1.ident = dummy_fe.ident + + 
assert fswtc.try_update(fe1) is False + assert fswtc.counts["skip-existing-mismatch"] == 1 + + fe1.sha1 = dummy_fe.sha1 + assert fswtc.try_update(fe1) is False + assert fswtc.counts["skip-revision-changed"] == 1 + + fe1.revision = dummy_fe.revision + assert fswtc.try_update(fe1) is False + print(fswtc.counts) + assert fswtc.counts["update"] == 1 + + # another example, which failed with an assertion in prod due to duplicated URLs + example_line2: Dict[str, Any] = { + "file_entity": { + "release_ids": ["22jt7euq4fafhblzullmnesso4"], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "repository", + }, + { + "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "webarchive", + }, + { + "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "webarchive", + }, + ], + "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19", + "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433", + "md5": "3d509743359649e34a27ae70c5cd3018", + "size": 430665, + "extra": { + "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"} + }, + "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48", + "ident": "duymhmxk3fgtzk37yp2pvthtxq", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" + }, + "status": "success-self", + } + + fe2 = fswtc.parse_record(example_line2) + assert len(fe2.urls) == 2 + assert fe2.urls[0].rel == "repository" + assert ( + fe2.urls[1].url + == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" + ) + + # ensure URL order is stable + example_line3: Dict[str, Any] = { + "file_entity": { + "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"], + "mimetype": "application/pdf", + "urls": [ + {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"}, + { + "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf", + "rel": "webarchive", + }, + ], + "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0", + "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3", + "md5": "89b6e6cc4e0259317e26ddf1a9a336a0", + "size": 41265, + "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65", + "ident": "lvnz23nzijaapf5iti45zez6zu", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf" + }, + "status": "success-db", + } + + fe3 = fswtc.parse_record(example_line3) + assert len(fe3.urls) == 2 + assert fe3.urls[0].rel == "web" + assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf" + assert fe3.urls[1].rel == "webarchive" + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int) + parser.set_defaults( + auth_var="FATCAT_AUTH_WORKER_CLEANUP", + ) + parser.add_argument( + "json_file", + help="File with 
jsonlines from file_meta schema to import from", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + fswtc = FileShortWaybackTimestampCleanup( + api, + edit_batch_size=args.batch_size, + ) + JsonLinePusher(fswtc, args.json_file).run() + + +if __name__ == "__main__": + main() diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py new file mode 100644 index 00000000..5e3275db --- /dev/null +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -0,0 +1,154 @@ +import argparse +import os +import sys + +from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds + +from fatcat_tools import authenticated_api, public_api +from fatcat_tools.importers.common import EntityImporter, LinePusher + + +class ReleaseLowercaseDoiCleanup(EntityImporter): + """ + This is a one-off / one-time cleanup script for release entities, to fix + upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv + + It expects to get a simple text line on stdin, which is a release entity. + The correction is implemented by fetching the current version of the + entity, verifying the issue, and updating if it is still a problem. + + This does not try to do any merging, just corrects the case in a single + update. 
+ """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Normalize release DOIs (extid) to lower-case" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup") + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) + self.testing_mode = False + + def want(self, row: str) -> bool: + row = row.strip() + if not row: + return False + row = row.split()[0] + if len(row) == 26: + return True + else: + return False + + def parse_record(self, row: str) -> ReleaseEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + ident = row.strip().split()[0] + assert len(ident) == 26 + + return ReleaseEntity( + ident=ident, + ext_ids=ReleaseExtIds(), + ) + + def try_update(self, re: ReleaseEntity) -> bool: + + # should always be existing, but sometimes not because of prod/QA flip + existing = None + try: + existing = self.api.get_release(re.ident) + except ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 + return False + + if not existing.ext_ids.doi: + self.counts["skip-existing-no-doi"] += 1 + return False + + if existing.ext_ids.doi == existing.ext_ids.doi.lower(): + self.counts["skip-existing-doi-fine"] += 1 + return False + + existing.ext_ids.doi = existing.ext_ids.doi.lower() + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts["update"] += 1 + return False + + +def test_lowercase_doi() -> None: + api = public_api("http://localhost:9411/v0") + rldc = ReleaseLowercaseDoiCleanup(api=api) + rldc.testing_mode = True + + assert rldc.want("") is False + assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True + assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True + rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai") + + dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai") + assert rldc.try_update(dummy_re) is False + assert rldc.counts["skip-existing-doi-fine"] == 1 + # this isn't a very complete test, doesn't get to update part + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.set_defaults( + auth_var="FATCAT_AUTH_WORKER_CLEANUP", + ) + parser.add_argument( + "idents_file", + help="File with release identifier to try updating", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + rldc = ReleaseLowercaseDoiCleanup( + api, + edit_batch_size=args.batch_size, + ) + LinePusher(rldc, args.idents_file).run() + + +if __name__ == "__main__": + main() diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 7c587395..e2157ee5 100644 --- a/python/fatcat_tools/importers/common.py +++ 
b/python/fatcat_tools/importers/common.py @@ -375,6 +375,15 @@ class EntityImporter: if u.rel == "social": u.rel = "academicsocial" + # remove exact URL duplicates, while preserving order, and removing + # "later" copies, not "first" copies + # this is sensitive to both url.url and url.rel combined! + dedupe_urls = [] + for url_pair in existing.urls: + if url_pair not in dedupe_urls: + dedupe_urls.append(url_pair) + existing.urls = dedupe_urls + # remove URLs which are near-duplicates redundant_urls = [] all_urls = [u.url for u in existing.urls] diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 5bc7a9ff..a6c7409d 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -466,7 +466,12 @@ class PubmedImporter(EntityImporter): self.counts["exists"] += 1 return False - if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): + if ( + existing + and existing.ext_ids.pmid + and (existing.ext_ids.pmcid or not re.ext_ids.pmcid) + and (existing.refs or not re.refs) + ): # TODO: any other reasons to do an update? # don't update if it already has PMID self.counts["exists"] += 1 |
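
Note on the common.py hunk above: it inserts an exact-duplicate URL pass (preserving order, dropping later copies, matching on both url and rel) ahead of the existing near-duplicate logic. A minimal standalone sketch of that behavior, assuming the generated FileUrl model compares equal when url and rel both match; the helper name dedupe_exact_urls and the example URLs are illustrative only:

# Hedged sketch, not from the commit: exact-duplicate URL removal as added to
# EntityImporter.generic_file_cleanups() in importers/common.py.
from typing import List

from fatcat_openapi_client import FileUrl


def dedupe_exact_urls(urls: List[FileUrl]) -> List[FileUrl]:
    deduped: List[FileUrl] = []
    for url_pair in urls:
        # equality covers both .url and .rel, so only exact pairs are dropped
        if url_pair not in deduped:
            deduped.append(url_pair)
    return deduped


urls = [
    FileUrl(url="https://example.org/paper.pdf", rel="web"),
    FileUrl(url="https://example.org/paper.pdf", rel="web"),         # exact dupe: dropped
    FileUrl(url="https://example.org/paper.pdf", rel="repository"),  # different rel: kept
]
assert [u.rel for u in dedupe_exact_urls(urls)] == ["web", "repository"]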