import argparse
import os
import sys

from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds

from fatcat_tools import authenticated_api, public_api
from fatcat_tools.importers.common import EntityImporter, LinePusher


class ReleaseLowercaseDoiCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for release entities, to fix
    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleanups.release_lowercase_doi - < blah.tsv

    It expects one text line per release on stdin. The first
    whitespace-separated column of each line must be the 26-character release
    identifier; any further columns (eg, the DOI) are ignored.

    The correction is implemented by fetching the current version of the
    entity, verifying the issue, and updating if it is still a problem.

    This does not try to do any merging, just corrects the case in a single
    update.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        """
        Set up the importer with update mode enabled and a default editgroup
        description / agent, both of which can be overridden via the
        `editgroup_description` and `editgroup_extra` keyword arguments.
        Remaining keyword arguments are passed through to EntityImporter.
        """
        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Normalize release DOIs (extid) to lower-case"
        )
        # copy, so that a dict passed in by the caller is never mutated
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # when True, try_update() does all checks but sends no update to the API
        self.testing_mode = False

    def want(self, row: str) -> bool:
        """
        Return True if the first whitespace-separated column of `row` is 26
        characters long (the length of a release identifier).

        Blank or whitespace-only rows return False (instead of raising
        IndexError).
        """
        columns = row.split()
        return bool(columns) and len(columns[0]) == 26

    def parse_record(self, row: str) -> ReleaseEntity:
        """
        Build a stub ReleaseEntity carrying only the identifier found in the
        first column of `row`. The stub just tells try_update() which release
        to fetch; it is never sent to the API.
        """
        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        ident = row.strip().split()[0]
        assert len(ident) == 26

        return ReleaseEntity(
            ident=ident,
            ext_ids=ReleaseExtIds(),
        )

    def try_update(self, re: ReleaseEntity) -> bool:
        """
        Fetch the current version of release `re.ident` and, if it is active
        and has a DOI containing upper-case characters, lower-case the DOI and
        send the update as part of the current editgroup.

        Each skipped row is tallied in `self.counts` under a "skip-existing-*"
        key. ApiException is re-raised for anything other than a 404.

        Always returns False. NOTE(review): presumably this tells the
        EntityImporter machinery that there is nothing to batch-insert, as the
        update is sent directly from here; confirm against EntityImporter.
        """
        # should always be existing, but sometimes not because of prod/QA flip
        existing = None
        try:
            existing = self.api.get_release(re.ident)
        except ApiException as err:
            if err.status != 404:
                raise

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        if not existing.ext_ids.doi:
            self.counts["skip-existing-no-doi"] += 1
            return False

        lowercase_doi = existing.ext_ids.doi.lower()
        if existing.ext_ids.doi == lowercase_doi:
            self.counts["skip-existing-doi-fine"] += 1
            return False

        existing.ext_ids.doi = lowercase_doi

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            # send the full fetched entity (with corrected DOI), not the stub
            # `re`, which has empty ext_ids and no other metadata
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False


def test_lowercase_doi() -> None:
    """
    Smoke test for the row filtering and the "DOI already fine" skip path.

    Talks to the API at http://localhost:9411/v0; runs with testing_mode set
    so that no update is ever sent.
    """
    api = public_api("http://localhost:9411/v0")
    rldc = ReleaseLowercaseDoiCleanup(api=api)
    rldc.testing_mode = True

    assert rldc.want("") is False
    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True
    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True
    rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai")

    dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai")
    assert rldc.try_update(dummy_re) is False
    assert rldc.counts["skip-existing-doi-fine"] == 1
    # this isn't a very complete test, doesn't get to update part


def main() -> None:
    """
    Command-line entry point: read release identifiers (one per line) from a
    file, or from stdin when the argument is "-" or omitted, and run the
    cleanup against the API at --host-url. The API token is read from the
    FATCAT_AUTH_WORKER_CLEANUP environment variable.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "idents_file",
        help="File with release identifier to try updating",
        # nargs="?" is required for the default of a positional to take effect
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    rldc = ReleaseLowercaseDoiCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    LinePusher(rldc, args.idents_file).run()


if __name__ == "__main__":
    main()
entity_from_dict, public_api +from fatcat_tools import authenticated_api, public_api from fatcat_tools.importers.common import EntityImporter, LinePusher @@ -76,6 +73,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): self.counts["skip-existing-not-found"] += 1 return False + if existing.status != "active": + self.counts["skip-existing-entity-status"] += 1 + return False + if not existing.ext_ids.doi: self.counts["skip-existing-no-doi"] += 1 return False -- cgit v1.2.3 From d9284d421618742f5ecd76ba2c6d92dcefa5e5db Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 16:56:28 -0800 Subject: updates to lowercase DOI cleanup --- notes/cleanups/case_sensitive_dois.md | 71 ++++++++++++++++++++++ .../fatcat_tools/cleanups/release_lowercase_doi.py | 22 ++++--- 2 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 notes/cleanups/case_sensitive_dois.md (limited to 'python/fatcat_tools/cleanups/release_lowercase_doi.py') diff --git a/notes/cleanups/case_sensitive_dois.md b/notes/cleanups/case_sensitive_dois.md new file mode 100644 index 00000000..1bf1901e --- /dev/null +++ b/notes/cleanups/case_sensitive_dois.md @@ -0,0 +1,71 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/83 + +How many existing fatcat releases have a non-lowercase DOI? As of June 2021: + + zcat release_extid.tsv.gz | cut -f3 | rg '[A-Z]' | pv -l | wc -l + 139964 + +## Prep + + wget https://archive.org/download/fatcat_bulk_exports_2021-11-05/release_extid.tsv.gz + + # scratch:bin/fcid.py is roughly the same as `fatcat_util.py uuid2fcid` + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '[A-Z]' \ + | /fast/scratch/bin/fcid.py \ + | pv -l \ + > nonlowercase_doi_releases.tsv + # 140k 0:03:54 [ 599 /s] + + wc -l nonlowercase_doi_releases.tsv + 140530 nonlowercase_doi_releases.tsv + +Uhoh, there are ~500 more than previously? Guess those are from after the fix? 
+ +Create a sample for testing: + + shuf -n10000 nonlowercase_doi_releases.tsv \ + > nonlowercase_doi_releases.10k_sample.tsv + +## Test in QA + +In pipenv: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + + head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # no such release_ident found: dcjsybvqanffhmu4dhzdnptave + +This lookup presumably fails because the script is being run against QA, while the snapshot includes some newer prod releases that do not exist there. + +Did a quick update to the script (skip releases that are not found instead of erroring), and then: + + head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 2000, 'skip-existing-doi-fine': 1100, 'update': 898, 'skip-existing-not-found': 2, 'skip': 0, 'insert': 0, 'exists': 0}) + +Did some spot checking in QA. Out of 20 DOIs checked, 15 were valid, 5 were not +valid (doi.org 404). It seems like roughly 1/3 have a dupe DOI (the lower-case +DOI exists); didn't count exact numbers. + +This cleanup is simple and looks good to go. Batch size of 50 is good for full +releases. + +Example of parallelization: + + cat /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi - + +Ready to go! 
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py index 4e5a6f43..5e3275db 100644 --- a/python/fatcat_tools/cleanups/release_lowercase_doi.py +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -2,7 +2,7 @@ import argparse import os import sys -from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds +from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds from fatcat_tools import authenticated_api, public_api from fatcat_tools.importers.common import EntityImporter, LinePusher @@ -45,7 +45,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): self.testing_mode = False def want(self, row: str) -> bool: - row = row.strip().split()[0] + row = row.strip() + if not row: + return False + row = row.split()[0] if len(row) == 26: return True else: @@ -66,15 +69,20 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): def try_update(self, re: ReleaseEntity) -> bool: - # should always be existing - existing = self.api.get_release(re.ident) + # should always be existing, but sometimes not because of prod/QA flip + existing = None + try: + existing = self.api.get_release(re.ident) + except ApiException as err: + if err.status != 404: + raise err if not existing: self.counts["skip-existing-not-found"] += 1 return False - if existing.status != "active": - self.counts["skip-existing-entity-status"] += 1 + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 return False if not existing.ext_ids.doi: @@ -91,7 +99,7 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): # these corrections (entity dump) contains no dupes if not self.testing_mode: - self.api.update_release(self.get_editgroup_id(), re.ident, re) + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) self.counts["update"] += 1 return False -- cgit v1.2.3