diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/cleanups/release_lowercase_doi.py | 145 | 
1 files changed, 145 insertions, 0 deletions
| diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py new file mode 100644 index 00000000..812262d4 --- /dev/null +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -0,0 +1,145 @@ +import argparse +import copy +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds + +from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools.importers.common import EntityImporter, LinePusher + + +class ReleaseLowercaseDoiCleanup(EntityImporter): +    """ +    This is a one-off / one-time cleanup script for release entities, to fix +    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase. + +    While this calls itself a cleanup, it is based on the import code path. It +    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; +    instead it has a __main__ function and is invoked like: + +        python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv + +    It expects to get a simple text line on stdin, which is a release entity. +    The correction is implemented by fetching the current version of the +    entity, verifying the issue, and updating if it is still a problem. + +    This does not try to do any merging, just corrects the case in a single +    update. +    """ + +    def __init__(self, api: ApiClient, **kwargs): + +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Normalize release DOIs (extid) to lower-case" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup") +        super().__init__( +            api, +            do_updates=True, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs, +        ) +        self.testing_mode = False + +    def want(self, row: str) -> bool: +        row = row.strip().split()[0] +        if len(row) == 26: +            return True +        else: +            return False + +    def parse_record(self, row: str) -> ReleaseEntity: + +        # bezerk mode doesn't make sense for this importer +        assert self.bezerk_mode is False + +        ident = row.strip().split()[0] +        assert len(ident) == 26 + +        return ReleaseEntity( +            ident=ident, +            ext_ids=ReleaseExtIds(), +        ) + +    def try_update(self, re: ReleaseEntity) -> bool: + +        # should always be existing +        existing = self.api.get_release(re.ident) + +        if not existing: +            self.counts["skip-existing-not-found"] += 1 +            return False + +        if not existing.ext_ids.doi: +            self.counts["skip-existing-no-doi"] += 1 +            return False + +        if existing.ext_ids.doi == existing.ext_ids.doi.lower(): +            self.counts["skip-existing-doi-fine"] += 1 +            return False + +        existing.ext_ids.doi = existing.ext_ids.doi.lower() + +        # not doing a check for "in current editgroup", because the source of +        # these corrections (entity dump) contains no dupes + +        if not self.testing_mode: +            self.api.update_release(self.get_editgroup_id(), re.ident, re) +        self.counts["update"] += 1 +        return False + + +def test_lowercase_doi() -> None: +    api = public_api("http://localhost:9411/v0") +    rldc = ReleaseLowercaseDoiCleanup(api=api) +    rldc.testing_mode = True + +    assert rldc.want("") is False +    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True +    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True +    rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai") + +    dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai") +    assert rldc.try_update(dummy_re) is False +    assert rldc.counts["skip-existing-doi-fine"] == 1 +    # this isn't a very complete test, doesn't get to update part + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" +    ) +    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) +    parser.set_defaults( +        auth_var="FATCAT_AUTH_WORKER_CLEANUP", +    ) +    parser.add_argument( +        "idents_file", +        help="File with release identifier to try updating", +        default=sys.stdin, +        type=argparse.FileType("r"), +    ) + +    args = parser.parse_args() +    api = authenticated_api( +        args.host_url, +        # token is an optional kwarg (can be empty string, None, etc) +        token=os.environ.get(args.auth_var), +    ) + +    rldc = ReleaseLowercaseDoiCleanup( +        api, +        edit_batch_size=args.batch_size, +    ) +    LinePusher(rldc, args.idents_file).run() + + +if __name__ == "__main__": +    main() | 
