summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/cleanups/release_lowercase_doi.py
blob: 5e3275db82b507f15fffaef117a21c2b46844e2a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import argparse
import os
import sys

from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds

from fatcat_tools import authenticated_api, public_api
from fatcat_tools.importers.common import EntityImporter, LinePusher


class ReleaseLowercaseDoiCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for release entities, to fix
    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleanups.release_lowercase_doi - < blah.tsv

    It expects one text line per release on stdin. The first
    whitespace-separated token of each line must be the 26-character release
    identifier; any further columns on the line are ignored.

    The correction is implemented by fetching the current version of the
    entity, verifying the issue, and updating if it is still a problem.

    This does not try to do any merging, just corrects the case in a single
    update.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        """
        Set up editgroup metadata and force update mode on the base importer.

        `editgroup_description` and `editgroup_extra` may be passed as keyword
        arguments to override the defaults; all remaining keyword arguments
        are forwarded unchanged to `EntityImporter.__init__`.
        """
        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Normalize release DOIs (extid) to lower-case"
        )
        # Work on a copy so the caller's dict is never mutated; `or {}` also
        # tolerates an explicit `editgroup_extra=None`.
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # When True, try_update() does all its checks and counting but skips
        # the actual update_release() API call.
        self.testing_mode = False

    def want(self, row: str) -> bool:
        """
        Return True if the first whitespace-separated token of `row` is 26
        characters long (the length checked for a release identifier).

        Empty and whitespace-only lines are rejected.
        """
        fields = row.split()
        return bool(fields) and len(fields[0]) == 26

    def parse_record(self, row: str) -> ReleaseEntity:
        """
        Build a stub ReleaseEntity carrying only the identifier from `row`.

        The stub has empty external identifiers; the real entity is fetched
        later in try_update().
        """

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        ident = row.strip().split()[0]
        # invariant: want() has already filtered out anything else
        assert len(ident) == 26

        return ReleaseEntity(
            ident=ident,
            ext_ids=ReleaseExtIds(),
        )

    def try_update(self, re: ReleaseEntity) -> bool:
        """
        Fetch the current release for `re.ident` and lower-case its DOI if
        needed.

        Each outcome is tallied in `self.counts` (one of the `skip-existing-*`
        keys, or `update`). The update is submitted here directly via
        `update_release`, so this method always returns False.
        NOTE(review): presumably False tells the EntityImporter base class
        that nothing is left to insert for this record; confirm there.

        Raises ApiException for any API error other than a 404 on lookup.
        """

        # should always be existing, but sometimes not because of prod/QA flip
        existing = None
        try:
            existing = self.api.get_release(re.ident)
        except ApiException as err:
            if err.status != 404:
                # bare raise re-raises the active exception unchanged
                raise

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        doi = existing.ext_ids.doi
        if not doi:
            self.counts["skip-existing-no-doi"] += 1
            return False

        lowered = doi.lower()
        if doi == lowered:
            self.counts["skip-existing-doi-fine"] += 1
            return False

        existing.ext_ids.doi = lowered

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False


def test_lowercase_doi() -> None:
    """
    Smoke test for ReleaseLowercaseDoiCleanup in testing mode.

    Talks to the API at the URL below, so it needs that service running.
    Covers line filtering, record parsing and the "DOI already fine" skip
    path; the branch that actually submits an update is not reached.
    """
    client = public_api("http://localhost:9411/v0")
    cleanup = ReleaseLowercaseDoiCleanup(api=client)
    cleanup.testing_mode = True

    # (input line, expected want() result)
    expectations = (
        ("", False),
        ("aaaaaaaaaaaaarceaaaaaaaaai", True),
        ("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD", True),
    )
    for line, wanted in expectations:
        assert cleanup.want(line) is wanted
    cleanup.parse_record("aaaaaaaaaaaaarceaaaaaaaaai")

    release = client.get_release("aaaaaaaaaaaaarceaaaaaaaaai")
    assert cleanup.try_update(release) is False
    assert cleanup.counts["skip-existing-doi-fine"] == 1
    # this isn't a very complete test, doesn't get to update part


def main() -> None:
    """
    Command-line entry point: parse arguments, authenticate, and push every
    line of the identifier file through ReleaseLowercaseDoiCleanup.

    The API token is read from the environment variable named by `auth_var`
    (FATCAT_AUTH_WORKER_CLEANUP). The identifier file is optional and falls
    back to stdin when omitted; passing `-` explicitly also selects stdin.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "idents_file",
        help="File with release identifier to try updating",
        # Without nargs="?" a positional is always required and its default
        # is never used. The string default "-" is run through FileType,
        # which maps it to sys.stdin (and renders cleanly in --help).
        nargs="?",
        default="-",
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    rldc = ReleaseLowercaseDoiCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    LinePusher(rldc, args.idents_file).run()


# Run the cleanup only when this module is executed as a script (directly or
# via `python -m`), not when it is imported.
if __name__ == "__main__":
    main()