1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
  | 
import argparse
import os
import sys
from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds
from fatcat_tools import authenticated_api, public_api
from fatcat_tools.importers.common import EntityImporter, LinePusher
class ReleaseLowercaseDoiCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for release entities, to fix
    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv

    Each input line is expected to start with a release identifier (26
    characters); any further whitespace-separated columns on the line are
    ignored.

    The correction is implemented by fetching the current version of the
    entity, verifying the issue, and updating if it is still a problem.

    This does not try to do any merging, just corrects the case in a single
    update.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        """
        Configure the importer with updates enabled.

        `editgroup_description` and `editgroup_extra` may be supplied as
        keyword arguments; a default description and a default "agent" entry
        are filled in when they are missing. All remaining keyword arguments
        are forwarded unchanged to `EntityImporter.__init__`.
        """
        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Normalize release DOIs (extid) to lower-case"
        )
        # Work on a copy so the caller's dict is never mutated; this also
        # tolerates an explicit `editgroup_extra=None`.
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # when True, try_update() skips the actual update_release() API call
        self.testing_mode = False

    def want(self, row: str) -> bool:
        """
        Return True when the first whitespace-separated token of `row` is 26
        characters long; blank lines are rejected.
        """
        tokens = row.split()
        return bool(tokens) and len(tokens[0]) == 26

    def parse_record(self, row: str) -> ReleaseEntity:
        """
        Build a stub ReleaseEntity carrying only the identifier taken from the
        first whitespace-separated token of `row`.
        """
        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False
        ident = row.strip().split()[0]
        assert len(ident) == 26
        return ReleaseEntity(
            ident=ident,
            ext_ids=ReleaseExtIds(),
        )

    def try_update(self, re: ReleaseEntity) -> bool:
        """
        Fetch the current release for `re.ident` and lower-case its DOI if it
        still contains upper-case characters.

        Every outcome is tallied in `self.counts` ("skip-existing-*" or
        "update"). The update itself is submitted here via
        `api.update_release`, unless `testing_mode` is set.

        Always returns False; presumably this tells the base importer there is
        nothing left to insert -- confirm against EntityImporter.

        Raises ApiException for any API error other than a 404 on lookup.
        """
        # should always be existing, but sometimes not because of prod/QA flip
        existing = None
        try:
            existing = self.api.get_release(re.ident)
        except ApiException as err:
            # a 404 is handled below as "not found"; anything else propagates
            if err.status != 404:
                raise

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        if not existing.ext_ids.doi:
            self.counts["skip-existing-no-doi"] += 1
            return False

        if existing.ext_ids.doi == existing.ext_ids.doi.lower():
            self.counts["skip-existing-doi-fine"] += 1
            return False

        existing.ext_ids.doi = existing.ext_ids.doi.lower()

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False
def test_lowercase_doi() -> None:
    """
    Smoke test for ReleaseLowercaseDoiCleanup.

    Talks to an API at http://localhost:9411/v0 and checks line filtering,
    record parsing, and the "DOI already fine" skip path of try_update().
    """
    client = public_api("http://localhost:9411/v0")
    cleanup = ReleaseLowercaseDoiCleanup(api=client)
    # testing mode makes try_update() skip the update_release() call
    cleanup.testing_mode = True

    # (input line, expected want() result)
    want_cases = (
        ("", False),
        ("aaaaaaaaaaaaarceaaaaaaaaai", True),
        ("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD", True),
    )
    for line, expected in want_cases:
        assert cleanup.want(line) is expected

    cleanup.parse_record("aaaaaaaaaaaaarceaaaaaaaaai")

    fetched = client.get_release("aaaaaaaaaaaaarceaaaaaaaaai")
    assert cleanup.try_update(fetched) is False
    assert cleanup.counts["skip-existing-doi-fine"] == 1
    # this isn't a very complete test, doesn't get to update part
def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, builds an authenticated API client (the token is
    read from the environment variable named by `auth_var`), and pushes every
    line of the identifiers file through ReleaseLowercaseDoiCleanup.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    # auth_var is not exposed as a flag; it is only set as a namespace default
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "idents_file",
        help="File with release identifier to try updating",
        # without nargs="?" a positional is always required, so the stdin
        # default below would never take effect
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    args = parser.parse_args()

    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    rldc = ReleaseLowercaseDoiCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    LinePusher(rldc, args.idents_file).run()
# Run the cleanup only when this file is executed as a script (for example via
# `python -m ...`), not when it is imported as a module.
if __name__ == "__main__":
    main()
  |