1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
|
import argparse
import os
import sys
from typing import Any, Dict, Optional

import fatcat_openapi_client
from fatcat_openapi_client import DefaultApi, FileEntity

from fatcat_tools import authenticated_api, public_api, uuid2fcid
from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
from fatcat_tools.normal import clean_doi
class FileReleaseBugfix(EntityImporter):
    """
    This is a one-off / one-time cleanup script for file entities which got
    imported with incorrect release ident mappings, due to a bug in the file
    ingest importer.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleans.file_release_bugfix - < blah.json

    Each input row is read as a dict with `file_ident`, `wrong_release_ident`
    and `edit_extra` keys (see `want()` and `parse_record()`).
    """

    # Fingerprint of the editgroups that introduced the bad mappings; an
    # entity is only touched when the last entry of its edit history matches
    # all of these.
    _BAD_EDITOR_ID = "scmbogxw25evtcesfcab5qaboa"
    _BAD_AGENT = "fatcat_tools.IngestFileResultImporter"
    _BAD_GIT_REV_PREFIX = "v0.3"
    _BAD_YEAR = 2020

    def __init__(self, api: DefaultApi, **kwargs: Any) -> None:
        """
        Set up the importer with updates enabled.

        `editgroup_description` and `editgroup_extra` may be supplied as
        keyword arguments; defaults are filled in otherwise. All remaining
        keyword arguments are forwarded to `EntityImporter`.
        """
        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Correct bad file/release import mappings"
        )
        # Work on a copy so the caller's dict is never mutated when the
        # default agent is filled in (also tolerates an explicit None).
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.FileReleaseBugfix")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # When True, try_update() runs all checks but does not call the
        # update API endpoint.
        self.testing_mode = False

    def want(self, row: Dict[str, Any]) -> bool:
        """
        Cheap pre-filter on a raw input row.

        Returns True only when the row's `edit_extra` has a `link_source` of
        "unpaywall" or "doi", an `ingest_request_source` of "unpaywall" or
        "fatcat-changelog", and a `link_source_id` starting with "10.".
        Every rejection increments a `skip-*` counter.
        """
        edit_extra = row.get("edit_extra")
        if not (
            edit_extra
            and edit_extra.get("link_source")
            and edit_extra.get("link_source_id")
        ):
            self.counts["skip-partial"] += 1
            return False
        if edit_extra["link_source"] not in ["unpaywall", "doi"]:
            self.counts["skip-link-source"] += 1
            return False
        if edit_extra.get("ingest_request_source") not in [
            "unpaywall",
            "fatcat-changelog",
        ]:
            self.counts["skip-ingest-request-source"] += 1
            return False
        if not edit_extra["link_source_id"].startswith("10."):
            self.counts["skip-source-id-not-doi"] += 1
            return False
        return True

    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
        """
        Turn an input row into a partial FileEntity describing the fix.

        The returned entity carries the file ident, the corrected
        `release_ids` (empty, or the single release found by DOI lookup), the
        original `edit_extra`, and a private `_wrong_release_ident` attribute
        consumed by `try_update()`.

        Returns None (after incrementing a `skip-*` counter) when the DOI does
        not survive cleaning, the "wrong" release cannot be fetched (404), or
        the "wrong" release actually has the expected DOI.

        Raises `ApiException` for any API error other than a 404.
        """
        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        file_ident = uuid2fcid(row["file_ident"])
        wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
        edit_extra = row["edit_extra"]
        assert edit_extra["link_source"] in ["unpaywall", "doi"]

        file_edit_doi = clean_doi(edit_extra["link_source_id"])
        if not file_edit_doi:
            self.counts["skip-bad-doi"] += 1
            # None (not False), consistent with the other skip paths
            return None

        # check that the "wrong" release exists and doesn't have the DOI
        wrong_release = None
        try:
            wrong_release = self.api.get_release(wrong_release_ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if not wrong_release:
            self.counts["skip-wrong-release-missing"] += 1
            return None

        if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
            self.counts["skip-wrong-release-is-ok"] += 1
            return None

        # fetch the "correct" release, if any
        fixed_release_ids = []
        correct_release = None
        try:
            correct_release = self.api.lookup_release(doi=file_edit_doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if correct_release:
            fixed_release_ids.append(correct_release.ident)

        fe = FileEntity(
            ident=file_ident,
            release_ids=fixed_release_ids,
            edit_extra=edit_extra,
        )
        # stash the ident to be removed; try_update() reads it back
        fe._wrong_release_ident = wrong_release_ident
        return fe

    def try_update(self, fe: FileEntity) -> bool:
        """
        Verify that the existing file entity still has the bad mapping and
        that its edit history matches the buggy import, then issue the update.

        The update call is skipped when `self.testing_mode` is set, but the
        "update" counter is incremented either way.

        Always returns False; the update is issued directly from here.
        NOTE(review): presumably False tells the base importer not to queue
        the entity for batch insertion -- confirm against EntityImporter.
        """
        wrong_release_ident = fe._wrong_release_ident
        assert len(wrong_release_ident) == 26

        # should always be existing... but in QA it might not be
        existing = None
        try:
            existing = self.api.get_file(fe.ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        # tolerate a missing release list instead of raising TypeError
        current_release_ids = existing.release_ids or []
        if wrong_release_ident not in current_release_ids:
            self.counts["skip-existing-fixed"] += 1
            return False

        # fetch existing history to verify mismatch
        history = self.api.get_file_history(existing.ident)
        if not history:
            # nothing to verify against; skip rather than crash on history[-1]
            self.counts["skip-existing-edit-history-empty"] += 1
            return False

        # any non-bot edit means a human touched the entity: leave it alone
        if any(entry.editgroup.editor.is_bot is not True for entry in history):
            self.counts["skip-existing-edit-history-human"] += 1
            return False

        # The last history entry is treated as the edit that introduced the
        # bad mapping.
        # NOTE(review): presumably the oldest entry (the original import) --
        # confirm against the ordering returned by the history endpoint.
        bad_entry = history[-1]
        bad_edit = bad_entry.edit
        if bad_edit.extra != fe.edit_extra:
            self.counts["skip-existing-edit-history-extra-mismatch"] += 1
            return False

        bad_editgroup = bad_entry.editgroup
        if not bad_editgroup.extra:
            self.counts["skip-existing-editgroup-missing-extra"] += 1
            return False

        # `or ""` guards against a git_rev key that is present but None
        git_rev = bad_editgroup.extra.get("git_rev") or ""
        if (
            bad_editgroup.editor_id != self._BAD_EDITOR_ID
            or bad_editgroup.extra.get("agent") != self._BAD_AGENT
            or not git_rev.startswith(self._BAD_GIT_REV_PREFIX)
            or bad_editgroup.created.year != self._BAD_YEAR
        ):
            self.counts["skip-existing-edit-history-mismatch"] += 1
            return False

        # drop the wrong release, then add the corrected one (if any, and if
        # not already present)
        existing.release_ids = [ri for ri in current_release_ids if ri != wrong_release_ident]
        if len(fe.release_ids) == 1:
            if fe.release_ids[0] not in existing.release_ids:
                existing.release_ids.append(fe.release_ids[0])

        existing.edit_extra = fe.edit_extra

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False
def test_file_release_bugfix() -> None:
    """
    Smoke test: run one example row through the importer against a local
    API instance, with `testing_mode` enabled on the importer.
    """
    importer = FileReleaseBugfix(api=public_api("http://localhost:9411/v0"))
    importer.testing_mode = True

    # a row without any "edit_extra" must be rejected by the pre-filter
    assert importer.want({"this": "asdf"}) is False

    edit_context: Dict[str, Any] = {
        "link_source": "unpaywall",
        "link_source_id": "10.1371/journal.pmed.0020124",
        "ingest_request_source": "unpaywall",
    }
    example_line: Dict[str, Any] = {
        "file_ident": "00000000-0000-0000-3333-000000000002",
        "wrong_release_ident": "00000000-0000-0000-4444-000000000002",
        "edit_extra": edit_context,
    }

    parsed = importer.parse_record(example_line)
    print(importer.counts)
    importer.try_update(parsed)

    # NOTE: this test is pretty incomplete
def main() -> None:
    """
    Command-line entry point.

    Parses arguments, builds an authenticated API client (the token is read
    from the environment variable named by `auth_var`, which defaults to
    FATCAT_AUTH_WORKER_CLEANUP), and pushes every JSON line of the input file
    through `FileReleaseBugfix`.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "json_file",
        # argparse only honors `default` on a positional when nargs is "?"
        # (or "*"); without it the argument is required and the stdin
        # default is dead. Passing "-" explicitly still selects stdin.
        nargs="?",
        help="File with jsonlines with cleanup context",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    frbc = FileReleaseBugfix(
        api,
        edit_batch_size=args.batch_size,
    )
    JsonLinePusher(frbc, args.json_file).run()
# Run the command-line interface only when this module is executed directly
# (e.g. with `python -m`), never as a side effect of being imported.
if __name__ == "__main__":
    main()
|