import argparse
import copy
import os
import sys
from typing import Any, Dict

import fatcat_openapi_client
from fatcat_openapi_client import DefaultApi, FileEntity

from fatcat_tools import authenticated_api, entity_from_dict, public_api
from fatcat_tools.importers.common import EntityImporter, JsonLinePusher


class FileShortWaybackTimestampCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for file entities, fixing
    short timestamps in wayback URLs. These timestamps are supposed to have 14
    digits (a full datetime: year, month, day, hour, minute, second). Some
    legacy file imports ended up with only 4 or 12 digits.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleanups.file_short_wayback_ts - < blah.json
    """

    def __init__(self, api: DefaultApi, **kwargs):
eg_desc = (
kwargs.pop("editgroup_description", None)
or "Expand trunacted timestamps in wayback URLs"
)
eg_extra = kwargs.pop("editgroup_extra", dict())
eg_extra["agent"] = eg_extra.get(
"agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
)
super().__init__(
api,
do_updates=True,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
**kwargs,
)
self.testing_mode = False
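
    # Each input row (one JSON object per line, derived from a file entity
    # dump) is expected to look roughly like the sketch below; this is based
    # on the test cases at the bottom of this file, and field values are
    # illustrative only:
    #
    #   {
    #     "status": "success-db",
    #     "file_entity": { ...full file entity as a dict... },
    #     "full_urls": {"<short-timestamp wayback URL>": "<full 14-digit wayback URL>"}
    #   }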
def want(self, row: Dict[str, Any]) -> bool:
if row["status"].startswith("success"):
return True
else:
self.counts["skip-status"] += 1
return False

    def parse_record(self, row: Dict[str, Any]) -> FileEntity:
# bezerk mode doesn't make sense for this importer
assert self.bezerk_mode is False
fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity)
status: str = row["status"]
assert status.startswith("success")
url_expansions: Dict[str, str] = row["full_urls"]
assert len(url_expansions) >= 1
# actual cleanup happens here
any_fixed = False
for fe_url in fe.urls:
if "://web.archive.org/web/" not in fe_url.url:
continue
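            # wayback URLs have the form:
            #   https://web.archive.org/web/<timestamp>/<original URL>
            # so after splitting on "/", seq[2] is the hostname, seq[4] is the
            # (possibly truncated) timestamp, and seq[5:] is the original URL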
seq = fe_url.url.split("/")
partial_ts = seq[4]
original_url = "/".join(seq[5:])
if seq[2] != "web.archive.org":
continue
if len(partial_ts) not in [4, 12]:
continue
if fe_url.url in url_expansions:
fix_url = url_expansions[fe_url.url]
# defensive checks
if not (
f"/web/{partial_ts}" in fix_url
and fe_url.url.endswith(original_url)
and fix_url.endswith(original_url)
):
print(
f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
file=sys.stderr,
)
self.counts["skip-bad-replacement"] += 1
return None
assert "://" in fix_url
fe_url.url = fix_url
any_fixed = True
if not any_fixed:
self.counts["skip-no-fixes"] += 1
return None
# do any other generic file entity cleanups
# this includes removing duplicates
fe = self.generic_file_cleanups(fe)
# verify that there are no exact duplicates
final_urls = [u.url for u in fe.urls]
assert len(final_urls) == len(list(set(final_urls)))
return fe

    def try_update(self, fe: FileEntity) -> bool:
        # the existing entity should always be found; default to None so a
        # 404 from the API is handled gracefully below
        existing = None
        try:
existing = self.api.get_file(fe.ident)
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
if not existing:
self.counts["skip-existing-not-found"] += 1
return False
if existing.state != "active":
self.counts["skip-existing-entity-state"] += 1
return False
if existing.sha1 != fe.sha1:
self.counts["skip-existing-mismatch"] += 1
return False
assert fe.revision and existing.revision
if existing.revision != fe.revision:
self.counts["skip-revision-changed"] += 1
return False
# verify that at least one URL remains
if not fe.urls or len(fe.urls) < 1:
self.counts["skip-no-urls"] += 1
return False
# verify that all wayback urls have 14-digit timestamps, and are generally well-formed
for u in fe.urls:
if "://web.archive.org/web/" not in u.url:
continue
if u.rel != "webarchive":
self.counts["skip-bad-wayback-rel"] += 1
return False
seg = u.url.split("/")
if (
len(seg) < 6
or seg[0] != "https:"
or seg[2] != "web.archive.org"
or seg[3] != "web"
):
self.counts["skip-bad-wayback"] += 1
return False
if len(seg[4]) != 14 or not seg[4].isdigit():
self.counts["skip-bad-wayback-timestamp"] += 1
return False
if existing == fe or existing.urls == fe.urls:
self.counts["skip-no-change"] += 1
return False
# not doing a check for "in current editgroup", because the source of
# these corrections (entity dump) contains no dupes
if not self.testing_mode:
# note: passing 'fe' instead of 'existing' here, which is not
# usually how it goes!
self.api.update_file(self.get_editgroup_id(), fe.ident, fe)
self.counts["update"] += 1
return False
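

# Note: this test talks to a fatcat API instance running locally on port 9411
# and relies on a known dummy file ident existing there (see the get_file()
# call below), so it exercises the code paths end-to-end rather than being a
# fully isolated unit test.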
def test_short_wayback_ts() -> None:
api = public_api("http://localhost:9411/v0")
fswtc = FileShortWaybackTimestampCleanup(api=api)
fswtc.testing_mode = True
assert fswtc.want({"status": "fail"}) is False
assert fswtc.want({"status": "success-self"}) is True
example_line: Dict[str, Any] = {
"status": "success-db",
"file_entity": {
# note: doesn't match actual entity
"release_ids": ["waldfsctnbcpdbmasgduhaaaaa"],
"mimetype": "application/pdf",
"urls": [
{
"url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
"rel": "web",
},
{
"url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
"rel": "webarchive",
},
],
"sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29",
"sha1": "be714299b9be21b5afdaa7affd7d710c58269433",
"md5": "9edb542be5b3446a1905e61a8a3abebd",
"size": 666242,
"revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd",
"ident": "4ghpvs2t2rdtrdum2mkreh62me",
"state": "active",
},
"full_urls": {
"https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
},
}
example_fe = entity_from_dict(example_line["file_entity"], FileEntity)
fe1 = copy.copy(example_fe)
fe1.urls[
1
].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
assert fswtc.parse_record(example_line) == fe1
# update code path; requires a known file ident and API running locally
assert fswtc.counts["update"] == 0
dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai")
fe1.ident = dummy_fe.ident
assert fswtc.try_update(fe1) is False
assert fswtc.counts["skip-existing-mismatch"] == 1
fe1.sha1 = dummy_fe.sha1
assert fswtc.try_update(fe1) is False
assert fswtc.counts["skip-revision-changed"] == 1
fe1.revision = dummy_fe.revision
assert fswtc.try_update(fe1) is False
print(fswtc.counts)
assert fswtc.counts["update"] == 1
# another example, which failed with an assertion in prod due to duplicated URLs
example_line2: Dict[str, Any] = {
"file_entity": {
"release_ids": ["22jt7euq4fafhblzullmnesso4"],
"mimetype": "application/pdf",
"urls": [
{
"url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
"rel": "repository",
},
{
"url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
"rel": "webarchive",
},
{
"url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
"rel": "webarchive",
},
],
"sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19",
"sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433",
"md5": "3d509743359649e34a27ae70c5cd3018",
"size": 430665,
"extra": {
"shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"}
},
"revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48",
"ident": "duymhmxk3fgtzk37yp2pvthtxq",
"state": "active",
},
"full_urls": {
"https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
},
"status": "success-self",
}
fe2 = fswtc.parse_record(example_line2)
assert len(fe2.urls) == 2
assert fe2.urls[0].rel == "repository"
assert (
fe2.urls[1].url
== "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
)
# ensure URL order is stable
example_line3: Dict[str, Any] = {
"file_entity": {
"release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"],
"mimetype": "application/pdf",
"urls": [
{"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"},
{
"url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf",
"rel": "webarchive",
},
],
"sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0",
"sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3",
"md5": "89b6e6cc4e0259317e26ddf1a9a336a0",
"size": 41265,
"revision": "926fcf73-e644-4446-a24b-4d0940a2cf65",
"ident": "lvnz23nzijaapf5iti45zez6zu",
"state": "active",
},
"full_urls": {
"https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf"
},
"status": "success-db",
}
fe3 = fswtc.parse_record(example_line3)
assert len(fe3.urls) == 2
assert fe3.urls[0].rel == "web"
assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf"
assert fe3.urls[1].rel == "webarchive"


def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
)
parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int)
parser.set_defaults(
auth_var="FATCAT_AUTH_WORKER_CLEANUP",
)
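    # the API auth token is read from the environment variable named by
    # auth_var (FATCAT_AUTH_WORKER_CLEANUP); see the authenticated_api() call
    # below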
parser.add_argument(
"json_file",
help="File with jsonlines from file_meta schema to import from",
default=sys.stdin,
type=argparse.FileType("r"),
)
args = parser.parse_args()
api = authenticated_api(
args.host_url,
# token is an optional kwarg (can be empty string, None, etc)
token=os.environ.get(args.auth_var),
)
fswtc = FileShortWaybackTimestampCleanup(
api,
edit_batch_size=args.batch_size,
)
JsonLinePusher(fswtc, args.json_file).run()


if __name__ == "__main__":
main()