path: root/python/fatcat_tools/cleanups/file_short_wayback_ts.py
import argparse
import copy
import os
import sys
from typing import Any, Dict, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity

from fatcat_tools import authenticated_api, entity_from_dict, public_api
from fatcat_tools.importers.common import EntityImporter, JsonLinePusher


class FileShortWaybackTimestampCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for file entities, fixing short
    timestamps in wayback URLs. These timestamps are supposed to have 14 digits
    (a full YYYYMMDDHHMMSS datetime). Some legacy file imports ended up with
    only 4 or 12 digits.
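
    For example (taken from the test cases later in this file), a wayback URL
    with a 12-digit timestamp like:

        https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971

    is expanded, using a lookup table supplied in the input JSON, to the full
    14-digit form:

        https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971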

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has its own `main()` entrypoint and is invoked like:

        python -m fatcat_tools.cleanups.file_short_wayback_ts - < blah.json
    """

    def __init__(self, api: ApiClient, **kwargs):

        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Expand truncated timestamps in wayback URLs"
        )
        eg_extra = kwargs.pop("editgroup_extra", dict())
        eg_extra["agent"] = eg_extra.get(
            "agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
        )
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        self.testing_mode = False

    def want(self, row: Dict[str, Any]) -> bool:
        if row["status"].startswith("success"):
            return True
        else:
            self.counts["skip-status"] += 1
            return False

    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

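        # each input row carries the original file entity, a status string from
        # the upstream URL-expansion process, and a map from short to full
        # wayback URLs ("file_entity", "status", and "full_urls", respectively)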
        fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity)
        status: str = row["status"]
        assert status.startswith("success")
        url_expansions: Dict[str, str] = row["full_urls"]
        assert len(url_expansions) >= 1

        # actual cleanup happens here
        any_fixed = False
        for fe_url in fe.urls:
            if "://web.archive.org/web/" not in fe_url.url:
                continue
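            # splitting "https://web.archive.org/web/<ts>/<original-url>" on "/" gives
            # ['https:', '', 'web.archive.org', 'web', '<ts>', <original-url pieces...>],
            # so seq[4] is the (possibly short) timestamp and seq[5:] is the original URL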
            seq = fe_url.url.split("/")
            partial_ts = seq[4]
            original_url = "/".join(seq[5:])
            if seq[2] != "web.archive.org":
                continue
            if len(partial_ts) not in [4, 12]:
                continue
            if fe_url.url in url_expansions:
                fix_url = url_expansions[fe_url.url]
                # defensive checks
                if not (
                    f"/web/{partial_ts}" in fix_url
                    and fe_url.url.endswith(original_url)
                    and fix_url.endswith(original_url)
                ):
                    print(
                        f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
                        file=sys.stderr,
                    )
                    self.counts["skip-bad-replacement"] += 1
                    return None
                assert "://" in fix_url
                fe_url.url = fix_url
                any_fixed = True

        if not any_fixed:
            self.counts["skip-no-fixes"] += 1
            return None

        # do any other generic file entity cleanups
        # this includes removing duplicates
        fe = self.generic_file_cleanups(fe)

        # verify that there are no exact duplicates
        final_urls = [u.url for u in fe.urls]
        assert len(final_urls) == len(list(set(final_urls)))

        return fe

    def try_update(self, fe: FileEntity) -> bool:

        # should always be existing; a 404 leaves `existing` as None
        existing = None
        try:
            existing = self.api.get_file(fe.ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.sha1 != fe.sha1:
            self.counts["skip-existing-mismatch"] += 1
            return False

        assert fe.revision and existing.revision
        if existing.revision != fe.revision:
            self.counts["skip-revision-changed"] += 1
            return False

        # verify that at least one URL remains
        if not fe.urls:
            self.counts["skip-no-urls"] += 1
            return False

        # verify that all wayback urls have 14-digit timestamps, and are generally well-formed
        for u in fe.urls:
            if "://web.archive.org/web/" not in u.url:
                continue
            if u.rel != "webarchive":
                self.counts["skip-bad-wayback-rel"] += 1
                return False
            seg = u.url.split("/")
            if (
                len(seg) < 6
                or seg[0] != "https:"
                or seg[2] != "web.archive.org"
                or seg[3] != "web"
            ):
                self.counts["skip-bad-wayback"] += 1
                return False
            if len(seg[4]) != 14 or not seg[4].isdigit():
                self.counts["skip-bad-wayback-timestamp"] += 1
                return False

        if existing == fe or existing.urls == fe.urls:
            self.counts["skip-no-change"] += 1
            return False

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            # note: passing 'fe' instead of 'existing' here, which is not
            # usually how it goes!
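            # the corrected URLs live on `fe` (parsed from the dump), and the
            # revision check above ensures `existing` has not changed since that dump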
            self.api.update_file(self.get_editgroup_id(), fe.ident, fe)
        self.counts["update"] += 1
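        # returning False signals the EntityImporter base class that no new
        # entity needs to be inserted; the update has already been submitted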
        return False


def test_short_wayback_ts() -> None:
    api = public_api("http://localhost:9411/v0")
    fswtc = FileShortWaybackTimestampCleanup(api=api)
    fswtc.testing_mode = True

    assert fswtc.want({"status": "fail"}) is False
    assert fswtc.want({"status": "success-self"}) is True

    example_line: Dict[str, Any] = {
        "status": "success-db",
        "file_entity": {
            # note: doesn't match actual entity
            "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"],
            "mimetype": "application/pdf",
            "urls": [
                {
                    "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
                    "rel": "web",
                },
                {
                    "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
                    "rel": "webarchive",
                },
            ],
            "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29",
            "sha1": "be714299b9be21b5afdaa7affd7d710c58269433",
            "md5": "9edb542be5b3446a1905e61a8a3abebd",
            "size": 666242,
            "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd",
            "ident": "4ghpvs2t2rdtrdum2mkreh62me",
            "state": "active",
        },
        "full_urls": {
            "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
        },
    }
    example_fe = entity_from_dict(example_line["file_entity"], FileEntity)

    fe1 = copy.copy(example_fe)
    fe1.urls[
        1
    ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
    assert fswtc.parse_record(example_line) == fe1

    # update code path; requires a known file ident and API running locally
    assert fswtc.counts["update"] == 0
    dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai")
    fe1.ident = dummy_fe.ident

    assert fswtc.try_update(fe1) is False
    assert fswtc.counts["skip-existing-mismatch"] == 1

    fe1.sha1 = dummy_fe.sha1
    assert fswtc.try_update(fe1) is False
    assert fswtc.counts["skip-revision-changed"] == 1

    fe1.revision = dummy_fe.revision
    assert fswtc.try_update(fe1) is False
    print(fswtc.counts)
    assert fswtc.counts["update"] == 1

    # another example, which failed with an assertion in prod due to duplicated URLs
    example_line2: Dict[str, Any] = {
        "file_entity": {
            "release_ids": ["22jt7euq4fafhblzullmnesso4"],
            "mimetype": "application/pdf",
            "urls": [
                {
                    "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
                    "rel": "repository",
                },
                {
                    "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
                    "rel": "webarchive",
                },
                {
                    "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
                    "rel": "webarchive",
                },
            ],
            "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19",
            "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433",
            "md5": "3d509743359649e34a27ae70c5cd3018",
            "size": 430665,
            "extra": {
                "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"}
            },
            "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48",
            "ident": "duymhmxk3fgtzk37yp2pvthtxq",
            "state": "active",
        },
        "full_urls": {
            "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
        },
        "status": "success-self",
    }

    fe2 = fswtc.parse_record(example_line2)
    assert len(fe2.urls) == 2
    assert fe2.urls[0].rel == "repository"
    assert (
        fe2.urls[1].url
        == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
    )

    # ensure URL order is stable
    example_line3: Dict[str, Any] = {
        "file_entity": {
            "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"],
            "mimetype": "application/pdf",
            "urls": [
                {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"},
                {
                    "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf",
                    "rel": "webarchive",
                },
            ],
            "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0",
            "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3",
            "md5": "89b6e6cc4e0259317e26ddf1a9a336a0",
            "size": 41265,
            "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65",
            "ident": "lvnz23nzijaapf5iti45zez6zu",
            "state": "active",
        },
        "full_urls": {
            "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf"
        },
        "status": "success-db",
    }

    fe3 = fswtc.parse_record(example_line3)
    assert len(fe3.urls) == 2
    assert fe3.urls[0].rel == "web"
    assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf"
    assert fe3.urls[1].rel == "webarchive"


def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int)
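    # set_defaults() just pins the auth-token environment variable name on the
    # parsed namespace; there is no corresponding command-line flag for it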
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "json_file",
        help="File with JSON lines of cleanup records (file_entity + full_urls) to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    fswtc = FileShortWaybackTimestampCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    JsonLinePusher(fswtc, args.json_file).run()


if __name__ == "__main__":
    main()