From 94ddfd7167994b4c0f7940317655d152aba302e6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 17:08:36 -0700 Subject: short wayback ts: initial cleanup script implementation --- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 251 +++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 python/fatcat_tools/cleanups/file_short_wayback_ts.py (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py new file mode 100644 index 00000000..2d893dbf --- /dev/null +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -0,0 +1,251 @@ +import argparse +import copy +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools.importers.common import EntityImporter, JsonLinePusher + + +class FileShortWaybackTimestampCleanup(EntityImporter): + """ + This is a one-off / one-time cleanup script for file entities, fix short + timestamps in wayback URLs. These timestamps are supposed to have 14 digits + (datetime with year, hour, seconds, etc). Some legacy file imports ended up + with only 4 or 12 digits. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json + """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = kwargs.pop("editgroup_description", None) or "Expand short wayback timestamps" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get( + "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" + ) + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs + ) + self.testing_mode = False + + def want(self, row: Dict[str, Any]) -> bool: + if row["status"].startswith("success"): + return True + else: + self.counts["skip-status"] += 1 + return False + + def parse_record(self, row: Dict[str, Any]) -> FileEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity) + status: str = row["status"] + assert status.startswith("success") + url_expansions: Dict[str, str] = row["full_urls"] + assert len(url_expansions) >= 1 + + # actual cleanup happens here + any_fixed = False + for fe_url in fe.urls: + if "://web.archive.org/web/" not in fe_url.url: + continue + seq = fe_url.url.split("/") + partial_ts = seq[4] + original_url = "/".join(seq[5:]) + if seq[2] != "web.archive.org": + continue + if len(partial_ts) not in [4, 12]: + continue + if fe_url.url in url_expansions: + fix_url = url_expansions[fe_url.url] + # defensive checks + assert partial_ts in fix_url + assert "://" in fix_url + assert fe_url.url.endswith(original_url) + assert fix_url.endswith(original_url) + fe_url.url = fix_url + any_fixed = True + + if not any_fixed: + self.counts["skip-no-fixes"] += 1 + return None + + # do any other generic file entity cleanups + # this includes removing duplicates + fe = self.generic_file_cleanups(fe) + + # verify that there are no exact duplicates + final_urls = [u.url for u in fe.urls] + assert len(final_urls) == len(list(set(final_urls))) + + return 
fe + + def try_update(self, fe: FileEntity) -> bool: + + # should always be existing + try: + existing = self.api.get_file(fe.ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if existing.sha1 != fe.sha1: + self.counts["skip-existing-mismatch"] += 1 + return False + + assert fe.revision and existing.revision + if existing.revision != fe.revision: + self.counts["skip-revision-changed"] += 1 + return False + + # verify that at least one URL remains + if not fe.urls or len(fe.urls) < 1: + self.counts["skip-no-urls"] += 1 + return False + + # verify that all wayback urls have 14-digit timestamps, and are generally well-formed + for u in fe.urls: + if "://web.archive.org/web/" not in u.url: + continue + if u.rel != "webarchive": + self.counts["skip-bad-wayback-rel"] += 1 + return False + seg = u.url.split("/") + if ( + len(seg) < 6 + or seg[0] != "https:" + or seg[2] != "web.archive.org" + or seg[3] != "web" + ): + self.counts["skip-bad-wayback"] += 1 + return False + if len(seg[4]) != 14 or not seg[4].isdigit(): + self.counts["skip-bad-wayback-timestamp"] += 1 + return False + + if existing == fe or existing.urls == fe.urls: + self.counts["skip-no-change"] += 1 + return False + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + self.counts["update"] += 1 + return False + + +def test_short_wayback_ts() -> None: + api = public_api("http://localhost:9411/v0") + fswtc = FileShortWaybackTimestampCleanup(api=api) + fswtc.testing_mode = True + + assert fswtc.want({"status": "fail"}) is False + assert fswtc.want({"status": "success-self"}) is True + + example_line: Dict[str, Any] = { + "status": "success-db", + "file_entity": { + # note: doesn't match actual entity + "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", + "rel": "web", + }, + { + "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", + "rel": "webarchive", + }, + ], + "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29", + "sha1": "be714299b9be21b5afdaa7affd7d710c58269433", + "md5": "9edb542be5b3446a1905e61a8a3abebd", + "size": 666242, + "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd", + "ident": "4ghpvs2t2rdtrdum2mkreh62me", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" + }, + } + example_fe = entity_from_dict(example_line["file_entity"], FileEntity) + + fe1 = copy.copy(example_fe) + fe1.urls[ + 1 + ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" + assert fswtc.parse_record(example_line) == fe1 + + # update code path; requires a known file ident and API running locally + assert fswtc.counts["update"] == 0 + dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai") + fe1.ident = dummy_fe.ident + + assert fswtc.try_update(fe1) is False + assert fswtc.counts["skip-existing-mismatch"] == 1 + + fe1.sha1 = dummy_fe.sha1 + 
assert fswtc.try_update(fe1) is False + assert fswtc.counts["skip-revision-changed"] == 1 + + fe1.revision = dummy_fe.revision + assert fswtc.try_update(fe1) is False + print(fswtc.counts) + assert fswtc.counts["update"] == 1 + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.set_defaults( + auth_var="FATCAT_API_AUTH_TOKEN", + ) + parser.add_argument( + "json_file", + help="File with jsonlines from file_meta schema to import from", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + fswtc = FileShortWaybackTimestampCleanup( + api, + edit_batch_size=args.batch_size, + ) + JsonLinePusher(fswtc, args.json_file).run() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 1ad08edf2d3d06196119ec1eddc932e6423e3e7c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 17:56:16 -0700 Subject: wayback short ts: add regression test for dupe URLs --- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index 2d893dbf..ab1b2a5f 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -216,6 +216,50 @@ def test_short_wayback_ts() -> None: print(fswtc.counts) assert fswtc.counts["update"] == 1 + # another example, which failed with an assertion in prod due to duplicated URLs + example_line2: Dict[str, Any] = { + "file_entity": { + "release_ids": ["22jt7euq4fafhblzullmnesso4"], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "repository", + }, + { + "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "webarchive", + }, + { + "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", + "rel": "webarchive", + }, + ], + "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19", + "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433", + "md5": "3d509743359649e34a27ae70c5cd3018", + "size": 430665, + "extra": { + "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"} + }, + "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48", + "ident": "duymhmxk3fgtzk37yp2pvthtxq", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" + }, + "status": "success-self", + } + + fe2 = fswtc.parse_record(example_line2) + assert len(fe2.urls) == 2 + assert fe2.urls[0].rel == "repository" + assert ( + fe2.urls[1].url + == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" + ) + def main() -> None: parser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -- cgit v1.2.3 From 766bc9c6ebe22a29c39c91941f12eed29b85a41d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 18:01:49 -0700 Subject: wayback cleanup: actually update entity --- python/fatcat_tools/cleanups/file_short_wayback_ts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index ab1b2a5f..d8b8ab9d 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -27,7 +27,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): def __init__(self, api: ApiClient, **kwargs): - eg_desc = kwargs.pop("editgroup_description", None) or "Expand short wayback timestamps" + eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs" eg_extra = kwargs.pop("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get( "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" @@ -150,7 +150,9 @@ class FileShortWaybackTimestampCleanup(EntityImporter): # these corrections (entity dump) contains no dupes if not self.testing_mode: - self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + # note: passing 'fe' instead of 'existing' here, which is not + # usually how it goes! + self.api.update_file(self.get_editgroup_id(), fe.ident, fe) self.counts["update"] += 1 return False -- cgit v1.2.3 From 2712b56580c7d5e2e07fdccd990e1918c43880f9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 5 Nov 2021 11:03:06 -0700 Subject: wayback short ts: another regression test, and some small fmt/tweaks --- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 41 ++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index d8b8ab9d..56a5c80e 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -27,7 +27,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter): def __init__(self, api: ApiClient, **kwargs): - eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs" + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Expand trunacted timestamps in wayback URLs" + ) eg_extra = kwargs.pop("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get( "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" @@ -37,7 +40,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): do_updates=True, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs + **kwargs, ) self.testing_mode = False @@ -74,7 +77,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): if fe_url.url in url_expansions: fix_url = url_expansions[fe_url.url] # defensive checks - assert partial_ts in fix_url + assert f"/web/{partial_ts}" in fix_url assert "://" in fix_url assert fe_url.url.endswith(original_url) assert fix_url.endswith(original_url) @@ -262,6 +265,38 @@ def test_short_wayback_ts() -> None: == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" ) + # ensure URL order is stable + example_line3: Dict[str, Any] = { + "file_entity": { + "release_ids": 
["5rin7f2cdvc5hjkqqw53z7sr3i"], + "mimetype": "application/pdf", + "urls": [ + {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"}, + { + "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf", + "rel": "webarchive", + }, + ], + "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0", + "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3", + "md5": "89b6e6cc4e0259317e26ddf1a9a336a0", + "size": 41265, + "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65", + "ident": "lvnz23nzijaapf5iti45zez6zu", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf" + }, + "status": "success-db", + } + + fe3 = fswtc.parse_record(example_line3) + assert len(fe3.urls) == 2 + assert fe3.urls[0].rel == "web" + assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf" + assert fe3.urls[1].rel == "webarchive" + def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -- cgit v1.2.3 From ad050445ac4f3e218ec101790bbf187731646361 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 5 Nov 2021 16:31:36 -0700 Subject: cleanups: initial lowercase DOI cleanup script --- .../fatcat_tools/cleanups/release_lowercase_doi.py | 145 +++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 python/fatcat_tools/cleanups/release_lowercase_doi.py (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py new file mode 100644 index 00000000..812262d4 --- /dev/null +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -0,0 +1,145 @@ +import argparse +import copy +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds + +from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools.importers.common import EntityImporter, LinePusher + + +class ReleaseLowercaseDoiCleanup(EntityImporter): + """ + This is a one-off / one-time cleanup script for release entities, to fix + upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv + + It expects to get a simple text line on stdin, which is a release entity. + The correction is implemented by fetching the current version of the + entity, verifying the issue, and updating if it is still a problem. + + This does not try to do any merging, just corrects the case in a single + update. 
+ """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Normalize release DOIs (extid) to lower-case" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup") + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) + self.testing_mode = False + + def want(self, row: str) -> bool: + row = row.strip().split()[0] + if len(row) == 26: + return True + else: + return False + + def parse_record(self, row: str) -> ReleaseEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + ident = row.strip().split()[0] + assert len(ident) == 26 + + return ReleaseEntity( + ident=ident, + ext_ids=ReleaseExtIds(), + ) + + def try_update(self, re: ReleaseEntity) -> bool: + + # should always be existing + existing = self.api.get_release(re.ident) + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if not existing.ext_ids.doi: + self.counts["skip-existing-no-doi"] += 1 + return False + + if existing.ext_ids.doi == existing.ext_ids.doi.lower(): + self.counts["skip-existing-doi-fine"] += 1 + return False + + existing.ext_ids.doi = existing.ext_ids.doi.lower() + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + self.api.update_release(self.get_editgroup_id(), re.ident, re) + self.counts["update"] += 1 + return False + + +def test_lowercase_doi() -> None: + api = public_api("http://localhost:9411/v0") + rldc = ReleaseLowercaseDoiCleanup(api=api) + rldc.testing_mode = True + + assert rldc.want("") is False + assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True + assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True + rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai") + + dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai") + assert rldc.try_update(dummy_re) is False + assert rldc.counts["skip-existing-doi-fine"] == 1 + # this isn't a very complete test, doesn't get to update part + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.set_defaults( + auth_var="FATCAT_AUTH_WORKER_CLEANUP", + ) + parser.add_argument( + "idents_file", + help="File with release identifier to try updating", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + rldc = ReleaseLowercaseDoiCleanup( + api, + edit_batch_size=args.batch_size, + ) + LinePusher(rldc, args.idents_file).run() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 14:17:31 -0800 Subject: cleanups: tweaks to wayback CDX cleanup scripts --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 9 ++++++++- python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git 
a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6c6817ab..6f67c7e1 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -157,7 +157,14 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + cdx_record = None + try: + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") + else: + raise if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index 56a5c80e..a9b19921 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; instead it has a __main__ function and is invoked like: - python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json + python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json """ def __init__(self, api: ApiClient, **kwargs): @@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter): if fe_url.url in url_expansions: fix_url = url_expansions[fe_url.url] # defensive checks - assert f"/web/{partial_ts}" in fix_url + if not ( + f"/web/{partial_ts}" in fix_url + and fe_url.url.endswith(original_url) + and fix_url.endswith(original_url) + ): + print( + f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", + file=sys.stderr, + ) + self.counts["skip-bad-replacement"] += 1 + return None assert "://" in fix_url - assert fe_url.url.endswith(original_url) - assert fix_url.endswith(original_url) fe_url.url = fix_url any_fixed = True @@ -305,7 +313,7 @@ def main() -> None: ) parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) parser.set_defaults( - auth_var="FATCAT_API_AUTH_TOKEN", + auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) parser.add_argument( "json_file", -- cgit v1.2.3 From 996b2e2084c1798126bd91dd950c063982398bec Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 15:46:20 -0800 Subject: more iteration on short wayback timestamp cleanup --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 2 +- notes/cleanups/wayback_timestamps.md | 129 ++++++++++++++++++++- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 2 +- 3 files changed, 129 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6f67c7e1..d5b0c476 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -137,7 +137,7 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - if original_url in self_urls: + if original_url in self_urls and ts in self_urls[original_url]: full_urls[short] = self_urls[original_url] status = "success-self" continue diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 
81785992..85e5f94f 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -61,9 +61,10 @@ Yes, 4-digit is a popular pattern as well, need to handle those: # 111M 0:13:22 [ 139k/s] zcat files_20211007_moreshortts.json.gz | wc -l + # 9,958,854 zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json - # 9,958,854 + ## Fetch Complete URL @@ -114,6 +115,14 @@ Again with the broader set: 6175 "success-db" 3035 "success-self" +While running a larger batch, got a CDX API error: + + requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20 + + org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error + +So maybe need to use credentials after all. + ## Cleanup Process @@ -128,11 +137,13 @@ It looks like the rel swap is already implemented in `generic_file_cleanups()`. From sampling it seems like the mimetype issue is pretty small, so not going to bite that off now. The "bogus file" issue requires thought, so also skipping. -## Commands + +## Commands (old) Running with 8x parallelism to not break things; expecting some errors along the way, may need to add handlers for connection errors etc: + # OLD SNAPSHOT zcat files_20211007_moreshortts.json.gz \ | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ | pv -l \ @@ -140,3 +151,117 @@ the way, may need to add handlers for connection errors etc: > files_20211007_moreshortts.fetched.json.gz At 300 records/sec, this should take around 9-10 hours to process. + + + +## Prep Again (2021-11-09) + +After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot). + +Filter again: + + # note: in the future use pigz instead of gzip here + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211105_moreshortts.json.gz + # 112M 0:13:27 [ 138k/s] + + zcat files_20211105_moreshortts.json.gz | wc -l + # 9,958,854 + # good, exact same number as previous snapshot + + zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json + # done + + cat files_20211105_moreshortts.10k_sample.json \ + | ./fetch_full_cdx_ts.py \ + | pv -l \ + > files_20211105_moreshortts.10k_sample.fetched.json + # 10.0k 0:03:36 [46.3 /s] + + cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c + 13 "fail-not-found" + 774 "success-api" + 6193 "success-db" + 3020 "success-self" + +After tweaking `success-self` logic: + + 13 "fail-not-found" + 859 "success-api" + 6229 "success-db" + 2899 "success-self" + + +## Testing in QA + +Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json` + + # OLD ATTEMPT + export FATCAT_API_AUTH_TOKEN=[...] + head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + +Ran in to issues, iterated above. + +Trying again with updated script and sample file: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] 
+ + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + +Manually inspected and these look good. Trying some repeats and larger batched: + + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + + [...] + bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf + bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf + bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf + [...] + + # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0}) + + +It looks like these "bad replacement URLs" are due to timestamp mismatches. Eg, the partial timestamp is not part of the final timestamp. + +Tweaked fetch script and re-ran: + + # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0}) + +Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`. + +Run the rest through: + + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0}) + +Should tweak batch size to 100 (vs. 50). 
+ +How to parallelize import: + + # from within pipenv + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts - + + +## Full Batch Commands + +Running in bulk again: + + zcat files_20211105_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211105_moreshortts.fetched.json.gz + diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index a9b19921..e2595912 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -311,7 +311,7 @@ def main() -> None: parser.add_argument( "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" ) - parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int) parser.set_defaults( auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) -- cgit v1.2.3 From 78eb2cc0f3f81c35474eb68231fe5f60d47bbcde Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 16:04:31 -0800 Subject: lowercase DOI lint and check entity status --- python/fatcat_tools/cleanups/release_lowercase_doi.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py index 812262d4..4e5a6f43 100644 --- a/python/fatcat_tools/cleanups/release_lowercase_doi.py +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -1,13 +1,10 @@ import argparse -import copy import os import sys -from typing import Any, Dict -import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds -from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools import authenticated_api, public_api from fatcat_tools.importers.common import EntityImporter, LinePusher @@ -76,6 +73,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): self.counts["skip-existing-not-found"] += 1 return False + if existing.status != "active": + self.counts["skip-existing-entity-status"] += 1 + return False + if not existing.ext_ids.doi: self.counts["skip-existing-no-doi"] += 1 return False -- cgit v1.2.3 From d9284d421618742f5ecd76ba2c6d92dcefa5e5db Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 16:56:28 -0800 Subject: updates to lowercase DOI cleanup --- notes/cleanups/case_sensitive_dois.md | 71 ++++++++++++++++++++++ .../fatcat_tools/cleanups/release_lowercase_doi.py | 22 ++++--- 2 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 notes/cleanups/case_sensitive_dois.md (limited to 'python/fatcat_tools/cleanups') diff --git a/notes/cleanups/case_sensitive_dois.md b/notes/cleanups/case_sensitive_dois.md new file mode 100644 index 00000000..1bf1901e --- /dev/null +++ b/notes/cleanups/case_sensitive_dois.md @@ -0,0 +1,71 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/83 + +How many existing fatcat releases have a non-lowercase DOI? 
As of June 2021: + + zcat release_extid.tsv.gz | cut -f3 | rg '[A-Z]' | pv -l | wc -l + 139964 + +## Prep + + wget https://archive.org/download/fatcat_bulk_exports_2021-11-05/release_extid.tsv.gz + + # scratch:bin/fcid.py is roughly the same as `fatcat_util.py uuid2fcid` + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '[A-Z]' \ + | /fast/scratch/bin/fcid.py \ + | pv -l \ + > nonlowercase_doi_releases.tsv + # 140k 0:03:54 [ 599 /s] + + wc -l nonlowercase_doi_releases.tsv + 140530 nonlowercase_doi_releases.tsv + +Uhoh, there are ~500 more than previously? Guess those are from after the fix? + +Create a sample for testing: + + shuf -n10000 nonlowercase_doi_releases.tsv \ + > nonlowercase_doi_releases.10k_sample.tsv + +## Test in QA + +In pipenv: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + + head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # no such release_ident found: dcjsybvqanffhmu4dhzdnptave + +Presumably because this is being run in QA, and there are some newer prod releases in the snapshot. + +Did a quick update, and then: + + head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 2000, 'skip-existing-doi-fine': 1100, 'update': 898, 'skip-existing-not-found': 2, 'skip': 0, 'insert': 0, 'exists': 0}) + +Did some spot checking in QA. Out of 20 DOIs checked, 15 were valid, 5 were not +valid (doi.org 404). It seems like roughly 1/3 have a dupe DOI (the lower-case +DOI exists); didn't count exact numbers. + +This cleanup is simple and looks good to go. Batch size of 50 is good for full +releases. + +Example of parallelization: + + cat /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi - + +Ready to go! 
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py index 4e5a6f43..5e3275db 100644 --- a/python/fatcat_tools/cleanups/release_lowercase_doi.py +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -2,7 +2,7 @@ import argparse import os import sys -from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds +from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds from fatcat_tools import authenticated_api, public_api from fatcat_tools.importers.common import EntityImporter, LinePusher @@ -45,7 +45,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): self.testing_mode = False def want(self, row: str) -> bool: - row = row.strip().split()[0] + row = row.strip() + if not row: + return False + row = row.split()[0] if len(row) == 26: return True else: @@ -66,15 +69,20 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): def try_update(self, re: ReleaseEntity) -> bool: - # should always be existing - existing = self.api.get_release(re.ident) + # should always be existing, but sometimes not because of prod/QA flip + existing = None + try: + existing = self.api.get_release(re.ident) + except ApiException as err: + if err.status != 404: + raise err if not existing: self.counts["skip-existing-not-found"] += 1 return False - if existing.status != "active": - self.counts["skip-existing-entity-status"] += 1 + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 return False if not existing.ext_ids.doi: @@ -91,7 +99,7 @@ class ReleaseLowercaseDoiCleanup(EntityImporter): # these corrections (entity dump) contains no dupes if not self.testing_mode: - self.api.update_release(self.get_editgroup_id(), re.ident, re) + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) self.counts["update"] += 1 return False -- cgit v1.2.3 From fd71860047f84305a50772aa2e7494fd61cfff30 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:48:37 -0800 Subject: initial file/release bugfix cleanup worker and notes --- notes/cleanups/file_release_ingest_bug.md | 144 +++++++++++++ .../fatcat_tools/cleanups/file_release_bugfix.py | 231 +++++++++++++++++++++ 2 files changed, 375 insertions(+) create mode 100644 notes/cleanups/file_release_ingest_bug.md create mode 100644 python/fatcat_tools/cleanups/file_release_bugfix.py (limited to 'python/fatcat_tools/cleanups') diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md new file mode 100644 index 00000000..8690157a --- /dev/null +++ b/notes/cleanups/file_release_ingest_bug.md @@ -0,0 +1,144 @@ + +We want to find cases where the `file_edit` metadata for a file entity does not match the DOI of the release it is linked to. + +## Background + + +Seems to mostly happen when the `link_source_id` DOI is not found during a +fatcat 'lookup', eg due to upstream DOI metadata not including a 'title' field +(required by fatcat schema). Many of these seem to be 'journal-issue' or +something like that. Presumably upstream (unpaywall) crawled and found some PDF +for the DOI and indexes that. + +TODO: what was the specific code bug that caused this? time window of edits? +https://github.com/internetarchive/fatcat/commit/d8cf02dbdc6848be4de2710e54f4463d702b6e7c + +TODO: how many empty-title Crossref DOIs are there? perhaps we should import these after all, in some situations? eg, pull `html_biblio`? +TODO: examples of other mismatches? 
edits from the same timespan with mismatching identifiers + +## Queries + + file_edit + ident_id + rev_id + extra_json->>'link_source_id'::text + file_rev_release + file_rev + target_release_ident_id + release_ident + id + rev_id + release_rev + id + doi + +Query some samples: + + SELECT file_edit.ident_id as file_ident, release_ident.id as release_ident, file_edit.extra_json->>'link_source_id' as file_edit_doi, release_rev.doi as release_doi + FROM file_edit + LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev + LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id + LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id + WHERE + file_edit.extra_json->>'ingest_request_source' = 'unpaywall' + AND release_rev.doi != file_edit.extra_json->>'link_source_id' + LIMIT 20; + + file_ident | release_ident | file_edit_doi | release_doi + --------------------------------------+--------------------------------------+---------------------------------------+------------------------------------- + 8fe6eb2a-da3a-4e1c-b281-85fad3cae601 | 0a0cbabd-3f04-4296-94d5-b61c27fddee4 | 10.7167/2013/520930/dataset/3 | 10.7167/2013/520930 + cbb2d4e7-a9a7-41df-be1d-faf9dd552ee1 | d7b356c7-de6b-4885-b37d-ea603849a30a | 10.1525/elementa.124.t1 | 10.1525/elementa.124 + 26242648-2a7b-42ea-b2ca-e4fba8f0a2c7 | d2647e07-fbb3-47f5-b06a-de2872bf84ec | 10.1023/a:1000301702170 | 10.1007/bfb0085374 + f774854a-3939-4aa9-bf73-7e0573908b88 | e19ce792-422d-4feb-92ba-dd8536a60618 | 10.32835/2223-5752.2018.17.14-21 | 10.31652/2412-1142-2018-50-151-156 + 615ddf8c-2d91-4c1c-b04a-76b69f07b300 | e7af8377-9542-4711-9444-872f45521dc1 | 10.5644/sjm.10.1.00 | 10.5644/sjm.10.1.01 + 43fa1b53-eddb-4f31-96af-bc9bd3bb31b6 | e1ec5d8a-ff88-49f4-8540-73ee4906e119 | 10.1136/bjo.81.11.934 | 10.1038/sj.bjc.6690790 + 64b04cbc-ab3d-4cef-aff5-9d6b2532c47a | 43c387d1-e4c9-49f3-9995-552451a9a246 | 10.1023/a:1002015330987 | 10.1006/jnth.1998.2271 + 31e6e1a8-8457-4c93-82d3-12a51c0dc1bb | a2ec4305-63d2-4164-8470-02bf5c4ba74c | 10.1186/1742-4690-2-s1-s84 | 10.1038/nm0703-847 + 3461024a-1800-44a0-a7be-73f14bac99dd | 6849d39f-6bae-460b-8a9d-d6b86fbac2d3 | 10.17265/2328-2150/2014.11 | 10.1590/s0074-02762002000900009 + c2707e43-6a4b-4f5d-8fb4-babe16b42b4d | 82127faf-d125-4cd4-88b9-9a8618392019 | 10.1055/s-005-28993 | 10.1055/s-0034-1380711 + 0fe72304-1884-4dde-98c6-7cf3aec5bf31 | e3fe46cd-0205-42e4-8255-490e0eba38ea | 10.1787/5k4c1s0z2gs1-en | 10.1787/9789282105931-2-en + 686a1508-b6c5-4060-9035-1dd8a4b85be1 | dcecd03d-e0f6-40ce-a376-7e64bd90bdca | 10.15406/jlprr.2.3 | 10.15406/jlprr.2015.02.00039 + 9403bc0e-2a50-4c8a-ab79-943bce20e583 | 3073dee2-c8f7-42ed-a727-12306d86fa35 | 10.1787/5jrtgzfl6g9w-en | 10.1111/1468-2443.00004 + 9a080a6f-aaaf-4f4b-acab-3eda2bfacc22 | af28a167-2ddc-43db-98ca-1cc29b863f9f | 10.13040/ijpsr.0975-8232.4(6).2094-05 | 10.1002/chin.201418290 + 810f1a00-623f-4a5c-88e2-7240232ae8f9 | 639d4ddd-ef24-49dc-8d4e-ea626b0b85f8 | 10.13040/ijpsr.0975-8232.5(6).2216-24 | 10.1006/phrs.1997.0184 + 10.13040/IJPSR.0975-8232.5(6).2216-24 + 72752c59-9532-467d-8b4f-fe4d994bcff5 | cc3db36c-27e9-45bf-96b0-56d1119b93d6 | 10.22201/facmed.2007865xe.2018.26 | 10.22201/facmed.2007865x.2018.26.01 + a0740fc2-a1db-4bc8-ac98-177ccebde24f | 0a7136ac-86ad-4636-9c2a-b56a6d5a3a27 | 10.24966/cmph-1978/vol6iss1 | 10.1093/milmed/146.4.283 + 5b05df6a-7a0e-411b-a4e1-0081d7522673 | dc0cb585-c709-4990-a82e-c7c9b254fc74 | 10.15406/jig.3.1 | 10.15406/jig.2016.03.00039 + 011964dd-24a6-4d25-b9dd-921077f1947e | 
deabab6d-7381-4567-bf56-5294ab081e20 | 10.17265/1537-1506/2013.01 | 10.1109/icsssm.2016.7538527 + 4d7efa59-30c7-4015-bea9-d7df171a97ed | a05449a0-34d7-4f45-9b92-bcf71db4918d | 10.14413/herj.2017.01.09 | 10.14413/herj.2017.01.09. + (20 rows) + +Total counts, broken DOIs, by source: + + SELECT file_edit.extra_json->>'ingest_request_source' as source, COUNT(*) as broken_files + FROM file_edit + LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev + LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id + LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id + WHERE + file_edit.extra_json->>'link_source_id' IS NOT NULL + AND file_edit.extra_json->>'link_source_id' LIKE '10.%' + AND release_rev.doi != file_edit.extra_json->>'link_source_id' + GROUP BY file_edit.extra_json->>'ingest_request_source'; + + + source | broken_files + ------------------+-------------- + fatcat-changelog | 872 + unpaywall | 227954 + (2 rows) + +## Cleanup Pseudocode + +Dump file idents from above SQL queries. Transform from UUID to fatcat ident. + +For each file ident, fetch file entity, with releases expanded. Also fetch edit +history. Filter by: + +- single release for file +- single edit in history, matching broken agent +- edit metadata indicates a DOI, which doesn't match release DOI (or release doesn't have DOI?) + +If all that is the case, do a lookup by the correct DOI. If there is a match, +update the file entity to point to that release (and only that release). +Operate in batches. + +Write cleanup bot, run in QA to test. Run SQL query again and iterate if some +files were not updated. Merge to master, run full dump and update in prod. + +## Export Matches Needing Fix + + COPY ( + SELECT row_to_json(row.*) FROM ( + SELECT file_edit.ident_id as file_ident, release_ident.id as wrong_release_ident, file_edit.extra_json as edit_extra + FROM file_edit + LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev + LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id + LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id + WHERE + file_edit.extra_json->>'link_source_id' IS NOT NULL + AND file_edit.extra_json->>'link_source_id' LIKE '10.%' + AND release_rev.doi != file_edit.extra_json->>'link_source_id' + -- LIMIT 20; + ) as row + ) TO '/srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json'; + # COPY 228,827 + +Then need to do a `sed` pass, and a `rg` pass, to filter out bad escapes: + + cat /srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json \ + | sed 's/\\"/\"/g' \ + | rg -v "\\\\" \ + | rg '^\{' \ + | pv -l \ + > /srv/fatcat/snapshots/file_release_bugfix_20211105.json + # 228k 0:00:00 [ 667k/s] + + wc -l /srv/fatcat/snapshots/file_release_bugfix_20211105.json + # 228,826 + +And create a sample file: + + shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json + diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py new file mode 100644 index 00000000..025c1370 --- /dev/null +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ -0,0 +1,231 @@ +import argparse +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, public_api, uuid2fcid +from fatcat_tools.importers.common import 
EntityImporter, JsonLinePusher +from fatcat_tools.normal import clean_doi + + +class FileReleaseBugfix(EntityImporter): + """ + This is a one-off / one-time cleanup script for file entities which got + imported with incorrect release ident mappings, due to a bug in the file + ingest importer. + + While this calls itself a cleanup, it is based on the import code path. It + is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; + instead it has a __main__ function and is invoked like: + + python -m fatcat_tools.cleans.file_release_bugfix - < blah.json + """ + + def __init__(self, api: ApiClient, **kwargs): + + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Correct bad file/release import mappings" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix") + super().__init__( + api, + do_updates=True, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) + self.testing_mode = False + + def want(self, row: Dict[str, Any]) -> bool: + if not ( + row.get("edit_extra") + and row["edit_extra"].get("link_source") + and row["edit_extra"].get("link_source_id") + ): + self.counts["skip-partial"] += 1 + return False + if row["edit_extra"]["link_source"] not in ["unpaywall", "fatcat-changelog"]: + self.counts["skip-link-source"] += 1 + return False + if not row["edit_extra"]["link_source_id"].startswith("10."): + self.counts["skip-source-id-not-doi"] += 1 + return False + return True + + def parse_record(self, row: Dict[str, Any]) -> FileEntity: + + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode is False + + file_ident = uuid2fcid(row["file_ident"]) + wrong_release_ident = uuid2fcid(row["wrong_release_ident"]) + edit_extra = row["edit_extra"] + assert edit_extra["link_source"] in ["unpaywall", "fatcat-changelog"] + file_edit_doi = clean_doi(edit_extra["link_source_id"]) + + if not file_edit_doi: + self.counts["skip-bad-doi"] += 1 + return False + + # check that the "wrong" release exists and doesn't have the DOI + wrong_release = None + try: + wrong_release = self.api.get_release(wrong_release_ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not wrong_release: + self.counts["skip-wrong-release-missing"] += 1 + return None + + if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi: + self.counts["skip-wrong-release-is-ok"] += 1 + return None + + # fetch the "correct" release, if any + fixed_release_ids = [] + correct_release = None + try: + correct_release = self.api.lookup_release(doi=file_edit_doi) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if correct_release: + fixed_release_ids.append(correct_release.ident) + + fe = FileEntity( + ident=file_ident, + release_ids=fixed_release_ids, + edit_extra=edit_extra, + ) + fe._wrong_release_ident = wrong_release_ident + return fe + + def try_update(self, fe: FileEntity) -> bool: + + wrong_release_ident = fe._wrong_release_ident + assert len(wrong_release_ident) == 26 + + # should always be existing... 
but in QA it might not be + existing = None + try: + existing = self.api.get_file(fe.ident) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts["skip-existing-not-found"] += 1 + return False + + if wrong_release_ident not in existing.release_ids: + self.counts["skip-existing-fixed"] += 1 + return False + + # fetch existing history to verify mismatch + history = self.api.get_file_history(existing.ident) + + if len(history) != 1: + self.counts["skip-existing-history-updated"] += 1 + return False + + bad_edit = history[0].edit + if bad_edit.extra != fe.edit_extra: + self.counts["skip-existing-edit-history-extra-mismatch"] += 1 + return False + + bad_editgroup = history[0].editgroup + + if not bad_editgroup.extra: + self.counts["skip-existing-editgroup-missing-extra"] += 1 + return False + + if ( + bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa" + or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter" + or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3") + or bad_editgroup.created.year != 2020 + ): + self.counts["skip-existing-edit-history-mismatch"] += 1 + return False + + existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident] + + if len(fe.release_ids) == 1: + if fe.release_ids[0] not in existing.release_ids: + existing.release_ids.append(fe.release_ids[0]) + + existing.edit_extra = fe.edit_extra + + # not doing a check for "in current editgroup", because the source of + # these corrections (entity dump) contains no dupes + + if not self.testing_mode: + self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + self.counts["update"] += 1 + return False + + +def test_file_release_bugfix() -> None: + api = public_api("http://localhost:9411/v0") + frbc = FileReleaseBugfix(api=api) + frbc.testing_mode = True + + assert frbc.want({"this": "asdf"}) is False + + example_line: Dict[str, Any] = { + "file_ident": "00000000-0000-0000-3333-000000000002", + "wrong_release_ident": "00000000-0000-0000-4444-000000000002", + "edit_extra": { + "link_source": "unpaywall", + "link_source_id": "10.1371/journal.pmed.0020124", + "ingest_request_source": "unpaywall", + }, + } + + fe1 = frbc.parse_record(example_line) + print(frbc.counts) + frbc.try_update(fe1) + + # NOTE: this test is pretty incompleted + + +def main() -> None: + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.set_defaults( + auth_var="FATCAT_AUTH_WORKER_CLEANUP", + ) + parser.add_argument( + "json_file", + help="File with jsonlines with cleanup context", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + args = parser.parse_args() + api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + + frbc = FileReleaseBugfix( + api, + edit_batch_size=args.batch_size, + ) + JsonLinePusher(frbc, args.json_file).run() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 415cc3c78940247bcb48afcb62a612fec03261eb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 19:37:57 -0800 Subject: update link source filters in file/release bugfix --- python/fatcat_tools/cleanups/file_release_bugfix.py | 10 ++++++++-- 1 file changed, 8 
insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py index 025c1370..6eb60205 100644 --- a/python/fatcat_tools/cleanups/file_release_bugfix.py +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ -49,9 +49,15 @@ class FileReleaseBugfix(EntityImporter): ): self.counts["skip-partial"] += 1 return False - if row["edit_extra"]["link_source"] not in ["unpaywall", "fatcat-changelog"]: + if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]: self.counts["skip-link-source"] += 1 return False + if row["edit_extra"].get("ingest_request_source") not in [ + "unpaywall", + "fatcat-changelog", + ]: + self.counts["skip-ingest-request-source"] += 1 + return False if not row["edit_extra"]["link_source_id"].startswith("10."): self.counts["skip-source-id-not-doi"] += 1 return False @@ -65,7 +71,7 @@ class FileReleaseBugfix(EntityImporter): file_ident = uuid2fcid(row["file_ident"]) wrong_release_ident = uuid2fcid(row["wrong_release_ident"]) edit_extra = row["edit_extra"] - assert edit_extra["link_source"] in ["unpaywall", "fatcat-changelog"] + assert edit_extra["link_source"] in ["unpaywall", "doi"] file_edit_doi = clean_doi(edit_extra["link_source_id"]) if not file_edit_doi: -- cgit v1.2.3 From fd0cf6b55567e3081b9e57e2423d6d0e529ace41 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 19:46:55 -0800 Subject: cleanups: add more state=active checks --- python/fatcat_tools/cleanups/file_release_bugfix.py | 4 ++++ python/fatcat_tools/cleanups/file_short_wayback_ts.py | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py index 6eb60205..5ac69d1a 100644 --- a/python/fatcat_tools/cleanups/file_release_bugfix.py +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ -131,6 +131,10 @@ class FileReleaseBugfix(EntityImporter): self.counts["skip-existing-not-found"] += 1 return False + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 + return False + if wrong_release_ident not in existing.release_ids: self.counts["skip-existing-fixed"] += 1 return False diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index e2595912..bdd49f9b 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -119,6 +119,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter): self.counts["skip-existing-not-found"] += 1 return False + if existing.state != "active": + self.counts["skip-existing-entity-state"] += 1 + return False + if existing.sha1 != fe.sha1: self.counts["skip-existing-mismatch"] += 1 return False -- cgit v1.2.3 From 86056629e7778543dc31c962e7e3f273e1ec48e6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 20:31:21 -0800 Subject: file/release bugfix: handle files with multiple edits --- python/fatcat_tools/cleanups/file_release_bugfix.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'python/fatcat_tools/cleanups') diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py index 5ac69d1a..dc27f9b5 100644 --- a/python/fatcat_tools/cleanups/file_release_bugfix.py +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ 
-142,17 +142,17 @@ class FileReleaseBugfix(EntityImporter): # fetch existing history to verify mismatch history = self.api.get_file_history(existing.ident) - if len(history) != 1: - self.counts["skip-existing-history-updated"] += 1 - return False + for entry in history: + if entry.editgroup.editor.is_bot is not True: + self.counts["skip-existing-edit-history-human"] += 1 + return False - bad_edit = history[0].edit + bad_edit = history[-1].edit if bad_edit.extra != fe.edit_extra: self.counts["skip-existing-edit-history-extra-mismatch"] += 1 return False - bad_editgroup = history[0].editgroup - + bad_editgroup = history[-1].editgroup if not bad_editgroup.extra: self.counts["skip-existing-editgroup-missing-extra"] += 1 return False -- cgit v1.2.3
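
Editor's note: all three cleanup workers in these patches follow the same `EntityImporter` lifecycle, driven by `JsonLinePusher` (or `LinePusher` for the plain-text ident input of the lowercase-DOI cleanup): each input line is filtered by `want()`, turned into a corrected entity by `parse_record()`, and re-verified and applied by `try_update()`. The real driver lives in `fatcat_tools.importers.common` and also handles editgroup creation and edit batching; the sketch below is only a simplified illustration of that loop, under those assumptions, and the function name `run_cleanup_sketch` is hypothetical rather than part of the codebase.

    # Simplified, illustrative sketch of the want -> parse_record -> try_update
    # loop that JsonLinePusher/EntityImporter perform for these cleanup workers.
    # The function name and loop body are assumptions for illustration only; the
    # real implementation (fatcat_tools.importers.common) additionally manages
    # editgroups, edit batching, and richer counters.
    import json
    from typing import Any, Dict, TextIO


    def run_cleanup_sketch(worker: Any, json_file: TextIO) -> Dict[str, int]:
        counts: Dict[str, int] = {"total": 0, "skip": 0}
        for line in json_file:
            if not line.strip():
                continue
            row = json.loads(line)
            counts["total"] += 1
            # want(): cheap filter on the raw row, before any API calls
            if not worker.want(row):
                counts["skip"] += 1
                continue
            # parse_record(): build the corrected entity, or None to skip
            entity = worker.parse_record(row)
            if entity is None:
                counts["skip"] += 1
                continue
            # try_update(): re-fetch the live entity, re-verify the problem
            # still exists, and only then submit the update to an editgroup
            worker.try_update(entity)
        # each worker accumulates its own skip-*/update counters
        counts.update(worker.counts)
        return counts

This is why the notes above can re-run the same input file safely: on a second pass `try_update()` finds the entity already fixed (or its revision changed) and increments a `skip-*` counter instead of updating again.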