aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/cleanups/file_release_bugfix.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/cleanups/file_release_bugfix.py')
-rw-r--r--python/fatcat_tools/cleanups/file_release_bugfix.py241
1 files changed, 241 insertions, 0 deletions
diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py
new file mode 100644
index 00000000..dc27f9b5
--- /dev/null
+++ b/python/fatcat_tools/cleanups/file_release_bugfix.py
@@ -0,0 +1,241 @@
+import argparse
+import os
+import sys
+from typing import Any, Dict
+
+import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
+
+from fatcat_tools import authenticated_api, public_api, uuid2fcid
+from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
+from fatcat_tools.normal import clean_doi
+
+
+class FileReleaseBugfix(EntityImporter):
+ """
+ This is a one-off / one-time cleanup script for file entities which got
+ imported with incorrect release ident mappings, due to a bug in the file
+ ingest importer.
+
+ While this calls itself a cleanup, it is based on the import code path. It
+ is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+ instead it has a __main__ function and is invoked like:
+
+ python -m fatcat_tools.cleans.file_release_bugfix - < blah.json
+ """
+
+ def __init__(self, api: ApiClient, **kwargs):
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Correct bad file/release import mappings"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix")
+ super().__init__(
+ api,
+ do_updates=True,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
+ self.testing_mode = False
+
+ def want(self, row: Dict[str, Any]) -> bool:
+ if not (
+ row.get("edit_extra")
+ and row["edit_extra"].get("link_source")
+ and row["edit_extra"].get("link_source_id")
+ ):
+ self.counts["skip-partial"] += 1
+ return False
+ if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]:
+ self.counts["skip-link-source"] += 1
+ return False
+ if row["edit_extra"].get("ingest_request_source") not in [
+ "unpaywall",
+ "fatcat-changelog",
+ ]:
+ self.counts["skip-ingest-request-source"] += 1
+ return False
+ if not row["edit_extra"]["link_source_id"].startswith("10."):
+ self.counts["skip-source-id-not-doi"] += 1
+ return False
+ return True
+
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
+
+ # bezerk mode doesn't make sense for this importer
+ assert self.bezerk_mode is False
+
+ file_ident = uuid2fcid(row["file_ident"])
+ wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
+ edit_extra = row["edit_extra"]
+ assert edit_extra["link_source"] in ["unpaywall", "doi"]
+ file_edit_doi = clean_doi(edit_extra["link_source_id"])
+
+ if not file_edit_doi:
+ self.counts["skip-bad-doi"] += 1
+ return False
+
+ # check that the "wrong" release exists and doesn't have the DOI
+ wrong_release = None
+ try:
+ wrong_release = self.api.get_release(wrong_release_ident)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not wrong_release:
+ self.counts["skip-wrong-release-missing"] += 1
+ return None
+
+ if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
+ self.counts["skip-wrong-release-is-ok"] += 1
+ return None
+
+ # fetch the "correct" release, if any
+ fixed_release_ids = []
+ correct_release = None
+ try:
+ correct_release = self.api.lookup_release(doi=file_edit_doi)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if correct_release:
+ fixed_release_ids.append(correct_release.ident)
+
+ fe = FileEntity(
+ ident=file_ident,
+ release_ids=fixed_release_ids,
+ edit_extra=edit_extra,
+ )
+ fe._wrong_release_ident = wrong_release_ident
+ return fe
+
+ def try_update(self, fe: FileEntity) -> bool:
+
+ wrong_release_ident = fe._wrong_release_ident
+ assert len(wrong_release_ident) == 26
+
+ # should always be existing... but in QA it might not be
+ existing = None
+ try:
+ existing = self.api.get_file(fe.ident)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ self.counts["skip-existing-not-found"] += 1
+ return False
+
+ if existing.state != "active":
+ self.counts["skip-existing-entity-state"] += 1
+ return False
+
+ if wrong_release_ident not in existing.release_ids:
+ self.counts["skip-existing-fixed"] += 1
+ return False
+
+ # fetch existing history to verify mismatch
+ history = self.api.get_file_history(existing.ident)
+
+ for entry in history:
+ if entry.editgroup.editor.is_bot is not True:
+ self.counts["skip-existing-edit-history-human"] += 1
+ return False
+
+ bad_edit = history[-1].edit
+ if bad_edit.extra != fe.edit_extra:
+ self.counts["skip-existing-edit-history-extra-mismatch"] += 1
+ return False
+
+ bad_editgroup = history[-1].editgroup
+ if not bad_editgroup.extra:
+ self.counts["skip-existing-editgroup-missing-extra"] += 1
+ return False
+
+ if (
+ bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa"
+ or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter"
+ or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3")
+ or bad_editgroup.created.year != 2020
+ ):
+ self.counts["skip-existing-edit-history-mismatch"] += 1
+ return False
+
+ existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident]
+
+ if len(fe.release_ids) == 1:
+ if fe.release_ids[0] not in existing.release_ids:
+ existing.release_ids.append(fe.release_ids[0])
+
+ existing.edit_extra = fe.edit_extra
+
+ # not doing a check for "in current editgroup", because the source of
+ # these corrections (entity dump) contains no dupes
+
+ if not self.testing_mode:
+ self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+ self.counts["update"] += 1
+ return False
+
+
+def test_file_release_bugfix() -> None:
+ api = public_api("http://localhost:9411/v0")
+ frbc = FileReleaseBugfix(api=api)
+ frbc.testing_mode = True
+
+ assert frbc.want({"this": "asdf"}) is False
+
+ example_line: Dict[str, Any] = {
+ "file_ident": "00000000-0000-0000-3333-000000000002",
+ "wrong_release_ident": "00000000-0000-0000-4444-000000000002",
+ "edit_extra": {
+ "link_source": "unpaywall",
+ "link_source_id": "10.1371/journal.pmed.0020124",
+ "ingest_request_source": "unpaywall",
+ },
+ }
+
+ fe1 = frbc.parse_record(example_line)
+ print(frbc.counts)
+ frbc.try_update(fe1)
+
+ # NOTE: this test is pretty incompleted
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_WORKER_CLEANUP",
+ )
+ parser.add_argument(
+ "json_file",
+ help="File with jsonlines with cleanup context",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ args = parser.parse_args()
+ api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+
+ frbc = FileReleaseBugfix(
+ api,
+ edit_batch_size=args.batch_size,
+ )
+ JsonLinePusher(frbc, args.json_file).run()
+
+
+if __name__ == "__main__":
+ main()