import argparse
import os
import sys
from typing import Any, Dict, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity

from fatcat_tools import authenticated_api, public_api, uuid2fcid
from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
from fatcat_tools.normal import clean_doi


class FileReleaseBugfix(EntityImporter):
    """
    This is a one-off / one-time cleanup script for file entities which got
    imported with incorrect release ident mappings, due to a bug in the file
    ingest importer.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleans.file_release_bugfix - < blah.json
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        """
        Set up the importer with update mode enabled and a default editgroup
        description / agent, both of which the caller may override via the
        `editgroup_description` and `editgroup_extra` keyword arguments.
        """
        eg_desc = (
            kwargs.pop("editgroup_description", None) or "Correct bad file/release import mappings"
        )
        # Copy the dict (and tolerate an explicit None) so that the caller's
        # object is never mutated when the "agent" default is filled in.
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # When True, try_update() runs all checks but does not call the API
        # update endpoint (see test_file_release_bugfix).
        self.testing_mode = False

    def want(self, row: Dict[str, Any]) -> bool:
        """
        Cheap pre-filter on a raw input row, before any API calls are made.

        A row is accepted only if its `edit_extra` has a `link_source` of
        "unpaywall" or "doi", an `ingest_request_source` of "unpaywall" or
        "fatcat-changelog", and a `link_source_id` which starts with "10."
        (i.e. looks like a DOI). Every rejection increments a specific
        `skip-*` counter.
        """
        if not (
            row.get("edit_extra")
            and row["edit_extra"].get("link_source")
            and row["edit_extra"].get("link_source_id")
        ):
            self.counts["skip-partial"] += 1
            return False
        if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]:
            self.counts["skip-link-source"] += 1
            return False
        if row["edit_extra"].get("ingest_request_source") not in [
            "unpaywall",
            "fatcat-changelog",
        ]:
            self.counts["skip-ingest-request-source"] += 1
            return False
        if not row["edit_extra"]["link_source_id"].startswith("10."):
            self.counts["skip-source-id-not-doi"] += 1
            return False
        return True

    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
        """
        Turn an input row into a partial FileEntity describing the fix.

        Looks up the "wrong" release (by ident) and the "correct" release (by
        the DOI recorded in the original edit's `link_source_id`). Returns None
        (and increments a `skip-*` counter) if the DOI does not normalize, the
        wrong release does not exist, or the wrong release already carries the
        expected DOI.

        The returned entity has only `ident`, `release_ids` (zero or one
        correct release) and `edit_extra` set, plus a private
        `_wrong_release_ident` attribute consumed by try_update().

        Raises fatcat_openapi_client.rest.ApiException for any API error other
        than a 404.
        """
        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        file_ident = uuid2fcid(row["file_ident"])
        wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
        edit_extra = row["edit_extra"]
        assert edit_extra["link_source"] in ["unpaywall", "doi"]
        file_edit_doi = clean_doi(edit_extra["link_source_id"])

        if not file_edit_doi:
            self.counts["skip-bad-doi"] += 1
            # None (not False), consistent with the other skip paths and with
            # the declared return type
            return None

        # check that the "wrong" release exists and doesn't have the DOI
        wrong_release = None
        try:
            wrong_release = self.api.get_release(wrong_release_ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if not wrong_release:
            self.counts["skip-wrong-release-missing"] += 1
            return None

        if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
            self.counts["skip-wrong-release-is-ok"] += 1
            return None

        # fetch the "correct" release, if any
        fixed_release_ids = []
        correct_release = None
        try:
            correct_release = self.api.lookup_release(doi=file_edit_doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if correct_release:
            fixed_release_ids.append(correct_release.ident)

        fe = FileEntity(
            ident=file_ident,
            release_ids=fixed_release_ids,
            edit_extra=edit_extra,
        )
        fe._wrong_release_ident = wrong_release_ident
        return fe

    def try_update(self, fe: FileEntity) -> bool:
        """
        Verify that the existing file entity still has the bad mapping, and
        that its edit history matches the buggy import, then submit the fix.

        The wrong release ident is removed from `release_ids`; the correct
        release (if one was found in parse_record()) is added if not already
        present. Unless `testing_mode` is set, the update is sent to the API
        directly from this method.

        Always returns False: either the entity was skipped (a `skip-*`
        counter is incremented), or the update has already been submitted
        here (the `update` counter is incremented).
        NOTE(review): presumably False tells the base class not to queue the
        entity for a batch insert -- confirm against EntityImporter.
        """
        wrong_release_ident = fe._wrong_release_ident
        assert len(wrong_release_ident) == 26

        # should always be existing... but in QA it might not be
        existing = None
        try:
            existing = self.api.get_file(fe.ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        if wrong_release_ident not in existing.release_ids:
            self.counts["skip-existing-fixed"] += 1
            return False

        # fetch existing history to verify mismatch
        history = self.api.get_file_history(existing.ident)

        # Without at least one entry there is nothing to verify against;
        # skip instead of raising IndexError on history[-1] below.
        if not history:
            self.counts["skip-existing-edit-history-empty"] += 1
            return False

        for entry in history:
            if entry.editgroup.editor.is_bot is not True:
                self.counts["skip-existing-edit-history-human"] += 1
                return False

        # NOTE(review): the last list entry is presumably the oldest edit
        # (the original import) -- confirm against the API's history ordering.
        bad_edit = history[-1].edit
        if bad_edit.extra != fe.edit_extra:
            self.counts["skip-existing-edit-history-extra-mismatch"] += 1
            return False

        bad_editgroup = history[-1].editgroup
        if not bad_editgroup.extra:
            self.counts["skip-existing-editgroup-missing-extra"] += 1
            return False

        # Only touch entities created by the specific editor, agent, software
        # version prefix and year checked below. `or ""` guards against a
        # "git_rev" key which is present but None.
        if (
            bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa"
            or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter"
            or not (bad_editgroup.extra.get("git_rev") or "").startswith("v0.3")
            or bad_editgroup.created.year != 2020
        ):
            self.counts["skip-existing-edit-history-mismatch"] += 1
            return False

        existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident]

        if len(fe.release_ids) == 1:
            if fe.release_ids[0] not in existing.release_ids:
                existing.release_ids.append(fe.release_ids[0])

        existing.edit_extra = fe.edit_extra

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False


def test_file_release_bugfix() -> None:
    """
    Smoke test against an API server at http://localhost:9411/v0; runs the
    importer in testing mode so that no update is submitted.
    """
    api = public_api("http://localhost:9411/v0")
    frbc = FileReleaseBugfix(api=api)
    frbc.testing_mode = True

    assert frbc.want({"this": "asdf"}) is False

    example_line: Dict[str, Any] = {
        "file_ident": "00000000-0000-0000-3333-000000000002",
        "wrong_release_ident": "00000000-0000-0000-4444-000000000002",
        "edit_extra": {
            "link_source": "unpaywall",
            "link_source_id": "10.1371/journal.pmed.0020124",
            "ingest_request_source": "unpaywall",
        },
    }

    fe1 = frbc.parse_record(example_line)
    print(frbc.counts)
    frbc.try_update(fe1)

    # NOTE: this test is pretty incomplete


def main() -> None:
    """
    Command-line entry point: read JSON lines with cleanup context from a
    file (or stdin) and push them through FileReleaseBugfix.

    The API token is read from the environment variable named by `auth_var`
    (FATCAT_AUTH_WORKER_CLEANUP by default).
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "json_file",
        help="File with jsonlines with cleanup context",
        # nargs="?" makes the positional optional, so that the stdin default
        # actually applies; passing "-" or a path explicitly still works.
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    frbc = FileReleaseBugfix(
        api,
        edit_batch_size=args.batch_size,
    )
    JsonLinePusher(frbc, args.json_file).run()


if __name__ == "__main__":
    main()