diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 19:30:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 19:32:21 -0800 |
commit | 8debf771c540b0bef7f4745195f8af87490917b0 (patch) | |
tree | 5be781b0e96d010182428a854218a114414d641d /python/fatcat_tools/mergers/releases.py | |
parent | 112c41a1157862d2c8f758eac685b0b26c921797 (diff) | |
download | fatcat-8debf771c540b0bef7f4745195f8af87490917b0.tar.gz fatcat-8debf771c540b0bef7f4745195f8af87490917b0.zip |
release merger: some progress, but also disable (not complete)
Diffstat (limited to 'python/fatcat_tools/mergers/releases.py')
-rw-r--r-- | python/fatcat_tools/mergers/releases.py | 84 |
1 file changed, 72 insertions, 12 deletions
diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py index fc970057..1f995b00 100644 --- a/python/fatcat_tools/mergers/releases.py +++ b/python/fatcat_tools/mergers/releases.py @@ -15,32 +15,83 @@ from .common import EntityMerger class ReleaseMerger(EntityMerger): """ Hard merges a set of release entities, redirecting all entities to a single - primary release. + primary release. This is different from "grouping" multiple releases under + a single work. - Will also redirect works (if appropriate), and re-point {files, filesets, - webcaptures} to the new merged release. + A "primary" (which the other releases will redirect to) can be provided, or + one will be chosen from the set of duplicate releases based on the + completeness of metadata and other heuristic factors. + + Releases are some of the most complex entities to merge, because of + the complexity of bibliographic metadata and the number of related entities + which also need to be updated. + + File, Fileset, and Webcapture entities which currently point to a release + which gets redirected will be updated to point at the "primary" release. + + Any Work entities which will end up with no releases pointing at them after + the merging will get redirected to the work corresponding to the "primary" + release. 
+ + NOTE: the "primary" release will currently (as implemented) *not* get + updated with metadata from all the redirected releases """ def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None: - eg_desc = kwargs.get("editgroup_description", "Automated merge of release entities") + eg_desc = ( + kwargs.pop("editgroup_description", None) or "Automated merge of release entities" + ) eg_extra = kwargs.get("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseMerger") + self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.entity_type_name = "release" + def choose_primary_release( + self, entities: List[ReleaseEntity], existing_redirects: Dict[str, List[str]] + ) -> str: + assert entities and len(entities) >= 2 + + # want to sort in descending order, so reverse=True + entities = sorted( + entities, + key=lambda a: ( + # number of entities already redirected to this one + len(existing_redirects[a.ident]), + # number of file/fileset/webcapture entities (would need to update) + int(len(a.files or []) + len(a.filesets or []) + len(a.webcaptures or [])), + # has a strong identifier? + bool(a.ext_id.doi or a.ext_id.pmid or a.ext_id.arxiv_id), + # has any identifier? + bool(a.ext_id), + # has basic metadata? + bool(a.release_type), + bool(a.release_status), + bool(a.release_year), + bool(a.container_id), + # has refs, abstracts, extra stuff? + bool(a.refs), + bool(a.abstracts), + ), + reverse=True, + ) + return entities[0].ident + def try_merge( self, dupe_ids: List[str], primary_id: Optional[str] = None, evidence: Optional[Dict[str, Any]] = None, ) -> int: - """ - XXX: review/refactor; this code is very old - """ + + # TODO: this code is pretty old and has only been partially refactored. + # Needs more testing and review. 
+ raise NotImplementedError updated_entities = 0 releases = dict() + existing_redirects: Dict[str, List[str]] = dict() eg_id = self.get_editgroup_id() all_ids = dupe_ids.copy() @@ -48,12 +99,16 @@ class ReleaseMerger(EntityMerger): all_ids.append(primary_id) for ident in all_ids: releases[ident] = self.api.get_release(ident, expand="files,filesets,webcaptures") + existing_redirects[ident] = self.api.get_release_redirects(ident) if not primary_id: - # XXX: - primary_id = dupe_ids[0] + primary_id = self.choose_primary_release( + list(releases.values()), existing_redirects + ) dupe_ids = [d for d in dupe_ids if d != primary_id] + assert primary_id not in dupe_ids + primary_work_id = releases[primary_id].work_id updated_work_ids = [] redirected_release_ids = [] @@ -65,6 +120,7 @@ class ReleaseMerger(EntityMerger): # file redirects for e in release.files: + assert release.ident in e.release_ids e.release_ids.remove(release.ident) if primary_id not in e.release_ids: e.release_ids.append(primary_id) @@ -75,6 +131,7 @@ class ReleaseMerger(EntityMerger): # fileset redirects for e in release.filesets: + assert release.ident in e.release_ids e.release_ids.remove(release.ident) if primary_id not in e.release_ids: e.release_ids.append(primary_id) @@ -85,6 +142,7 @@ class ReleaseMerger(EntityMerger): # webcapture redirects for e in release.webcaptures: + assert release.ident in e.release_ids e.release_ids.remove(release.ident) if primary_id not in e.release_ids: e.release_ids.append(primary_id) @@ -93,12 +151,14 @@ class ReleaseMerger(EntityMerger): updated_entities += 1 self.counts["updated-webcaptures"] += 1 - # release redirect itself + # the release redirect itself updated_work_ids.append(release.work_id) redirected_release_ids.append(release.ident) if not self.dry_run_mode: self.api.update_release( - eg_id, release.ident, ReleaseEntity(redirect=primary_id) + eg_id, + release.ident, + ReleaseEntity(redirect=primary_id, edit_extra=evidence), ) updated_entities += 1 
self.counts["updated-releases"] += 1 @@ -108,7 +168,7 @@ class ReleaseMerger(EntityMerger): updated_work_ids = list(set(updated_work_ids)) assert primary_work_id not in updated_work_ids for work_id in updated_work_ids: - work_releases = self.api.get_work_releases(work_id) + work_releases = self.api.get_work_releases(work_id, hide="abstracts,refs") rids = set([r.ident for r in work_releases]) if rids.issubset(redirected_release_ids): # all the releases for this work were updated/merged; we should |