path: root/python/fatcat_tools
author    Bryan Newbold <bnewbold@robocracy.org>    2021-11-23 19:30:14 -0800
committer Bryan Newbold <bnewbold@robocracy.org>    2021-11-23 19:32:21 -0800
commit  8debf771c540b0bef7f4745195f8af87490917b0 (patch)
tree    5be781b0e96d010182428a854218a114414d641d /python/fatcat_tools
parent  112c41a1157862d2c8f758eac685b0b26c921797 (diff)
release merger: some progress, but also disable (not complete)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/mergers/releases.py  84
1 file changed, 72 insertions(+), 12 deletions(-)
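The core of the patch below is the new choose_primary_release helper, which ranks candidate releases by sorting on a tuple of heuristic signals (existing redirects, attached file/fileset/webcapture entities, strong identifiers, basic metadata) and takes the top result. A minimal standalone sketch of that sort-key pattern follows; the Candidate fields are hypothetical stand-ins, not the actual fatcat_openapi_client types.

    # Minimal sketch of the tuple-sort heuristic used by choose_primary_release.
    # Candidate and its fields are hypothetical stand-ins for illustration only.
    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class Candidate:
        ident: str
        redirects: List[str] = field(default_factory=list)  # releases already redirected here
        files: int = 0                                       # attached file/fileset/webcapture count
        doi: Optional[str] = None
        release_type: Optional[str] = None
        release_year: Optional[int] = None

    def choose_primary(candidates: List[Candidate]) -> str:
        assert len(candidates) >= 2
        # Tuples compare element-by-element, so earlier keys dominate later ones;
        # reverse=True puts the "most complete" candidate first.
        ranked = sorted(
            candidates,
            key=lambda c: (
                len(c.redirects),      # most incoming redirects
                c.files,               # more entities already attached here means fewer to re-point
                bool(c.doi),           # has a strong identifier
                bool(c.release_type),  # has basic metadata
                bool(c.release_year),
            ),
            reverse=True,
        )
        return ranked[0].ident

    if __name__ == "__main__":
        a = Candidate("aaaa", doi="10.123/abc", files=2)
        b = Candidate("bbbb", release_type="article-journal")
        print(choose_primary([a, b]))  # "aaaa": attached files and a DOI outrank type-only metadata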
diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py
index fc970057..1f995b00 100644
--- a/python/fatcat_tools/mergers/releases.py
+++ b/python/fatcat_tools/mergers/releases.py
@@ -15,32 +15,83 @@ from .common import EntityMerger
class ReleaseMerger(EntityMerger):
"""
Hard merges a set of release entities, redirecting all entities to a single
- primary release.
+ primary release. This is different from "grouping" multiple releases under
+ a single work.
- Will also redirect works (if appropriate), and re-point {files, filesets,
- webcaptures} to the new merged release.
+ A "primary" (which the other releases will redirect to) can be provided, or
+ one will be chosen from the set of duplicate releases based on the
+ completeness of metadata and other heuristic factors.
+
+ Releases are some of the most complex entities to merge, because of
+ the complexity of bibliographic metadata and the number of related entities
+ which also need to be updated.
+
+ File, Fileset, and Webcapture entities which currently point to a release
+ which gets redirected will be updated to point at the "primary" release.
+
+ Any Work entities which will end up with no releases pointing at them after
+ the merging will get redirected to the work corresponding to the "primary"
+ release.
+
+ NOTE: the "primary" release will currently (as implemented) *not* get
+ updated with metadata from all the redirected releases
"""
def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None:
- eg_desc = kwargs.get("editgroup_description", "Automated merge of release entities")
+ eg_desc = (
+ kwargs.pop("editgroup_description", None) or "Automated merge of release entities"
+ )
eg_extra = kwargs.get("editgroup_extra", dict())
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseMerger")
+ self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False)
super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.entity_type_name = "release"
+ def choose_primary_release(
+ self, entities: List[ReleaseEntity], existing_redirects: Dict[str, List[str]]
+ ) -> str:
+ assert entities and len(entities) >= 2
+
+ # want to sort in descending order, so reverse=True
+ entities = sorted(
+ entities,
+ key=lambda a: (
+ # number of entities already redirected to this one
+ len(existing_redirects[a.ident]),
+ # number of file/fileset/webcapture entities (would need to update)
+ int(len(a.files or []) + len(a.filesets or []) + len(a.webcaptures or [])),
+ # has a strong identifier?
+ bool(a.ext_id.doi or a.ext_id.pmid or a.ext_id.arxiv_id),
+ # has any identifier?
+ bool(a.ext_id),
+ # has basic metadata?
+ bool(a.release_type),
+ bool(a.release_status),
+ bool(a.release_year),
+ bool(a.container_id),
+ # has refs, abstracts, extra stuff?
+ bool(a.refs),
+ bool(a.abstracts),
+ ),
+ reverse=True,
+ )
+ return entities[0].ident
+
def try_merge(
self,
dupe_ids: List[str],
primary_id: Optional[str] = None,
evidence: Optional[Dict[str, Any]] = None,
) -> int:
- """
- XXX: review/refactor; this code is very old
- """
+
+ # TODO: this code is pretty old and has only been partially refactored.
+ # Needs more testing and review.
+ raise NotImplementedError
updated_entities = 0
releases = dict()
+ existing_redirects: Dict[str, List[str]] = dict()
eg_id = self.get_editgroup_id()
all_ids = dupe_ids.copy()
@@ -48,12 +99,16 @@ class ReleaseMerger(EntityMerger):
all_ids.append(primary_id)
for ident in all_ids:
releases[ident] = self.api.get_release(ident, expand="files,filesets,webcaptures")
+ existing_redirects[ident] = self.api.get_release_redirects(ident)
if not primary_id:
- # XXX:
- primary_id = dupe_ids[0]
+ primary_id = self.choose_primary_release(
+ list(releases.values()), existing_redirects
+ )
dupe_ids = [d for d in dupe_ids if d != primary_id]
+ assert primary_id not in dupe_ids
+
primary_work_id = releases[primary_id].work_id
updated_work_ids = []
redirected_release_ids = []
@@ -65,6 +120,7 @@ class ReleaseMerger(EntityMerger):
# file redirects
for e in release.files:
+ assert release.ident in e.release_ids
e.release_ids.remove(release.ident)
if primary_id not in e.release_ids:
e.release_ids.append(primary_id)
@@ -75,6 +131,7 @@ class ReleaseMerger(EntityMerger):
# fileset redirects
for e in release.filesets:
+ assert release.ident in e.release_ids
e.release_ids.remove(release.ident)
if primary_id not in e.release_ids:
e.release_ids.append(primary_id)
@@ -85,6 +142,7 @@ class ReleaseMerger(EntityMerger):
# webcapture redirects
for e in release.webcaptures:
+ assert release.ident in e.release_ids
e.release_ids.remove(release.ident)
if primary_id not in e.release_ids:
e.release_ids.append(primary_id)
@@ -93,12 +151,14 @@ class ReleaseMerger(EntityMerger):
updated_entities += 1
self.counts["updated-webcaptures"] += 1
- # release redirect itself
+ # the release redirect itself
updated_work_ids.append(release.work_id)
redirected_release_ids.append(release.ident)
if not self.dry_run_mode:
self.api.update_release(
- eg_id, release.ident, ReleaseEntity(redirect=primary_id)
+ eg_id,
+ release.ident,
+ ReleaseEntity(redirect=primary_id, edit_extra=evidence),
)
updated_entities += 1
self.counts["updated-releases"] += 1
@@ -108,7 +168,7 @@ class ReleaseMerger(EntityMerger):
updated_work_ids = list(set(updated_work_ids))
assert primary_work_id not in updated_work_ids
for work_id in updated_work_ids:
- work_releases = self.api.get_work_releases(work_id)
+ work_releases = self.api.get_work_releases(work_id, hide="abstracts,refs")
rids = set([r.ident for r in work_releases])
if rids.issubset(redirected_release_ids):
# all the releases for this work were updated/merged; we should