From b09be729863f8860b1b81b1498ff325a2a08d36b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 15:09:06 -0800 Subject: initial implementation of container merger --- python/fatcat_tools/mergers/containers.py | 237 ++++++++++++++++++++++++++++++ python/tests/merge_containers.py | 116 +++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 python/fatcat_tools/mergers/containers.py create mode 100644 python/tests/merge_containers.py (limited to 'python') diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py new file mode 100644 index 00000000..4bdafe1b --- /dev/null +++ b/python/fatcat_tools/mergers/containers.py @@ -0,0 +1,237 @@ +import argparse +import os +import sys +from typing import Any, Dict, List, Optional + +import fatcat_openapi_client +from fatcat_openapi_client.models import ContainerEntity + +from fatcat_tools import authenticated_api +from fatcat_tools.harvest.harvest_common import requests_retry_session +from fatcat_tools.importers import JsonLinePusher + +from .common import EntityMerger + + +class ContainerMerger(EntityMerger): + """ + Combines container entities into a single primary. Does not merge partial + metadata (identifiers, etc). Can choose "primary" container to redirect to, + if necessary. + + The `max_container_releases` argument (int or None) can be used to + prevent redirecting containers which already have releases pointed at them + (based on release ES index stats). If set to 0, no releases are allowed. If + set to None (or a negative number), the parameter is ignored. + + The `clobber_human_edited` flag (boolean) can be used to allow updating + entities even if they have had human edits in the past. + + This merger makes external HTTP requests to fatcat.wiki, for the purpose of + fetching release stats.
+ """ + + def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None: + + eg_desc = ( + kwargs.pop("editgroup_description", None) or "Automated merge of container entities" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger") + self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) + self.clobber_human_edited: bool = eg_extra.get("clobber_human_edited", False) + self.max_container_releases: Optional[int] = eg_extra.get("max_container_releases", 0) + if self.max_container_releases and self.max_container_releases < 0: + self.max_container_releases = None + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + self.entity_type_name = "container" + self.http_session = requests_retry_session() + + def choose_primary_container( + self, + entities: List[ContainerEntity], + redirects: Dict[str, List[str]], + release_counts: Dict[str, int], + ) -> str: + assert entities and len(entities) >= 2 + + # want to sort in descending order, so reverse=True + entities = sorted( + entities, + key=lambda a: ( + # linked release counts + release_counts[a.ident], + # number of redirected entities + len(redirects[a.ident]), + # not a stub + bool(a.container_type != "stub"), + # has a strong identifier? + bool(a.issnl or (a.extra and a.extra.get("dblp"))), + # has IA sim metadata? + bool(a.extra and a.extra.get("ia")), + # has additional metadata? 
+ bool(a.publication_status or a.container_type), + bool(a.extra), + ), + reverse=True, + ) + return entities[0].ident + + def try_merge( + self, + dupe_ids: List[str], + primary_id: Optional[str] = None, + evidence: Optional[Dict[str, Any]] = None, + ) -> int: + + # currently required for extid validation + if not evidence or not (evidence.get("extid_type") and evidence.get("extid")): + self.counts["skip-missing-evidence"] += 1 + return 0 + + updated_entities = 0 + entities: Dict[str, ContainerEntity] = dict() + redirects: Dict[str, List[str]] = dict() + release_counts: Dict[str, int] = dict() + eg_id = self.get_editgroup_id() + + all_ids = dupe_ids.copy() + if primary_id: + all_ids.append(primary_id) + for ident in all_ids: + try: + entities[ident] = self.api.get_container(ident) + redirects[ident] = self.api.get_container_redirects(ident) + except fatcat_openapi_client.ApiException as ae: + if ae.status == 404: + self.counts["skip-entity-not-found"] += 1 + return 0 + else: + raise + if entities[ident].state != "active": + self.counts["skip-not-active-entity"] += 1 + return 0 + if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]: + self.counts["skip-extid-mismatch"] += 1 + return 0 + if not self.clobber_human_edited: + edit_history = self.api.get_container_history(ident) + for edit in edit_history: + if edit.editor.is_bot is not True: + self.counts["skip-human-edited"] += 1 + return 0 + resp = self.http_session.get("https://fatcat.wiki/container/{ident}/stats.json") + resp.raise_for_status() + stats = resp.json() + release_counts[ident] = stats["total"] + if self.max_container_releases is not None: + if release_counts[ident] > self.max_container_releases: + self.counts["skip-container-release-count"] += 1 + continue + + if not primary_id: + primary_id = self.choose_primary_container( + list(entities.values()), redirects, release_counts + ) + dupe_ids = [d for d in dupe_ids if d != primary_id] + + assert primary_id not in dupe_ids + + primary 
= entities[primary_id] + for other_id in dupe_ids: + other = entities[other_id] + if not self.dry_run_mode: + self.api.update_container( + eg_id, + other.ident, + ContainerEntity( + redirect=primary.ident, + edit_extra=evidence, + ), + ) + updated_entities += 1 + + return updated_entities + + +def run_merge_containers(args: argparse.Namespace) -> None: + em = ContainerMerger( + args.api, + edit_batch_size=args.batch_size, + dry_run_mode=args.dry_run, + max_container_releases=args.max_container_releases, + clobber_human_edited=args.clobber_human_edited, + editgroup_description=args.editgroup_description_override, + ) + JsonLinePusher(em, args.json_file).run() + + +def main() -> None: + """ + Invoke like: + + python3 -m fatcat_tools.mergers.containers [options] + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", + help="editgroup description override", + default=None, + type=str, + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="don't actually commit merges, just count what would have been", + ) + parser.set_defaults( + auth_var="FATCAT_AUTH_API_TOKEN", + ) + subparsers = parser.add_subparsers() + + sub_merge_containers = subparsers.add_parser("merge-containers") + sub_merge_containers.set_defaults(func=run_merge_containers) + sub_merge_containers.add_argument( + "json_file", + help="source of merge lines to process (or stdin)", + default=sys.stdin, + type=argparse.FileType("r"), + ) + parser.add_argument( + "--clobber-human-edited", + action="store_true", + help="if set, entities which have non-bot (human) edits can be updated/redirected", + ) + parser.add_argument( + "--max-container-releases", + default=0, + type=int, + help="if container has more than this many releases linked, don't update 
(set to -1 to disable limit)", + ) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + + # allow editgroup description override via env variable (but CLI arg takes + # precedence) + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") + + args.api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py new file mode 100644 index 00000000..a657522e --- /dev/null +++ b/python/tests/merge_containers.py @@ -0,0 +1,116 @@ +from fatcat_openapi_client import ContainerEntity +from fixtures import api + +from fatcat_tools.mergers.containers import ContainerMerger + + +def test_choose_primary_container(api) -> None: + + release_counts = dict() + redirects = dict() + em = ContainerMerger(api=api) + + ce_stub = ContainerEntity( + ident="pppppp5apzfhbbxxc7rgu2yw6m", + name="dummy journal", + ) + release_counts[ce_stub.ident] = 0 + redirects[ce_stub.ident] = [] + + ce_partial = ContainerEntity( + ident="eeeeeeeapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial.ident] = 0 + redirects[ce_partial.ident] = [] + + ce_partial_redirects = ContainerEntity( + ident="rrrrrrrrrrfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial_redirects.ident] = 0 + redirects[ce_partial_redirects.ident] = [ + "zzzzzzzzrrfhbbxxc7rgu2yw6m", + ] + + ce_complete_zero = 
ContainerEntity( + ident="oooooooapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_zero.ident] = 0 + redirects[ce_complete_zero.ident] = [] + + ce_complete_small = ContainerEntity( + ident="cccccccapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_small.ident] = 10 + redirects[ce_complete_small.ident] = [] + + ce_complete_big = ContainerEntity( + ident="ddddddddpzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_big.ident] = 9999999 + redirects[ce_complete_big.ident] = [] + + assert ( + em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts) + == ce_partial.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts + ) + == ce_complete_zero.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial], + redirects, + release_counts, + ) + == ce_partial_redirects.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_small.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_big.ident + ) + assert ( + em.choose_primary_container( + [ce_complete_small, ce_complete_big], redirects, release_counts + ) + == ce_complete_big.ident + ) -- cgit v1.2.3 From e72d61e60c43911b6d77c4842951441235561dcf Mon Sep 17 00:00:00 2001 
From: Bryan Newbold Date: Wed, 24 Nov 2021 17:07:25 -0800 Subject: container merger: defer allocation of editgroup_id; and dummy code path --- python/fatcat_tools/mergers/containers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py index 4bdafe1b..069ce23c 100644 --- a/python/fatcat_tools/mergers/containers.py +++ b/python/fatcat_tools/mergers/containers.py @@ -93,7 +93,6 @@ class ContainerMerger(EntityMerger): entities: Dict[str, ContainerEntity] = dict() redirects: Dict[str, List[str]] = dict() release_counts: Dict[str, int] = dict() - eg_id = self.get_editgroup_id() all_ids = dupe_ids.copy() if primary_id: @@ -137,6 +136,11 @@ class ContainerMerger(EntityMerger): assert primary_id not in dupe_ids + if self.dry_run_mode: + eg_id = "dummy-editgroup-id" + else: + eg_id = self.get_editgroup_id() + primary = entities[primary_id] for other_id in dupe_ids: other = entities[other_id] -- cgit v1.2.3 From 0584499e7887d8c1ff216b27652d28c8377c3a17 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 18:16:34 -0800 Subject: ES release transform: handle redirected containers better Despite the inline comment, we were not actually grabbing the "redirected" ident correctly, meaning some counts would not be accurate. 
--- python/fatcat_tools/transforms/elasticsearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index c16053ec..a6d85f7e 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -284,7 +284,7 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int t["container_name"] = container.name # this is container.ident, not release.container_id, because there may # be a redirect involved - t["container_id"] = container.ident + t["container_id"] = container.redirect or container.ident t["container_issnl"] = container.issnl issns = [container.issnl, container.issne, container.issnp] issns = list(set([i for i in issns if i])) -- cgit v1.2.3 From 5a8661e647b27932bdccbaa3b6a445fd35518814 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 18:19:03 -0800 Subject: mergers: don't try to accept empty editgroups in dry-run-mode --- python/fatcat_tools/mergers/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/mergers/common.py b/python/fatcat_tools/mergers/common.py index e25f8194..f8197519 100644 --- a/python/fatcat_tools/mergers/common.py +++ b/python/fatcat_tools/mergers/common.py @@ -111,7 +111,8 @@ class EntityMerger(EntityImporter): else: self.counts["skip"] += 1 if self._edit_count >= self.edit_batch_size: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 self._idents_inflight = [] @@ -128,7 +129,8 @@ class EntityMerger(EntityImporter): def finish(self) -> Counter: if self._edit_count > 0: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None 
self._edit_count = 0 self._idents_inflight = [] -- cgit v1.2.3 From 20106ce2c65e6f07e5f27ecbc8665c68d1aa31c2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 18:20:56 -0800 Subject: container merger: fixes from QA testing --- python/fatcat_tools/mergers/containers.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py index 069ce23c..1b9975e5 100644 --- a/python/fatcat_tools/mergers/containers.py +++ b/python/fatcat_tools/mergers/containers.py @@ -39,13 +39,13 @@ class ContainerMerger(EntityMerger): eg_extra = kwargs.pop("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger") self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) - self.clobber_human_edited: bool = eg_extra.get("clobber_human_edited", False) - self.max_container_releases: Optional[int] = eg_extra.get("max_container_releases", 0) - if self.max_container_releases and self.max_container_releases < 0: + self.clobber_human_edited: bool = kwargs.get("clobber_human_edited", False) + self.max_container_releases: Optional[int] = kwargs.get("max_container_releases", 0) + if self.max_container_releases is not None and self.max_container_releases < 0: self.max_container_releases = None super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.entity_type_name = "container" - self.http_session = requests_retry_session() + self.http_session = requests_retry_session(status_forcelist=[429, 500, 502, 504, 504]) def choose_primary_container( self, @@ -116,16 +116,21 @@ class ContainerMerger(EntityMerger): if not self.clobber_human_edited: edit_history = self.api.get_container_history(ident) for edit in edit_history: - if edit.editor.is_bot is not True: + if edit.editgroup.editor.is_bot is not True: + print(f"skipping container_{ident}: human edited", file=sys.stderr) 
self.counts["skip-human-edited"] += 1 return 0 - resp = self.http_session.get("https://fatcat.wiki/container/{ident}/stats.json") + resp = self.http_session.get(f"https://fatcat.wiki/container/{ident}/stats.json") resp.raise_for_status() stats = resp.json() release_counts[ident] = stats["total"] if self.max_container_releases is not None: if release_counts[ident] > self.max_container_releases: self.counts["skip-container-release-count"] += 1 + print( + f"skipping container_{ident}: release count {release_counts[ident]}", + file=sys.stderr, + ) continue if not primary_id: @@ -205,12 +210,12 @@ def main() -> None: default=sys.stdin, type=argparse.FileType("r"), ) - parser.add_argument( + sub_merge_containers.add_argument( "--clobber-human-edited", action="store_true", help="if set, entities which have non-bot (human) edits can be updated/redirected", ) - parser.add_argument( + sub_merge_containers.add_argument( "--max-container-releases", default=0, type=int, -- cgit v1.2.3 From d1c8a582d31dc6f3254e477774aea0fa75fc8b23 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 18:21:27 -0800 Subject: release merger: same editgroup_id fixes as for file and container mergers --- python/fatcat_tools/mergers/releases.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py index 1f995b00..0149bbbe 100644 --- a/python/fatcat_tools/mergers/releases.py +++ b/python/fatcat_tools/mergers/releases.py @@ -92,7 +92,6 @@ class ReleaseMerger(EntityMerger): updated_entities = 0 releases = dict() existing_redirects: Dict[str, List[str]] = dict() - eg_id = self.get_editgroup_id() all_ids = dupe_ids.copy() if primary_id: @@ -113,6 +112,11 @@ class ReleaseMerger(EntityMerger): updated_work_ids = [] redirected_release_ids = [] + if self.dry_run_mode: + eg_id = "dummy-editgroup-id" + else: + eg_id = self.get_editgroup_id() + # execute all the release redirects 
for release in releases.values(): if release.ident == primary_id: -- cgit v1.2.3