diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 15:09:06 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 17:05:34 -0800 |
commit | b09be729863f8860b1b81b1498ff325a2a08d36b (patch) | |
tree | 66837cbba27c395cb5332f55ba8418c9e4e4a651 /python | |
parent | 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 (diff) | |
download | fatcat-b09be729863f8860b1b81b1498ff325a2a08d36b.tar.gz fatcat-b09be729863f8860b1b81b1498ff325a2a08d36b.zip |
initial implementation of container merger
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/mergers/containers.py | 237 | ||||
-rw-r--r-- | python/tests/merge_containers.py | 116 |
2 files changed, 353 insertions, 0 deletions
diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py new file mode 100644 index 00000000..4bdafe1b --- /dev/null +++ b/python/fatcat_tools/mergers/containers.py @@ -0,0 +1,237 @@ +import argparse +import os +import sys +from typing import Any, Dict, List, Optional + +import fatcat_openapi_client +from fatcat_openapi_client.models import ContainerEntity + +from fatcat_tools import authenticated_api +from fatcat_tools.harvest.harvest_common import requests_retry_session +from fatcat_tools.importers import JsonLinePusher + +from .common import EntityMerger + + +class ContainerMerger(EntityMerger): + """ + Combines container entities into a single primary. Does not merge partial + metadata (identifiers, etc). Can chose "primary" container to redirect to, + if necessary. + + The `max_container_releases` argument (int or None) can be used to + prevent redirecting containers which already have releases pointed at them + (based on release ES index stats). If set to 0, no releases are allowed. If + set to None (or a negative number), the parameter is ignored. + + The `clobber_human_edited` flag (boolean) can be used to allow updating + entities even if they have had human edits in the past. + + This merger makes external HTTP requests to fatcat.wiki, for the purpose of + fetching release stats. + """ + + def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None: + + eg_desc = ( + kwargs.pop("editgroup_description", None) or "Automated merge of container entities" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger") + self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) + self.clobber_human_edited: bool = eg_extra.get("clobber_human_edited", False) + self.max_container_releases: Optional[int] = eg_extra.get("max_container_releases", 0) + if self.max_container_releases and self.max_container_releases < 0: + self.max_container_releases = None + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + self.entity_type_name = "container" + self.http_session = requests_retry_session() + + def choose_primary_container( + self, + entities: List[ContainerEntity], + redirects: Dict[str, List[str]], + release_counts: Dict[str, int], + ) -> str: + assert entities and len(entities) >= 2 + + # want to sort in descending order, so reverse=True + entities = sorted( + entities, + key=lambda a: ( + # linked release counts + release_counts[a.ident], + # number of redirected entities + len(redirects[a.ident]), + # not a stub + bool(a.container_type != "stub"), + # has a strong identifier? + bool(a.issnl or (a.extra and a.extra.get("dblp"))), + # has IA sim metadata? + bool(a.extra and a.extra.get("ia")), + # has additional metadata? + bool(a.publication_status or a.container_type), + bool(a.extra), + ), + reverse=True, + ) + return entities[0].ident + + def try_merge( + self, + dupe_ids: List[str], + primary_id: Optional[str] = None, + evidence: Optional[Dict[str, Any]] = None, + ) -> int: + + # currently required for extid validation + if not evidence or not (evidence.get("extid_type") and evidence.get("extid")): + self.counts["skip-missing-evidence"] += 1 + return 0 + + updated_entities = 0 + entities: Dict[str, ContainerEntity] = dict() + redirects: Dict[str, List[str]] = dict() + release_counts: Dict[str, int] = dict() + eg_id = self.get_editgroup_id() + + all_ids = dupe_ids.copy() + if primary_id: + all_ids.append(primary_id) + for ident in all_ids: + try: + entities[ident] = self.api.get_container(ident) + redirects[ident] = self.api.get_container_redirects(ident) + except fatcat_openapi_client.ApiException as ae: + if ae.status == 404: + self.counts["skip-entity-not-found"] += 1 + return 0 + else: + raise + if entities[ident].state != "active": + self.counts["skip-not-active-entity"] += 1 + return 0 + if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]: + self.counts["skip-extid-mismatch"] += 1 + return 0 + if not self.clobber_human_edited: + edit_history = self.api.get_container_history(ident) + for edit in edit_history: + if edit.editor.is_bot is not True: + self.counts["skip-human-edited"] += 1 + return 0 + resp = self.http_session.get("https://fatcat.wiki/container/{ident}/stats.json") + resp.raise_for_status() + stats = resp.json() + release_counts[ident] = stats["total"] + if self.max_container_releases is not None: + if release_counts[ident] > self.max_container_releases: + self.counts["skip-container-release-count"] += 1 + continue + + if not primary_id: + primary_id = self.choose_primary_container( + list(entities.values()), redirects, release_counts + ) + dupe_ids = [d for d in dupe_ids if d != primary_id] + + assert primary_id not in dupe_ids + + primary = entities[primary_id] + for other_id in dupe_ids: + other = entities[other_id] + if not self.dry_run_mode: + self.api.update_container( + eg_id, + other.ident, + ContainerEntity( + redirect=primary.ident, + edit_extra=evidence, + ), + ) + updated_entities += 1 + + return updated_entities + + +def run_merge_containers(args: argparse.Namespace) -> None: + em = ContainerMerger( + args.api, + edit_batch_size=args.batch_size, + dry_run_mode=args.dry_run, + max_container_releases=args.max_container_releases, + clobber_human_edited=args.clobber_human_edited, + editgroup_description=args.editgroup_description_override, + ) + JsonLinePusher(em, args.json_file).run() + + +def main() -> None: + """ + Invoke like: + + python3 -m fatcat_tools.mergers.containers [options] + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", + help="editgroup description override", + default=None, + type=str, + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="don't actually commit merges, just count what would have been", + ) + parser.set_defaults( + auth_var="FATCAT_AUTH_API_TOKEN", + ) + subparsers = parser.add_subparsers() + + sub_merge_containers = subparsers.add_parser("merge-containers") + sub_merge_containers.set_defaults(func=run_merge_containers) + sub_merge_containers.add_argument( + "json_file", + help="source of merge lines to process (or stdin)", + default=sys.stdin, + type=argparse.FileType("r"), + ) + parser.add_argument( + "--clobber-human-edited", + action="store_true", + help="if set, entities which have non-bot (human) edits can be updated/redirected", + ) + parser.add_argument( + "--max-container-releases", + default=0, + type=int, + help="if container has more than this many releases linked, don't update (set to -1 to disable limit)", + ) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + + # allow editgroup description override via env variable (but CLI arg takes + # precedence) + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") + + args.api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py new file mode 100644 index 00000000..a657522e --- /dev/null +++ b/python/tests/merge_containers.py @@ -0,0 +1,116 @@ +from fatcat_openapi_client import ContainerEntity +from fixtures import api + +from fatcat_tools.mergers.containers import ContainerMerger + + +def test_choose_primary_container(api) -> None: + + release_counts = dict() + redirects = dict() + em = ContainerMerger(api=api) + + ce_stub = ContainerEntity( + ident="pppppp5apzfhbbxxc7rgu2yw6m", + name="dummy journal", + ) + release_counts[ce_stub.ident] = 0 + redirects[ce_stub.ident] = [] + + ce_partial = ContainerEntity( + ident="eeeeeeeapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial.ident] = 0 + redirects[ce_partial.ident] = [] + + ce_partial_redirects = ContainerEntity( + ident="rrrrrrrrrrfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial_redirects.ident] = 0 + redirects[ce_partial_redirects.ident] = [ + "zzzzzzzzrrfhbbxxc7rgu2yw6m", + ] + + ce_complete_zero = ContainerEntity( + ident="oooooooapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_zero.ident] = 0 + redirects[ce_complete_zero.ident] = [] + + ce_complete_small = ContainerEntity( + ident="cccccccapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_small.ident] = 10 + redirects[ce_complete_small.ident] = [] + + ce_complete_big = ContainerEntity( + ident="ddddddddpzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_big.ident] = 9999999 + redirects[ce_complete_big.ident] = [] + + assert ( + em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts) + == ce_partial.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts + ) + == ce_complete_zero.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial], + redirects, + release_counts, + ) + == ce_partial_redirects.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_small.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_big.ident + ) + assert ( + em.choose_primary_container( + [ce_complete_small, ce_complete_big], redirects, release_counts + ) + == ce_complete_big.ident + ) |