diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/mergers/common.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/mergers/containers.py | 246 | ||||
-rw-r--r-- | python/fatcat_tools/mergers/releases.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 2 | ||||
-rw-r--r-- | python/tests/merge_containers.py | 116 |
5 files changed, 372 insertions, 4 deletions
diff --git a/python/fatcat_tools/mergers/common.py b/python/fatcat_tools/mergers/common.py index e25f8194..f8197519 100644 --- a/python/fatcat_tools/mergers/common.py +++ b/python/fatcat_tools/mergers/common.py @@ -111,7 +111,8 @@ class EntityMerger(EntityImporter): else: self.counts["skip"] += 1 if self._edit_count >= self.edit_batch_size: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 self._idents_inflight = [] @@ -128,7 +129,8 @@ class EntityMerger(EntityImporter): def finish(self) -> Counter: if self._edit_count > 0: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 self._idents_inflight = [] diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py new file mode 100644 index 00000000..1b9975e5 --- /dev/null +++ b/python/fatcat_tools/mergers/containers.py @@ -0,0 +1,246 @@ +import argparse +import os +import sys +from typing import Any, Dict, List, Optional + +import fatcat_openapi_client +from fatcat_openapi_client.models import ContainerEntity + +from fatcat_tools import authenticated_api +from fatcat_tools.harvest.harvest_common import requests_retry_session +from fatcat_tools.importers import JsonLinePusher + +from .common import EntityMerger + + +class ContainerMerger(EntityMerger): + """ + Combines container entities into a single primary. Does not merge partial + metadata (identifiers, etc). Can choose "primary" container to redirect to, + if necessary. + + The `max_container_releases` argument (int or None) can be used to + prevent redirecting containers which already have releases pointed at them + (based on release ES index stats). If set to 0, no releases are allowed. If + set to None (or a negative number), the parameter is ignored. 
+ + The `clobber_human_edited` flag (boolean) can be used to allow updating + entities even if they have had human edits in the past. + + This merger makes external HTTP requests to fatcat.wiki, for the purpose of + fetching release stats. + """ + + def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None: + + eg_desc = ( + kwargs.pop("editgroup_description", None) or "Automated merge of container entities" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger") + self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) + self.clobber_human_edited: bool = kwargs.get("clobber_human_edited", False) + self.max_container_releases: Optional[int] = kwargs.get("max_container_releases", 0) + if self.max_container_releases is not None and self.max_container_releases < 0: + self.max_container_releases = None + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + self.entity_type_name = "container" + self.http_session = requests_retry_session(status_forcelist=[429, 500, 502, 504, 504]) + + def choose_primary_container( + self, + entities: List[ContainerEntity], + redirects: Dict[str, List[str]], + release_counts: Dict[str, int], + ) -> str: + assert entities and len(entities) >= 2 + + # want to sort in descending order, so reverse=True + entities = sorted( + entities, + key=lambda a: ( + # linked release counts + release_counts[a.ident], + # number of redirected entities + len(redirects[a.ident]), + # not a stub + bool(a.container_type != "stub"), + # has a strong identifier? + bool(a.issnl or (a.extra and a.extra.get("dblp"))), + # has IA sim metadata? + bool(a.extra and a.extra.get("ia")), + # has additional metadata? 
+ bool(a.publication_status or a.container_type), + bool(a.extra), + ), + reverse=True, + ) + return entities[0].ident + + def try_merge( + self, + dupe_ids: List[str], + primary_id: Optional[str] = None, + evidence: Optional[Dict[str, Any]] = None, + ) -> int: + + # currently required for extid validation + if not evidence or not (evidence.get("extid_type") and evidence.get("extid")): + self.counts["skip-missing-evidence"] += 1 + return 0 + + updated_entities = 0 + entities: Dict[str, ContainerEntity] = dict() + redirects: Dict[str, List[str]] = dict() + release_counts: Dict[str, int] = dict() + + all_ids = dupe_ids.copy() + if primary_id: + all_ids.append(primary_id) + for ident in all_ids: + try: + entities[ident] = self.api.get_container(ident) + redirects[ident] = self.api.get_container_redirects(ident) + except fatcat_openapi_client.ApiException as ae: + if ae.status == 404: + self.counts["skip-entity-not-found"] += 1 + return 0 + else: + raise + if entities[ident].state != "active": + self.counts["skip-not-active-entity"] += 1 + return 0 + if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]: + self.counts["skip-extid-mismatch"] += 1 + return 0 + if not self.clobber_human_edited: + edit_history = self.api.get_container_history(ident) + for edit in edit_history: + if edit.editgroup.editor.is_bot is not True: + print(f"skipping container_{ident}: human edited", file=sys.stderr) + self.counts["skip-human-edited"] += 1 + return 0 + resp = self.http_session.get(f"https://fatcat.wiki/container/{ident}/stats.json") + resp.raise_for_status() + stats = resp.json() + release_counts[ident] = stats["total"] + if self.max_container_releases is not None: + if release_counts[ident] > self.max_container_releases: + self.counts["skip-container-release-count"] += 1 + print( + f"skipping container_{ident}: release count {release_counts[ident]}", + file=sys.stderr, + ) + continue + + if not primary_id: + primary_id = self.choose_primary_container( + 
list(entities.values()), redirects, release_counts + ) + dupe_ids = [d for d in dupe_ids if d != primary_id] + + assert primary_id not in dupe_ids + + if self.dry_run_mode: + eg_id = "dummy-editgroup-id" + else: + eg_id = self.get_editgroup_id() + + primary = entities[primary_id] + for other_id in dupe_ids: + other = entities[other_id] + if not self.dry_run_mode: + self.api.update_container( + eg_id, + other.ident, + ContainerEntity( + redirect=primary.ident, + edit_extra=evidence, + ), + ) + updated_entities += 1 + + return updated_entities + + +def run_merge_containers(args: argparse.Namespace) -> None: + em = ContainerMerger( + args.api, + edit_batch_size=args.batch_size, + dry_run_mode=args.dry_run, + max_container_releases=args.max_container_releases, + clobber_human_edited=args.clobber_human_edited, + editgroup_description=args.editgroup_description_override, + ) + JsonLinePusher(em, args.json_file).run() + + +def main() -> None: + """ + Invoke like: + + python3 -m fatcat_tools.mergers.containers [options] + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", + help="editgroup description override", + default=None, + type=str, + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="don't actually commit merges, just count what would have been", + ) + parser.set_defaults( + auth_var="FATCAT_AUTH_API_TOKEN", + ) + subparsers = parser.add_subparsers() + + sub_merge_containers = subparsers.add_parser("merge-containers") + sub_merge_containers.set_defaults(func=run_merge_containers) + sub_merge_containers.add_argument( + "json_file", + help="source of merge lines to process (or stdin)", + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_merge_containers.add_argument( + 
"--clobber-human-edited", + action="store_true", + help="if set, entities which have non-bot (human) edits can be updated/redirected", + ) + sub_merge_containers.add_argument( + "--max-container-releases", + default=0, + type=int, + help="if container has more than this many releases linked, don't update (set to -1 to disable limit)", + ) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + + # allow editgroup description override via env variable (but CLI arg takes + # precedence) + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") + + args.api = authenticated_api( + args.host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(args.auth_var), + ) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py index 1f995b00..0149bbbe 100644 --- a/python/fatcat_tools/mergers/releases.py +++ b/python/fatcat_tools/mergers/releases.py @@ -92,7 +92,6 @@ class ReleaseMerger(EntityMerger): updated_entities = 0 releases = dict() existing_redirects: Dict[str, List[str]] = dict() - eg_id = self.get_editgroup_id() all_ids = dupe_ids.copy() if primary_id: @@ -113,6 +112,11 @@ class ReleaseMerger(EntityMerger): updated_work_ids = [] redirected_release_ids = [] + if self.dry_run_mode: + eg_id = "dummy-editgroup-id" + else: + eg_id = self.get_editgroup_id() + # execute all the release redirects for release in releases.values(): if release.ident == primary_id: diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index c16053ec..a6d85f7e 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -284,7 +284,7 @@ def 
_rte_container_helper(container: ContainerEntity, release_year: Optional[int t["container_name"] = container.name # this is container.ident, not release.container_id, because there may # be a redirect involved - t["container_id"] = container.ident + t["container_id"] = container.redirect or container.ident t["container_issnl"] = container.issnl issns = [container.issnl, container.issne, container.issnp] issns = list(set([i for i in issns if i])) diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py new file mode 100644 index 00000000..a657522e --- /dev/null +++ b/python/tests/merge_containers.py @@ -0,0 +1,116 @@ +from fatcat_openapi_client import ContainerEntity +from fixtures import api + +from fatcat_tools.mergers.containers import ContainerMerger + + +def test_choose_primary_container(api) -> None: + + release_counts = dict() + redirects = dict() + em = ContainerMerger(api=api) + + ce_stub = ContainerEntity( + ident="pppppp5apzfhbbxxc7rgu2yw6m", + name="dummy journal", + ) + release_counts[ce_stub.ident] = 0 + redirects[ce_stub.ident] = [] + + ce_partial = ContainerEntity( + ident="eeeeeeeapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial.ident] = 0 + redirects[ce_partial.ident] = [] + + ce_partial_redirects = ContainerEntity( + ident="rrrrrrrrrrfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_partial_redirects.ident] = 0 + redirects[ce_partial_redirects.ident] = [ + "zzzzzzzzrrfhbbxxc7rgu2yw6m", + ] + + ce_complete_zero = ContainerEntity( + ident="oooooooapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, 
ia=dict(asdf=True)), + ) + release_counts[ce_complete_zero.ident] = 0 + redirects[ce_complete_zero.ident] = [] + + ce_complete_small = ContainerEntity( + ident="cccccccapzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_small.ident] = 10 + redirects[ce_complete_small.ident] = [] + + ce_complete_big = ContainerEntity( + ident="ddddddddpzfhbbxxc7rgu2yw6m", + name="dummy complete journal", + publisher="some publisher", + issnl="1234-5678", + publication_status="active", + extra=dict(asdf=123, ia=dict(asdf=True)), + ) + release_counts[ce_complete_big.ident] = 9999999 + redirects[ce_complete_big.ident] = [] + + assert ( + em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts) + == ce_partial.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts + ) + == ce_complete_zero.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial], + redirects, + release_counts, + ) + == ce_partial_redirects.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_small.ident + ) + assert ( + em.choose_primary_container( + [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial], + redirects, + release_counts, + ) + == ce_complete_big.ident + ) + assert ( + em.choose_primary_container( + [ce_complete_small, ce_complete_big], redirects, release_counts + ) + == ce_complete_big.ident + ) |