aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:31:26 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:31:26 -0800
commitec2809ef2ac51c992463839c1e3451927f5e1661 (patch)
treed95c1b17e3bd8fc93179551ee130004c73513c16 /python
parenteb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (diff)
parent487923dc81d877207556f8a90a3ce048fe6bafb5 (diff)
downloadfatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.tar.gz
fatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.zip
Merge branch 'bnewbold-container-merger'
Diffstat (limited to 'python')
-rw-r--r--python/fatcat_tools/mergers/common.py6
-rw-r--r--python/fatcat_tools/mergers/containers.py246
-rw-r--r--python/fatcat_tools/mergers/releases.py6
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py2
-rw-r--r--python/tests/merge_containers.py116
5 files changed, 372 insertions, 4 deletions
diff --git a/python/fatcat_tools/mergers/common.py b/python/fatcat_tools/mergers/common.py
index e25f8194..f8197519 100644
--- a/python/fatcat_tools/mergers/common.py
+++ b/python/fatcat_tools/mergers/common.py
@@ -111,7 +111,8 @@ class EntityMerger(EntityImporter):
else:
self.counts["skip"] += 1
if self._edit_count >= self.edit_batch_size:
- self.api.accept_editgroup(self._editgroup_id)
+ if not self.dry_run_mode:
+ self.api.accept_editgroup(self._editgroup_id)
self._editgroup_id = None
self._edit_count = 0
self._idents_inflight = []
@@ -128,7 +129,8 @@ class EntityMerger(EntityImporter):
def finish(self) -> Counter:
if self._edit_count > 0:
- self.api.accept_editgroup(self._editgroup_id)
+ if not self.dry_run_mode:
+ self.api.accept_editgroup(self._editgroup_id)
self._editgroup_id = None
self._edit_count = 0
self._idents_inflight = []
diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py
new file mode 100644
index 00000000..1b9975e5
--- /dev/null
+++ b/python/fatcat_tools/mergers/containers.py
@@ -0,0 +1,246 @@
+import argparse
+import os
+import sys
+from typing import Any, Dict, List, Optional
+
+import fatcat_openapi_client
+from fatcat_openapi_client.models import ContainerEntity
+
+from fatcat_tools import authenticated_api
+from fatcat_tools.harvest.harvest_common import requests_retry_session
+from fatcat_tools.importers import JsonLinePusher
+
+from .common import EntityMerger
+
+
+class ContainerMerger(EntityMerger):
+ """
+ Combines container entities into a single primary. Does not merge partial
+ metadata (identifiers, etc). Can chose "primary" container to redirect to,
+ if necessary.
+
+ The `max_container_releases` argument (int or None) can be used to
+ prevent redirecting containers which already have releases pointed at them
+ (based on release ES index stats). If set to 0, no releases are allowed. If
+ set to None (or a negative number), the parameter is ignored.
+
+ The `clobber_human_edited` flag (boolean) can be used to allow updating
+ entities even if they have had human edits in the past.
+
+ This merger makes external HTTP requests to fatcat.wiki, for the purpose of
+ fetching release stats.
+ """
+
+ def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None:
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None) or "Automated merge of container entities"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger")
+ self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False)
+ self.clobber_human_edited: bool = kwargs.get("clobber_human_edited", False)
+ self.max_container_releases: Optional[int] = kwargs.get("max_container_releases", 0)
+ if self.max_container_releases is not None and self.max_container_releases < 0:
+ self.max_container_releases = None
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
+ self.entity_type_name = "container"
+ self.http_session = requests_retry_session(status_forcelist=[429, 500, 502, 504, 504])
+
+ def choose_primary_container(
+ self,
+ entities: List[ContainerEntity],
+ redirects: Dict[str, List[str]],
+ release_counts: Dict[str, int],
+ ) -> str:
+ assert entities and len(entities) >= 2
+
+ # want to sort in descending order, so reverse=True
+ entities = sorted(
+ entities,
+ key=lambda a: (
+ # linked release counts
+ release_counts[a.ident],
+ # number of redirected entities
+ len(redirects[a.ident]),
+ # not a stub
+ bool(a.container_type != "stub"),
+ # has a strong identifier?
+ bool(a.issnl or (a.extra and a.extra.get("dblp"))),
+ # has IA sim metadata?
+ bool(a.extra and a.extra.get("ia")),
+ # has additional metadata?
+ bool(a.publication_status or a.container_type),
+ bool(a.extra),
+ ),
+ reverse=True,
+ )
+ return entities[0].ident
+
+ def try_merge(
+ self,
+ dupe_ids: List[str],
+ primary_id: Optional[str] = None,
+ evidence: Optional[Dict[str, Any]] = None,
+ ) -> int:
+
+ # currently required for extid validation
+ if not evidence or not (evidence.get("extid_type") and evidence.get("extid")):
+ self.counts["skip-missing-evidence"] += 1
+ return 0
+
+ updated_entities = 0
+ entities: Dict[str, ContainerEntity] = dict()
+ redirects: Dict[str, List[str]] = dict()
+ release_counts: Dict[str, int] = dict()
+
+ all_ids = dupe_ids.copy()
+ if primary_id:
+ all_ids.append(primary_id)
+ for ident in all_ids:
+ try:
+ entities[ident] = self.api.get_container(ident)
+ redirects[ident] = self.api.get_container_redirects(ident)
+ except fatcat_openapi_client.ApiException as ae:
+ if ae.status == 404:
+ self.counts["skip-entity-not-found"] += 1
+ return 0
+ else:
+ raise
+ if entities[ident].state != "active":
+ self.counts["skip-not-active-entity"] += 1
+ return 0
+ if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]:
+ self.counts["skip-extid-mismatch"] += 1
+ return 0
+ if not self.clobber_human_edited:
+ edit_history = self.api.get_container_history(ident)
+ for edit in edit_history:
+ if edit.editgroup.editor.is_bot is not True:
+ print(f"skipping container_{ident}: human edited", file=sys.stderr)
+ self.counts["skip-human-edited"] += 1
+ return 0
+ resp = self.http_session.get(f"https://fatcat.wiki/container/{ident}/stats.json")
+ resp.raise_for_status()
+ stats = resp.json()
+ release_counts[ident] = stats["total"]
+ if self.max_container_releases is not None:
+ if release_counts[ident] > self.max_container_releases:
+ self.counts["skip-container-release-count"] += 1
+ print(
+ f"skipping container_{ident}: release count {release_counts[ident]}",
+ file=sys.stderr,
+ )
+ continue
+
+ if not primary_id:
+ primary_id = self.choose_primary_container(
+ list(entities.values()), redirects, release_counts
+ )
+ dupe_ids = [d for d in dupe_ids if d != primary_id]
+
+ assert primary_id not in dupe_ids
+
+ if self.dry_run_mode:
+ eg_id = "dummy-editgroup-id"
+ else:
+ eg_id = self.get_editgroup_id()
+
+ primary = entities[primary_id]
+ for other_id in dupe_ids:
+ other = entities[other_id]
+ if not self.dry_run_mode:
+ self.api.update_container(
+ eg_id,
+ other.ident,
+ ContainerEntity(
+ redirect=primary.ident,
+ edit_extra=evidence,
+ ),
+ )
+ updated_entities += 1
+
+ return updated_entities
+
+
+def run_merge_containers(args: argparse.Namespace) -> None:
+ em = ContainerMerger(
+ args.api,
+ edit_batch_size=args.batch_size,
+ dry_run_mode=args.dry_run,
+ max_container_releases=args.max_container_releases,
+ clobber_human_edited=args.clobber_human_edited,
+ editgroup_description=args.editgroup_description_override,
+ )
+ JsonLinePusher(em, args.json_file).run()
+
+
+def main() -> None:
+ """
+ Invoke like:
+
+ python3 -m fatcat_tools.mergers.containers [options]
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.add_argument(
+ "--editgroup-description-override",
+ help="editgroup description override",
+ default=None,
+ type=str,
+ )
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="don't actually commit merges, just count what would have been",
+ )
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_API_TOKEN",
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_merge_containers = subparsers.add_parser("merge-containers")
+ sub_merge_containers.set_defaults(func=run_merge_containers)
+ sub_merge_containers.add_argument(
+ "json_file",
+ help="source of merge lines to process (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_merge_containers.add_argument(
+ "--clobber-human-edited",
+ action="store_true",
+ help="if set, entities which have non-bot (human) edits can be updated/redirected",
+ )
+ sub_merge_containers.add_argument(
+ "--max-container-releases",
+ default=0,
+ type=int,
+ help="if container has more than this many releases linked, don't update (set to -1 to disable limit)",
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do!")
+ sys.exit(-1)
+
+ # allow editgroup description override via env variable (but CLI arg takes
+ # precedence)
+ if not args.editgroup_description_override and os.environ.get(
+ "FATCAT_EDITGROUP_DESCRIPTION"
+ ):
+ args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")
+
+ args.api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py
index 1f995b00..0149bbbe 100644
--- a/python/fatcat_tools/mergers/releases.py
+++ b/python/fatcat_tools/mergers/releases.py
@@ -92,7 +92,6 @@ class ReleaseMerger(EntityMerger):
updated_entities = 0
releases = dict()
existing_redirects: Dict[str, List[str]] = dict()
- eg_id = self.get_editgroup_id()
all_ids = dupe_ids.copy()
if primary_id:
@@ -113,6 +112,11 @@ class ReleaseMerger(EntityMerger):
updated_work_ids = []
redirected_release_ids = []
+ if self.dry_run_mode:
+ eg_id = "dummy-editgroup-id"
+ else:
+ eg_id = self.get_editgroup_id()
+
# execute all the release redirects
for release in releases.values():
if release.ident == primary_id:
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index c16053ec..a6d85f7e 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -284,7 +284,7 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int
t["container_name"] = container.name
# this is container.ident, not release.container_id, because there may
# be a redirect involved
- t["container_id"] = container.ident
+ t["container_id"] = container.redirect or container.ident
t["container_issnl"] = container.issnl
issns = [container.issnl, container.issne, container.issnp]
issns = list(set([i for i in issns if i]))
diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py
new file mode 100644
index 00000000..a657522e
--- /dev/null
+++ b/python/tests/merge_containers.py
@@ -0,0 +1,116 @@
+from fatcat_openapi_client import ContainerEntity
+from fixtures import api
+
+from fatcat_tools.mergers.containers import ContainerMerger
+
+
+def test_choose_primary_container(api) -> None:
+
+ release_counts = dict()
+ redirects = dict()
+ em = ContainerMerger(api=api)
+
+ ce_stub = ContainerEntity(
+ ident="pppppp5apzfhbbxxc7rgu2yw6m",
+ name="dummy journal",
+ )
+ release_counts[ce_stub.ident] = 0
+ redirects[ce_stub.ident] = []
+
+ ce_partial = ContainerEntity(
+ ident="eeeeeeeapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_partial.ident] = 0
+ redirects[ce_partial.ident] = []
+
+ ce_partial_redirects = ContainerEntity(
+ ident="rrrrrrrrrrfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_partial_redirects.ident] = 0
+ redirects[ce_partial_redirects.ident] = [
+ "zzzzzzzzrrfhbbxxc7rgu2yw6m",
+ ]
+
+ ce_complete_zero = ContainerEntity(
+ ident="oooooooapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_zero.ident] = 0
+ redirects[ce_complete_zero.ident] = []
+
+ ce_complete_small = ContainerEntity(
+ ident="cccccccapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_small.ident] = 10
+ redirects[ce_complete_small.ident] = []
+
+ ce_complete_big = ContainerEntity(
+ ident="ddddddddpzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_big.ident] = 9999999
+ redirects[ce_complete_big.ident] = []
+
+ assert (
+ em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts)
+ == ce_partial.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts
+ )
+ == ce_complete_zero.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_partial_redirects.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_zero, ce_complete_small, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_complete_small.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_complete_big.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_complete_small, ce_complete_big], redirects, release_counts
+ )
+ == ce_complete_big.ident
+ )