author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-29 14:31:26 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-29 14:31:26 -0800
commit     ec2809ef2ac51c992463839c1e3451927f5e1661 (patch)
tree       d95c1b17e3bd8fc93179551ee130004c73513c16
parent     eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (diff)
parent     487923dc81d877207556f8a90a3ce048fe6bafb5 (diff)
download   fatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.tar.gz
           fatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.zip
Merge branch 'bnewbold-container-merger'
-rw-r--r--   notes/cleanups/container_issnl_dedupe.md           105
-rwxr-xr-x   notes/cleanups/scripts/container_dupe_to_json.py    55
-rw-r--r--   python/fatcat_tools/mergers/common.py                6
-rw-r--r--   python/fatcat_tools/mergers/containers.py           246
-rw-r--r--   python/fatcat_tools/mergers/releases.py               6
-rw-r--r--   python/fatcat_tools/transforms/elasticsearch.py       2
-rw-r--r--   python/tests/merge_containers.py                    116
7 files changed, 532 insertions, 4 deletions
diff --git a/notes/cleanups/container_issnl_dedupe.md b/notes/cleanups/container_issnl_dedupe.md
new file mode 100644
index 00000000..a76bc961
--- /dev/null
+++ b/notes/cleanups/container_issnl_dedupe.md
@@ -0,0 +1,105 @@
+
+Simply de-duplicating container entities on the basis of ISSN-L.
+
+Initial plan is to:
+
+- only merge containers with zero (0) release entities pointing at them
+- not update any containers which have had human edits
+- not merge additional metadata from redirected entities to the "primary" entity
+
+
+## Prep
+
+Using commands from `check_issnl.sh`:
+
+ zcat container_export.json.gz \
+ | jq '[.issnl, .ident] | @tsv' -r \
+ | sort -S 4G \
+ | uniq -D -w 9 \
+ > issnl_ident.dupes.tsv
+
+ wc -l issnl_ident.dupes.tsv
+ # 3174 issnl_ident.dupes.tsv
+
+ cut -f1 issnl_ident.dupes.tsv | uniq | wc -l
+ # 835
+
+Run transform script:
+
+ cat issnl_ident.dupes.tsv | ./container_dupe_to_json.py | pv -l > container_issnl_dupes.json
+
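+Each output line is one merge request object, as emitted by
+`container_dupe_to_json.py` (keys sorted). A representative line looks roughly
+like the following; the idents and ISSN-L shown here are illustrative, not
+taken from the real output:
+
+    {"duplicate_ids": ["aaaaaaaaaaaaaeiraaaaaaaaam", "aaaaaaaaaaaaaeiraaaaaaaaaq"], "entity_type": "container", "evidence": {"extid": "1234-5678", "extid_type": "issnl"}, "primary_id": null}
+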
+Create a small random sample:
+
+ shuf -n100 container_issnl_dupes.json > container_issnl_dupes.sample.json
+
+## QA Testing
+
+ git log | head -n1
+ # commit e72d61e60c43911b6d77c4842951441235561dcf
+
+ export FATCAT_AUTH_API_TOKEN=[...]
+
+ head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+
+Got various errors and patched them:
+
+ AttributeError: 'EntityHistoryEntry' object has no attribute 'editor'
+
+ requests.exceptions.HTTPError: 404 Client Error: NOT FOUND for url: https://fatcat.wiki/container/%7Bident%7D/stats.json
+
+ fatcat_openapi_client.exceptions.ApiValueError: Missing the required parameter `editgroup_id` when calling `accept_editgroup`
+
+Run again:
+
+ head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+ # Running in dry-run mode!
+ # Counter({'updated-entities': 96, 'skip-container-release-count': 84, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+Finally! dry-run mode actually worked. Try entire sample in dry-run:
+
+ cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+ # Running in dry-run mode!
+ # Counter({'updated-entities': 310, 'skip-container-release-count': 251, 'lines': 100, 'merged': 100, 'skip': 0, 'updated-total': 0})
+
+How about a small `max-container-releases`:
+
+ cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+ # Running in dry-run mode!
+ # Counter({'updated-entities': 310, 'skip-container-release-count': 251, 'lines': 100, 'merged': 100, 'skip': 0, 'updated-total': 0})
+
+Exact same count... maybe something isn't working? Debugged and fixed it.
+
+ requests.exceptions.HTTPError: 503 Server Error: SERVICE UNAVAILABLE for url: https://fatcat.wiki/container/xn7i2sdijzbypcetz77kttj76y/stats.json
+
+ # Running in dry-run mode!
+ # Counter({'updated-entities': 310, 'lines': 100, 'merged': 100, 'skip-container-release-count': 92, 'skip': 0, 'updated-total': 0})
+
+From skimming, it looks like 100 is probably a good cut-off. There are quite a
+lot of these dupes!
+
+Try some actual merges:
+
+ head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" merge-containers -
+ # Counter({'updated-entities': 96, 'skip-container-release-count': 84, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+Run immediately again:
+
+ # Counter({'lines': 25, 'skip': 25, 'skip-not-active-entity': 25, 'skip-container-release-count': 2, 'merged': 0, 'updated-total': 0})
+
+Run all the samples, with limit of 100 releases:
+
+ cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+ | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" merge-containers - --max-container-releases 100
+ # Counter({'updated-entities': 214, 'lines': 100, 'merged': 75, 'skip': 25, 'skip-not-active-entity': 25, 'skip-container-release-count': 15, 'updated-total': 0})
+
+Wow, there are going to be a lot of these containers not merged because they
+have so many releases! Will have to do a second, more carefully reviewed (?)
+round of merging.
+
+Unfortunately, not seeing any human-edited container entities here to check if
+that filter is working.
diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
new file mode 100755
index 00000000..2e841c69
--- /dev/null
+++ b/notes/cleanups/scripts/container_dupe_to_json.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate container entity rows into JSON
+objects which can be passed to the container entity merger.
+
+It is initially used to de-dupe ISSN-Ls. The script is based on
+`file_dupe_to_json.py`.
+"""
+
+import json
+import sys
+
+EXTID_TYPE = "issnl"
+
+
+def print_group(extid, dupe_ids):
+ if len(dupe_ids) < 2:
+ return
+ group = dict(
+ entity_type="container",
+ primary_id=None,
+ duplicate_ids=dupe_ids,
+ evidence=dict(
+ extid=extid,
+ extid_type=EXTID_TYPE,
+ ),
+ )
+ print(json.dumps(group, sort_keys=True))
+
+def run():
+ last_extid = None
+ dupe_ids = []
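+    # input rows are sorted by extid (see check_issnl.sh), so duplicate
+    # idents for the same ISSN-L arrive on consecutive lines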
+ for l in sys.stdin:
+ l = l.strip()
+ if not l:
+ continue
+ (row_extid, row_id) = l.split("\t")[0:2]
+ if EXTID_TYPE == "issnl":
+ assert len(row_extid) == 9
+ else:
+ raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
+ if row_extid == last_extid:
+ dupe_ids.append(row_id)
+ continue
+ elif dupe_ids:
+ print_group(last_extid, dupe_ids)
+ last_extid = row_extid
+ dupe_ids = [row_id]
+ if last_extid and dupe_ids:
+ print_group(last_extid, dupe_ids)
+
+
+if __name__ == "__main__":
+ run()
diff --git a/python/fatcat_tools/mergers/common.py b/python/fatcat_tools/mergers/common.py
index e25f8194..f8197519 100644
--- a/python/fatcat_tools/mergers/common.py
+++ b/python/fatcat_tools/mergers/common.py
@@ -111,7 +111,8 @@ class EntityMerger(EntityImporter):
else:
self.counts["skip"] += 1
if self._edit_count >= self.edit_batch_size:
- self.api.accept_editgroup(self._editgroup_id)
+ if not self.dry_run_mode:
+ self.api.accept_editgroup(self._editgroup_id)
self._editgroup_id = None
self._edit_count = 0
self._idents_inflight = []
@@ -128,7 +129,8 @@ class EntityMerger(EntityImporter):
def finish(self) -> Counter:
if self._edit_count > 0:
- self.api.accept_editgroup(self._editgroup_id)
+ if not self.dry_run_mode:
+ self.api.accept_editgroup(self._editgroup_id)
self._editgroup_id = None
self._edit_count = 0
self._idents_inflight = []
diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py
new file mode 100644
index 00000000..1b9975e5
--- /dev/null
+++ b/python/fatcat_tools/mergers/containers.py
@@ -0,0 +1,246 @@
+import argparse
+import os
+import sys
+from typing import Any, Dict, List, Optional
+
+import fatcat_openapi_client
+from fatcat_openapi_client.models import ContainerEntity
+
+from fatcat_tools import authenticated_api
+from fatcat_tools.harvest.harvest_common import requests_retry_session
+from fatcat_tools.importers import JsonLinePusher
+
+from .common import EntityMerger
+
+
+class ContainerMerger(EntityMerger):
+ """
+    Combines container entities into a single primary. Does not merge partial
+    metadata (identifiers, etc). Can choose the "primary" container to redirect
+    to, if necessary.
+
+ The `max_container_releases` argument (int or None) can be used to
+ prevent redirecting containers which already have releases pointed at them
+ (based on release ES index stats). If set to 0, no releases are allowed. If
+ set to None (or a negative number), the parameter is ignored.
+
+ The `clobber_human_edited` flag (boolean) can be used to allow updating
+ entities even if they have had human edits in the past.
+
+ This merger makes external HTTP requests to fatcat.wiki, for the purpose of
+ fetching release stats.
+ """
+
+ def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None:
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None) or "Automated merge of container entities"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger")
+ self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False)
+ self.clobber_human_edited: bool = kwargs.get("clobber_human_edited", False)
+ self.max_container_releases: Optional[int] = kwargs.get("max_container_releases", 0)
+ if self.max_container_releases is not None and self.max_container_releases < 0:
+ self.max_container_releases = None
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
+ self.entity_type_name = "container"
+        self.http_session = requests_retry_session(status_forcelist=[429, 500, 502, 503, 504])
+
+ def choose_primary_container(
+ self,
+ entities: List[ContainerEntity],
+ redirects: Dict[str, List[str]],
+ release_counts: Dict[str, int],
+ ) -> str:
+ assert entities and len(entities) >= 2
+
+ # want to sort in descending order, so reverse=True
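+        # tuple elements compare in order, so release count dominates, then
+        # redirect count, and so on down the list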
+ entities = sorted(
+ entities,
+ key=lambda a: (
+ # linked release counts
+ release_counts[a.ident],
+ # number of redirected entities
+ len(redirects[a.ident]),
+ # not a stub
+ bool(a.container_type != "stub"),
+ # has a strong identifier?
+ bool(a.issnl or (a.extra and a.extra.get("dblp"))),
+ # has IA sim metadata?
+ bool(a.extra and a.extra.get("ia")),
+ # has additional metadata?
+ bool(a.publication_status or a.container_type),
+ bool(a.extra),
+ ),
+ reverse=True,
+ )
+ return entities[0].ident
+
+ def try_merge(
+ self,
+ dupe_ids: List[str],
+ primary_id: Optional[str] = None,
+ evidence: Optional[Dict[str, Any]] = None,
+ ) -> int:
+
+ # currently required for extid validation
+ if not evidence or not (evidence.get("extid_type") and evidence.get("extid")):
+ self.counts["skip-missing-evidence"] += 1
+ return 0
+
+ updated_entities = 0
+ entities: Dict[str, ContainerEntity] = dict()
+ redirects: Dict[str, List[str]] = dict()
+ release_counts: Dict[str, int] = dict()
+
+ all_ids = dupe_ids.copy()
+ if primary_id:
+ all_ids.append(primary_id)
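+        # fetch each candidate entity, its redirect list, and its release
+        # count; any hard-stop condition (missing, inactive, extid mismatch,
+        # human-edited) skips the whole group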
+ for ident in all_ids:
+ try:
+ entities[ident] = self.api.get_container(ident)
+ redirects[ident] = self.api.get_container_redirects(ident)
+ except fatcat_openapi_client.ApiException as ae:
+ if ae.status == 404:
+ self.counts["skip-entity-not-found"] += 1
+ return 0
+ else:
+ raise
+ if entities[ident].state != "active":
+ self.counts["skip-not-active-entity"] += 1
+ return 0
+ if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]:
+ self.counts["skip-extid-mismatch"] += 1
+ return 0
+ if not self.clobber_human_edited:
+ edit_history = self.api.get_container_history(ident)
+ for edit in edit_history:
+ if edit.editgroup.editor.is_bot is not True:
+ print(f"skipping container_{ident}: human edited", file=sys.stderr)
+ self.counts["skip-human-edited"] += 1
+ return 0
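+            # release counts come from the public fatcat.wiki stats endpoint
+            # (backed by the release search index)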
+ resp = self.http_session.get(f"https://fatcat.wiki/container/{ident}/stats.json")
+ resp.raise_for_status()
+ stats = resp.json()
+ release_counts[ident] = stats["total"]
+ if self.max_container_releases is not None:
+ if release_counts[ident] > self.max_container_releases:
+ self.counts["skip-container-release-count"] += 1
+ print(
+ f"skipping container_{ident}: release count {release_counts[ident]}",
+ file=sys.stderr,
+ )
+ continue
+
+ if not primary_id:
+ primary_id = self.choose_primary_container(
+ list(entities.values()), redirects, release_counts
+ )
+ dupe_ids = [d for d in dupe_ids if d != primary_id]
+
+ assert primary_id not in dupe_ids
+
+ if self.dry_run_mode:
+ eg_id = "dummy-editgroup-id"
+ else:
+ eg_id = self.get_editgroup_id()
+
+ primary = entities[primary_id]
+ for other_id in dupe_ids:
+ other = entities[other_id]
+ if not self.dry_run_mode:
+ self.api.update_container(
+ eg_id,
+ other.ident,
+ ContainerEntity(
+ redirect=primary.ident,
+ edit_extra=evidence,
+ ),
+ )
+ updated_entities += 1
+
+ return updated_entities
+
+
+def run_merge_containers(args: argparse.Namespace) -> None:
+ em = ContainerMerger(
+ args.api,
+ edit_batch_size=args.batch_size,
+ dry_run_mode=args.dry_run,
+ max_container_releases=args.max_container_releases,
+ clobber_human_edited=args.clobber_human_edited,
+ editgroup_description=args.editgroup_description_override,
+ )
+ JsonLinePusher(em, args.json_file).run()
+
+
+def main() -> None:
+ """
+ Invoke like:
+
+ python3 -m fatcat_tools.mergers.containers [options]
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.add_argument(
+ "--editgroup-description-override",
+ help="editgroup description override",
+ default=None,
+ type=str,
+ )
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="don't actually commit merges, just count what would have been",
+ )
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_API_TOKEN",
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_merge_containers = subparsers.add_parser("merge-containers")
+ sub_merge_containers.set_defaults(func=run_merge_containers)
+ sub_merge_containers.add_argument(
+ "json_file",
+ help="source of merge lines to process (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_merge_containers.add_argument(
+ "--clobber-human-edited",
+ action="store_true",
+ help="if set, entities which have non-bot (human) edits can be updated/redirected",
+ )
+ sub_merge_containers.add_argument(
+ "--max-container-releases",
+ default=0,
+ type=int,
+ help="if container has more than this many releases linked, don't update (set to -1 to disable limit)",
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do!")
+ sys.exit(-1)
+
+ # allow editgroup description override via env variable (but CLI arg takes
+ # precedence)
+ if not args.editgroup_description_override and os.environ.get(
+ "FATCAT_EDITGROUP_DESCRIPTION"
+ ):
+ args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")
+
+ args.api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py
index 1f995b00..0149bbbe 100644
--- a/python/fatcat_tools/mergers/releases.py
+++ b/python/fatcat_tools/mergers/releases.py
@@ -92,7 +92,6 @@ class ReleaseMerger(EntityMerger):
updated_entities = 0
releases = dict()
existing_redirects: Dict[str, List[str]] = dict()
- eg_id = self.get_editgroup_id()
all_ids = dupe_ids.copy()
if primary_id:
@@ -113,6 +112,11 @@ class ReleaseMerger(EntityMerger):
updated_work_ids = []
redirected_release_ids = []
+ if self.dry_run_mode:
+ eg_id = "dummy-editgroup-id"
+ else:
+ eg_id = self.get_editgroup_id()
+
# execute all the release redirects
for release in releases.values():
if release.ident == primary_id:
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index c16053ec..a6d85f7e 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -284,7 +284,7 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int
t["container_name"] = container.name
# this is container.ident, not release.container_id, because there may
# be a redirect involved
- t["container_id"] = container.ident
+ t["container_id"] = container.redirect or container.ident
t["container_issnl"] = container.issnl
issns = [container.issnl, container.issne, container.issnp]
issns = list(set([i for i in issns if i]))
diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py
new file mode 100644
index 00000000..a657522e
--- /dev/null
+++ b/python/tests/merge_containers.py
@@ -0,0 +1,116 @@
+from fatcat_openapi_client import ContainerEntity
+from fixtures import api
+
+from fatcat_tools.mergers.containers import ContainerMerger
+
+
+def test_choose_primary_container(api) -> None:
+
+ release_counts = dict()
+ redirects = dict()
+ em = ContainerMerger(api=api)
+
+ ce_stub = ContainerEntity(
+ ident="pppppp5apzfhbbxxc7rgu2yw6m",
+ name="dummy journal",
+ )
+ release_counts[ce_stub.ident] = 0
+ redirects[ce_stub.ident] = []
+
+ ce_partial = ContainerEntity(
+ ident="eeeeeeeapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_partial.ident] = 0
+ redirects[ce_partial.ident] = []
+
+ ce_partial_redirects = ContainerEntity(
+ ident="rrrrrrrrrrfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_partial_redirects.ident] = 0
+ redirects[ce_partial_redirects.ident] = [
+ "zzzzzzzzrrfhbbxxc7rgu2yw6m",
+ ]
+
+ ce_complete_zero = ContainerEntity(
+ ident="oooooooapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_zero.ident] = 0
+ redirects[ce_complete_zero.ident] = []
+
+ ce_complete_small = ContainerEntity(
+ ident="cccccccapzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_small.ident] = 10
+ redirects[ce_complete_small.ident] = []
+
+ ce_complete_big = ContainerEntity(
+ ident="ddddddddpzfhbbxxc7rgu2yw6m",
+ name="dummy complete journal",
+ publisher="some publisher",
+ issnl="1234-5678",
+ publication_status="active",
+ extra=dict(asdf=123, ia=dict(asdf=True)),
+ )
+ release_counts[ce_complete_big.ident] = 9999999
+ redirects[ce_complete_big.ident] = []
+
+ assert (
+ em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts)
+ == ce_partial.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts
+ )
+ == ce_complete_zero.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_partial_redirects.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_zero, ce_complete_small, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_complete_small.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial],
+ redirects,
+ release_counts,
+ )
+ == ce_complete_big.ident
+ )
+ assert (
+ em.choose_primary_container(
+ [ce_complete_small, ce_complete_big], redirects, release_counts
+ )
+ == ce_complete_big.ident
+ )