-rw-r--r--   notes/cleanups/container_issnl_dedupe.md          | 105
-rwxr-xr-x   notes/cleanups/scripts/container_dupe_to_json.py  |  55
-rw-r--r--   python/fatcat_tools/mergers/common.py             |   6
-rw-r--r--   python/fatcat_tools/mergers/containers.py         | 246
-rw-r--r--   python/fatcat_tools/mergers/releases.py            |   6
-rw-r--r--   python/fatcat_tools/transforms/elasticsearch.py    |   2
-rw-r--r--   python/tests/merge_containers.py                   | 116
7 files changed, 532 insertions, 4 deletions
diff --git a/notes/cleanups/container_issnl_dedupe.md b/notes/cleanups/container_issnl_dedupe.md
new file mode 100644
index 00000000..a76bc961
--- /dev/null
+++ b/notes/cleanups/container_issnl_dedupe.md
@@ -0,0 +1,105 @@
+
+Simply de-duplicating container entities on the basis of ISSN-L.
+
+Initial plan is to:
+
+- only merge containers with zero (0) release entities pointing at them
+- not update any containers which have had human edits
+- not merge additional metadata from redirected entities to the "primary" entity
+
+
+## Prep
+
+Using commands from `check_issnl.sh`:
+
+    zcat container_export.json.gz \
+        | jq '[.issnl, .ident] | @tsv' -r \
+        | sort -S 4G \
+        | uniq -D -w 9 \
+        > issnl_ident.dupes.tsv
+
+    wc -l issnl_ident.dupes.tsv
+    # 3174 issnl_ident.dupes.tsv
+
+    cut -f1 issnl_ident.dupes.tsv | uniq | wc -l
+    # 835
+
+Run transform script:
+
+    cat issnl_ident.dupes.tsv | ./container_dupe_to_json.py | pv -l > container_issnl_dupes.json
+
+Create a small random sample:
+
+    shuf -n100 container_issnl_dupes.json > container_issnl_dupes.sample.json
+
+## QA Testing
+
+    git log | head -n1
+    # commit e72d61e60c43911b6d77c4842951441235561dcf
+
+    export FATCAT_AUTH_API_TOKEN=[...]
+
+    head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+
+Got various errors and patched them:
+
+    AttributeError: 'EntityHistoryEntry' object has no attribute 'editor'
+
+    requests.exceptions.HTTPError: 404 Client Error: NOT FOUND for url: https://fatcat.wiki/container/%7Bident%7D/stats.json
+
+    fatcat_openapi_client.exceptions.ApiValueError: Missing the required parameter `editgroup_id` when calling `accept_editgroup`
+
+Run again:
+
+    head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+    # Running in dry-run mode!
+    # Counter({'updated-entities': 96, 'skip-container-release-count': 84, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+Finally! dry-run mode actually worked. Try entire sample in dry-run:
+
+    cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+    # Running in dry-run mode!
+    # Counter({'updated-entities': 310, 'skip-container-release-count': 251, 'lines': 100, 'merged': 100, 'skip': 0, 'updated-total': 0})
+
+How about a small `max-container-releases`:
+
+    cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" --dry-run merge-containers -
+    # Running in dry-run mode!
+    # Counter({'updated-entities': 310, 'skip-container-release-count': 251, 'lines': 100, 'merged': 100, 'skip': 0, 'updated-total': 0})
+
+Exact same count... maybe something isn't working? Debugged and fixed it.
+
+    requests.exceptions.HTTPError: 503 Server Error: SERVICE UNAVAILABLE for url: https://fatcat.wiki/container/xn7i2sdijzbypcetz77kttj76y/stats.json
+
+    # Running in dry-run mode!
+    # Counter({'updated-entities': 310, 'lines': 100, 'merged': 100, 'skip-container-release-count': 92, 'skip': 0, 'updated-total': 0})
+
+From skimming, it looks like 100 is probably a good cut-off. There are sort of
+a lot of these dupes!
+
+Try some actual merges:
+
+    head -n25 /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" merge-containers -
+    # Counter({'updated-entities': 96, 'skip-container-release-count': 84, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+Run immediately again:
+
+    # Counter({'lines': 25, 'skip': 25, 'skip-not-active-entity': 25, 'skip-container-release-count': 2, 'merged': 0, 'updated-total': 0})
+
+Run all the samples, with limit of 100 releases:
+
+    cat /srv/fatcat/datasets/container_issnl_dupes.sample.json \
+        | python -m fatcat_tools.mergers.containers --editgroup-description-override "Automated merging of duplicate container entities with the same ISSN-L" merge-containers - --max-container-releases 100
+    # Counter({'updated-entities': 214, 'lines': 100, 'merged': 75, 'skip': 25, 'skip-not-active-entity': 25, 'skip-container-release-count': 15, 'updated-total': 0})
+
+Wow, there are going to be a lot of these containers not merged because they
+have so many releases! Will have to do a second, more carefully reviewed (?)
+round of merging.
+
+Unfortunately, not seeing any human-edited container entities here to check if
+that filter is working.
diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
new file mode 100755
index 00000000..2e841c69
--- /dev/null
+++ b/notes/cleanups/scripts/container_dupe_to_json.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate container entity rows into JSON
+objects which can be passed to the container entity merger.
+
+It is initially used to de-dupe ISSN-Ls. The script is based on
+`file_dupe_to_json.py`.
+""" + +import json, sys +from typing import Optional + +EXTID_TYPE = "issnl" + + +def print_group(extid, dupe_ids): + if len(dupe_ids) < 2: + return + group = dict( + entity_type="container", + primary_id=None, + duplicate_ids=dupe_ids, + evidence=dict( + extid=extid, + extid_type=EXTID_TYPE, + ), + ) + print(json.dumps(group, sort_keys=True)) + +def run(): + last_extid = None + dupe_ids = [] + for l in sys.stdin: + l = l.strip() + if not l: + continue + (row_extid, row_id) = l.split("\t")[0:2] + if EXTID_TYPE == "issnl": + assert len(row_extid) == 9 + else: + raise Exception(f"extid type not supported yet: {EXTID_TYPE}") + if row_extid == last_extid: + dupe_ids.append(row_id) + continue + elif dupe_ids: + print_group(last_extid, dupe_ids) + last_extid = row_extid + dupe_ids = [row_id] + if last_extid and dupe_ids: + print_group(last_extid, dupe_ids) + + +if __name__=="__main__": + run() diff --git a/python/fatcat_tools/mergers/common.py b/python/fatcat_tools/mergers/common.py index e25f8194..f8197519 100644 --- a/python/fatcat_tools/mergers/common.py +++ b/python/fatcat_tools/mergers/common.py @@ -111,7 +111,8 @@ class EntityMerger(EntityImporter): else: self.counts["skip"] += 1 if self._edit_count >= self.edit_batch_size: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 self._idents_inflight = [] @@ -128,7 +129,8 @@ class EntityMerger(EntityImporter): def finish(self) -> Counter: if self._edit_count > 0: - self.api.accept_editgroup(self._editgroup_id) + if not self.dry_run_mode: + self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 self._idents_inflight = [] diff --git a/python/fatcat_tools/mergers/containers.py b/python/fatcat_tools/mergers/containers.py new file mode 100644 index 00000000..1b9975e5 --- /dev/null +++ b/python/fatcat_tools/mergers/containers.py @@ -0,0 +1,246 @@ +import argparse +import os +import sys +from typing import Any, Dict, List, Optional + +import fatcat_openapi_client +from fatcat_openapi_client.models import ContainerEntity + +from fatcat_tools import authenticated_api +from fatcat_tools.harvest.harvest_common import requests_retry_session +from fatcat_tools.importers import JsonLinePusher + +from .common import EntityMerger + + +class ContainerMerger(EntityMerger): + """ + Combines container entities into a single primary. Does not merge partial + metadata (identifiers, etc). Can chose "primary" container to redirect to, + if necessary. + + The `max_container_releases` argument (int or None) can be used to + prevent redirecting containers which already have releases pointed at them + (based on release ES index stats). If set to 0, no releases are allowed. If + set to None (or a negative number), the parameter is ignored. + + The `clobber_human_edited` flag (boolean) can be used to allow updating + entities even if they have had human edits in the past. + + This merger makes external HTTP requests to fatcat.wiki, for the purpose of + fetching release stats. 
+ """ + + def __init__(self, api: fatcat_openapi_client.ApiClient, **kwargs) -> None: + + eg_desc = ( + kwargs.pop("editgroup_description", None) or "Automated merge of container entities" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ContainerMerger") + self.dry_run_mode: bool = eg_extra.get("dry_run_mode", False) + self.clobber_human_edited: bool = kwargs.get("clobber_human_edited", False) + self.max_container_releases: Optional[int] = kwargs.get("max_container_releases", 0) + if self.max_container_releases is not None and self.max_container_releases < 0: + self.max_container_releases = None + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + self.entity_type_name = "container" + self.http_session = requests_retry_session(status_forcelist=[429, 500, 502, 504, 504]) + + def choose_primary_container( + self, + entities: List[ContainerEntity], + redirects: Dict[str, List[str]], + release_counts: Dict[str, int], + ) -> str: + assert entities and len(entities) >= 2 + + # want to sort in descending order, so reverse=True + entities = sorted( + entities, + key=lambda a: ( + # linked release counts + release_counts[a.ident], + # number of redirected entities + len(redirects[a.ident]), + # not a stub + bool(a.container_type != "stub"), + # has a strong identifier? + bool(a.issnl or (a.extra and a.extra.get("dblp"))), + # has IA sim metadata? + bool(a.extra and a.extra.get("ia")), + # has additional metadata? + bool(a.publication_status or a.container_type), + bool(a.extra), + ), + reverse=True, + ) + return entities[0].ident + + def try_merge( + self, + dupe_ids: List[str], + primary_id: Optional[str] = None, + evidence: Optional[Dict[str, Any]] = None, + ) -> int: + + # currently required for extid validation + if not evidence or not (evidence.get("extid_type") and evidence.get("extid")): + self.counts["skip-missing-evidence"] += 1 + return 0 + + updated_entities = 0 + entities: Dict[str, ContainerEntity] = dict() + redirects: Dict[str, List[str]] = dict() + release_counts: Dict[str, int] = dict() + + all_ids = dupe_ids.copy() + if primary_id: + all_ids.append(primary_id) + for ident in all_ids: + try: + entities[ident] = self.api.get_container(ident) + redirects[ident] = self.api.get_container_redirects(ident) + except fatcat_openapi_client.ApiException as ae: + if ae.status == 404: + self.counts["skip-entity-not-found"] += 1 + return 0 + else: + raise + if entities[ident].state != "active": + self.counts["skip-not-active-entity"] += 1 + return 0 + if getattr(entities[ident], evidence["extid_type"]) != evidence["extid"]: + self.counts["skip-extid-mismatch"] += 1 + return 0 + if not self.clobber_human_edited: + edit_history = self.api.get_container_history(ident) + for edit in edit_history: + if edit.editgroup.editor.is_bot is not True: + print(f"skipping container_{ident}: human edited", file=sys.stderr) + self.counts["skip-human-edited"] += 1 + return 0 + resp = self.http_session.get(f"https://fatcat.wiki/container/{ident}/stats.json") + resp.raise_for_status() + stats = resp.json() + release_counts[ident] = stats["total"] + if self.max_container_releases is not None: + if release_counts[ident] > self.max_container_releases: + self.counts["skip-container-release-count"] += 1 + print( + f"skipping container_{ident}: release count {release_counts[ident]}", + file=sys.stderr, + ) + continue + + if not primary_id: + primary_id = self.choose_primary_container( + list(entities.values()), 
+                list(entities.values()), redirects, release_counts
+            )
+            dupe_ids = [d for d in dupe_ids if d != primary_id]
+
+        assert primary_id not in dupe_ids
+
+        if self.dry_run_mode:
+            eg_id = "dummy-editgroup-id"
+        else:
+            eg_id = self.get_editgroup_id()
+
+        primary = entities[primary_id]
+        for other_id in dupe_ids:
+            other = entities[other_id]
+            if not self.dry_run_mode:
+                self.api.update_container(
+                    eg_id,
+                    other.ident,
+                    ContainerEntity(
+                        redirect=primary.ident,
+                        edit_extra=evidence,
+                    ),
+                )
+            updated_entities += 1
+
+        return updated_entities
+
+
+def run_merge_containers(args: argparse.Namespace) -> None:
+    em = ContainerMerger(
+        args.api,
+        edit_batch_size=args.batch_size,
+        dry_run_mode=args.dry_run,
+        max_container_releases=args.max_container_releases,
+        clobber_human_edited=args.clobber_human_edited,
+        editgroup_description=args.editgroup_description_override,
+    )
+    JsonLinePusher(em, args.json_file).run()
+
+
+def main() -> None:
+    """
+    Invoke like:
+
+        python3 -m fatcat_tools.mergers.containers [options]
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+    )
+    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+    parser.add_argument(
+        "--editgroup-description-override",
+        help="editgroup description override",
+        default=None,
+        type=str,
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="don't actually commit merges, just count what would have been",
+    )
+    parser.set_defaults(
+        auth_var="FATCAT_AUTH_API_TOKEN",
+    )
+    subparsers = parser.add_subparsers()
+
+    sub_merge_containers = subparsers.add_parser("merge-containers")
+    sub_merge_containers.set_defaults(func=run_merge_containers)
+    sub_merge_containers.add_argument(
+        "json_file",
+        help="source of merge lines to process (or stdin)",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
+    sub_merge_containers.add_argument(
+        "--clobber-human-edited",
+        action="store_true",
+        help="if set, entities which have non-bot (human) edits can be updated/redirected",
+    )
+    sub_merge_containers.add_argument(
+        "--max-container-releases",
+        default=0,
+        type=int,
+        help="if container has more than this many releases linked, don't update (set to -1 to disable limit)",
+    )
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do!")
+        sys.exit(-1)
+
+    # allow editgroup description override via env variable (but CLI arg takes
+    # precedence)
+    if not args.editgroup_description_override and os.environ.get(
+        "FATCAT_EDITGROUP_DESCRIPTION"
+    ):
+        args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")
+
+    args.api = authenticated_api(
+        args.host_url,
+        # token is an optional kwarg (can be empty string, None, etc)
+        token=os.environ.get(args.auth_var),
+    )
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/fatcat_tools/mergers/releases.py b/python/fatcat_tools/mergers/releases.py
index 1f995b00..0149bbbe 100644
--- a/python/fatcat_tools/mergers/releases.py
+++ b/python/fatcat_tools/mergers/releases.py
@@ -92,7 +92,6 @@ class ReleaseMerger(EntityMerger):
         updated_entities = 0
         releases = dict()
         existing_redirects: Dict[str, List[str]] = dict()
-        eg_id = self.get_editgroup_id()
 
         all_ids = dupe_ids.copy()
         if primary_id:
@@ -113,6 +112,11 @@ class ReleaseMerger(EntityMerger):
         updated_work_ids = []
         redirected_release_ids = []
 
+        if self.dry_run_mode:
+            eg_id = "dummy-editgroup-id"
+        else:
+            eg_id = self.get_editgroup_id()
+
         # execute all the release redirects
         for release in releases.values():
             if release.ident == primary_id:
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index c16053ec..a6d85f7e 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -284,7 +284,7 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int
     t["container_name"] = container.name
     # this is container.ident, not release.container_id, because there may
     # be a redirect involved
-    t["container_id"] = container.ident
+    t["container_id"] = container.redirect or container.ident
     t["container_issnl"] = container.issnl
     issns = [container.issnl, container.issne, container.issnp]
     issns = list(set([i for i in issns if i]))
diff --git a/python/tests/merge_containers.py b/python/tests/merge_containers.py
new file mode 100644
index 00000000..a657522e
--- /dev/null
+++ b/python/tests/merge_containers.py
@@ -0,0 +1,116 @@
+from fatcat_openapi_client import ContainerEntity
+from fixtures import api
+
+from fatcat_tools.mergers.containers import ContainerMerger
+
+
+def test_choose_primary_container(api) -> None:
+
+    release_counts = dict()
+    redirects = dict()
+    em = ContainerMerger(api=api)
+
+    ce_stub = ContainerEntity(
+        ident="pppppp5apzfhbbxxc7rgu2yw6m",
+        name="dummy journal",
+    )
+    release_counts[ce_stub.ident] = 0
+    redirects[ce_stub.ident] = []
+
+    ce_partial = ContainerEntity(
+        ident="eeeeeeeapzfhbbxxc7rgu2yw6m",
+        name="dummy complete journal",
+        publisher="some publisher",
+        issnl="1234-5678",
+        publication_status="active",
+        extra=dict(asdf=123, ia=dict(asdf=True)),
+    )
+    release_counts[ce_partial.ident] = 0
+    redirects[ce_partial.ident] = []
+
+    ce_partial_redirects = ContainerEntity(
+        ident="rrrrrrrrrrfhbbxxc7rgu2yw6m",
+        name="dummy complete journal",
+        publisher="some publisher",
+        issnl="1234-5678",
+        publication_status="active",
+        extra=dict(asdf=123, ia=dict(asdf=True)),
+    )
+    release_counts[ce_partial_redirects.ident] = 0
+    redirects[ce_partial_redirects.ident] = [
+        "zzzzzzzzrrfhbbxxc7rgu2yw6m",
+    ]
+
+    ce_complete_zero = ContainerEntity(
+        ident="oooooooapzfhbbxxc7rgu2yw6m",
+        name="dummy complete journal",
+        publisher="some publisher",
+        issnl="1234-5678",
+        publication_status="active",
+        extra=dict(asdf=123, ia=dict(asdf=True)),
+    )
+    release_counts[ce_complete_zero.ident] = 0
+    redirects[ce_complete_zero.ident] = []
+
+    ce_complete_small = ContainerEntity(
+        ident="cccccccapzfhbbxxc7rgu2yw6m",
+        name="dummy complete journal",
+        publisher="some publisher",
+        issnl="1234-5678",
+        publication_status="active",
+        extra=dict(asdf=123, ia=dict(asdf=True)),
+    )
+    release_counts[ce_complete_small.ident] = 10
+    redirects[ce_complete_small.ident] = []
+
+    ce_complete_big = ContainerEntity(
+        ident="ddddddddpzfhbbxxc7rgu2yw6m",
+        name="dummy complete journal",
+        publisher="some publisher",
+        issnl="1234-5678",
+        publication_status="active",
+        extra=dict(asdf=123, ia=dict(asdf=True)),
+    )
+    release_counts[ce_complete_big.ident] = 9999999
+    redirects[ce_complete_big.ident] = []
+
+    assert (
+        em.choose_primary_container([ce_stub, ce_partial], redirects, release_counts)
+        == ce_partial.ident
+    )
+    assert (
+        em.choose_primary_container(
+            [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts
+        )
+        == ce_complete_zero.ident
+    )
+    assert (
+        em.choose_primary_container(
+            [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial],
+            redirects,
+            release_counts,
+        )
+        == ce_partial_redirects.ident
+    )
+    assert (
+        em.choose_primary_container(
+            [ce_stub, ce_complete_zero, ce_complete_small, ce_partial],
+            redirects,
+            release_counts,
+        )
+        == ce_complete_small.ident
+    )
+    assert (
+        em.choose_primary_container(
+            [ce_stub, ce_complete_big, ce_complete_zero, ce_complete_small, ce_partial],
+            redirects,
+            release_counts,
+        )
+        == ce_complete_big.ident
+    )
+    assert (
+        em.choose_primary_container(
+            [ce_complete_small, ce_complete_big], redirects, release_counts
+        )
+        == ce_complete_big.ident
+    )
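
For reference, each line consumed by `merge-containers` is a JSON "merge group" of the shape emitted by `print_group()` in `container_dupe_to_json.py` above. A rough sketch of one such line follows; the idents are made-up placeholders, not real fatcat entities:

    {"duplicate_ids": ["aaaaaaaaaaaaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbbbbbbb"], "entity_type": "container", "evidence": {"extid": "1234-5678", "extid_type": "issnl"}, "primary_id": null}

With `primary_id` left null, `ContainerMerger.try_merge()` chooses the primary itself via `choose_primary_container()`; the `evidence` block is required, and its `extid`/`extid_type` must match each fetched entity or the group is skipped.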