From 38328c25674fee7781a8d8601e1d47de04186f19 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Dec 2020 19:54:47 -0800 Subject: add fuzzy matching helper to importer base class Using fuzzycat. Add basic test coverage. --- python/fatcat_tools/importers/common.py | 64 +++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 3c810391..1cce5fd0 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,16 +7,23 @@ import sqlite3 import datetime import subprocess from collections import Counter -from confluent_kafka import Consumer, KafkaException +from typing import Optional, Tuple import xml.etree.ElementTree as ET +import elasticsearch from bs4 import BeautifulSoup +from confluent_kafka import Consumer, KafkaException import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity from fatcat_openapi_client.rest import ApiException +from fuzzycat.matching import match_release_fuzzy +import fuzzycat.common +import fuzzycat.verify # TODO: refactor so remove need for this (re-imports for backwards compatibility) from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.transforms import entity_to_dict DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 @@ -145,12 +152,16 @@ class EntityImporter: self.api = api self.do_updates = bool(kwargs.get('do_updates', True)) + self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) self.editgroup_description = kwargs.get('editgroup_description') self.editgroup_extra = eg_extra - self.reset() + + self.es_client = kwargs.get('es_client') + if not self.es_client: + self.es_client = 
elasticsearch.Elasticsearch("https://search.fatcat.wiki") self._issnl_id_map = dict() self._orcid_id_map = dict() @@ -158,6 +169,8 @@ class EntityImporter: self._doi_id_map = dict() self._pmid_id_map = dict() + self.reset() + def reset(self): self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) self._edit_count = 0 @@ -433,6 +446,53 @@ class EntityImporter: existing.urls = [u for u in existing.urls if u.url not in redundant_urls] return existing + def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]: + """ + This helper function uses fuzzycat (and elasticsearch) to look for + existing release entities with similar metadata. + + Returns None if there was no match of any kind, or a single tuple + (status: str, existing: ReleaseEntity) if there was a match. + + Status string is one of the fuzzycat.common.Status, with "strongest + match" in this sorted order: + + - EXACT + - STRONG + - WEAK + - AMBIGUOUS + + Eg, if there is any EXACT match that is always returned; an AMBIGUOUS + result is only returned if all the candidate matches were ambiguous. + """ + + # this map used to establish priority order of verified matches + STATUS_SORT = { + fuzzycat.common.Status.TODO: 0, + fuzzycat.common.Status.EXACT: 10, + fuzzycat.common.Status.STRONG: 20, + fuzzycat.common.Status.WEAK: 30, + fuzzycat.common.Status.AMBIGUOUS: 40, + fuzzycat.common.Status.DIFFERENT: 60, + } + + # TODO: the size here is a first guess; what should it really be? 
+ candidates = match_release_fuzzy(release, size=10, es=self.es_client) + if not candidates: + return None + + release_dict = entity_to_dict(release, api_client=self.api.api_client) + verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + + # choose the "closest" match + closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] + if closest[0].status == fuzzycat.common.Status.DIFFERENT: + return None + elif closest[0].status == fuzzycat.common.Status.TODO: + raise NotImplementedError("fuzzycat verify hit a Status.TODO") + else: + return (closest[0].status.name, closest[1]) + class RecordPusher: """ -- cgit v1.2.3 From 6d5811693c36b9e73dedf0205c40f2aed63e2870 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Dec 2020 19:56:01 -0800 Subject: add fuzzy match filtering to DOAJ importer In this default configuration, any entities with a fuzzy match (even "ambiguous") will be skipped at import time, to prevent creating duplicates. This is conservative towards not creating new/duplicate entities. In the future, as we get more confidence in fuzzy match/verification, we can start to ignore AMBIGUOUS, handle EXACT as same release, and merge STRONG (and WEAK?) matches under the same work entity. 
--- python/fatcat_tools/importers/doaj_article.py | 11 +++++++++-- python/tests/import_doaj.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 03752484..191a65d8 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter): return False break - # TODO: in the future could do fuzzy match here, eg using elasticsearch + if not existing and self.do_fuzzy_match: + fuzzy_result = self.match_existing_release_fuzzy(re) + # TODO: in the future, could assign work_id for clustering, or for + # "EXACT" match, set existing and allow (optional) update code path + # to run + if fuzzy_result is not None: + self.counts["exists-fuzzy"] += 1 + return False - # create entity + # if no fuzzy existing match, create entity if not existing: return True diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py index d69aebd7..17a23257 100644 --- a/python/tests/import_doaj.py +++ b/python/tests/import_doaj.py @@ -3,6 +3,7 @@ import json import datetime import pytest +import elasticsearch import fatcat_openapi_client from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher @@ -11,14 +12,22 @@ from fixtures import * @pytest.fixture(scope="function") -def doaj_importer(api): +def doaj_importer(api, mocker): + es_client = elasticsearch.Elasticsearch("mockbackend") + mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield DoajArticleImporter(api, issn_file, bezerk_mode=True) + yield DoajArticleImporter( + api, + issn_file, + bezerk_mode=True, + es_client=es_client, + ) def test_doaj_importer(doaj_importer): last_index = doaj_importer.api.get_changelog(limit=1)[0].index with 
open("tests/files/example_doaj_articles.json", "r") as f: doaj_importer.bezerk_mode = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() assert counts["insert"] == 5 assert counts["exists"] == 0 @@ -60,6 +69,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = False + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 4 @@ -72,6 +82,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 @@ -84,6 +95,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 -- cgit v1.2.3 From 5eeb7a9d61beb8cb40fd89bd91fcd9dd820035aa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 17 Dec 2020 16:01:04 -0800 Subject: update fuzzy helper to pass 'reason' through to import code The motivation for this change is to enable passing the 'reason' through to edit extra metadata, in cases where we merge or cluster releases. 
--- python/fatcat_tools/importers/common.py | 6 +++--- python/tests/import_common.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 1cce5fd0..6dc2ab9e 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -446,13 +446,13 @@ class EntityImporter: existing.urls = [u for u in existing.urls if u.url not in redundant_urls] return existing - def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]: + def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for existing release entities with similar metadata. Returns None if there was no match of any kind, or a single tuple - (status: str, existing: ReleaseEntity) if there was a match. + (status: str, reason: str, existing: ReleaseEntity) if there was a match. 
Status string is one of the fuzzycat.common.Status, with "strongest match" in this sorted order: @@ -491,7 +491,7 @@ class EntityImporter: elif closest[0].status == fuzzycat.common.Status.TODO: raise NotImplementedError("fuzzycat verify hit a Status.TODO") else: - return (closest[0].status.name, closest[1]) + return (closest[0].status.name, closest[0].reason.value, closest[1]) class RecordPusher: diff --git a/python/tests/import_common.py b/python/tests/import_common.py index 9f04ebe0..d0db014e 100644 --- a/python/tests/import_common.py +++ b/python/tests/import_common.py @@ -63,11 +63,11 @@ def test_fuzzy_match_different(entity_importer, mocker) -> None: match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy') match_raw.side_effect = [[r3, r2, r3, r2]] resp = entity_importer.match_existing_release_fuzzy(r1) - assert resp == ("STRONG", r2) + assert (resp[0], resp[2]) == ("STRONG", r2) match_raw.side_effect = [[r2, r2, r3, r1]] resp = entity_importer.match_existing_release_fuzzy(r1) - assert resp == ("EXACT", r1) + assert (resp[0], resp[2]) == ("EXACT", r1) match_raw.side_effect = [[r3]] resp = entity_importer.match_existing_release_fuzzy(r1) -- cgit v1.2.3