From 38328c25674fee7781a8d8601e1d47de04186f19 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Dec 2020 19:54:47 -0800 Subject: add fuzzy matching helper to importer base class Using fuzzycat. Add basic test coverage. --- python/tests/fixtures.py | 7 ++++ python/tests/import_common.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 python/tests/import_common.py (limited to 'python/tests') diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index d71ac21d..e0bb1aae 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -45,6 +45,13 @@ ES_CONTAINER_RANDOM_RESP = { 'took': 50 } +ES_RELEASE_EMPTY_RESP = { + 'timed_out': False, + 'hits': {'total': 0, 'hits': [], 'max_score': 0.0}, + '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, + 'took': 50 +} + @pytest.fixture def full_app(mocker): load_dotenv(dotenv_path="./example.env") diff --git a/python/tests/import_common.py b/python/tests/import_common.py new file mode 100644 index 00000000..9f04ebe0 --- /dev/null +++ b/python/tests/import_common.py @@ -0,0 +1,78 @@ + +import json +import datetime +from typing import Any + +import pytest +import elasticsearch +import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds +import fuzzycat.matching + +from fatcat_tools.importers import EntityImporter +from fatcat_tools.transforms import entity_to_dict +from fixtures import * + + +@pytest.fixture(scope="function") +def entity_importer(api, mocker) -> Any: + es_client = elasticsearch.Elasticsearch("mockbackend") + mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + yield EntityImporter(api, es_client=es_client) + +def test_fuzzy_match_none(entity_importer, mocker) -> None: + """ + Simple ES-mocked test for "no search results" case + """ + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), + (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), + ] + + release = ReleaseEntity( + title="some long title which should not match anything because it is for testing", + ext_ids=ReleaseExtIds(), + ) + + resp = entity_importer.match_existing_release_fuzzy(release) + assert resp == None + +def test_fuzzy_match_different(entity_importer, mocker) -> None: + """ + Simple fuzzycat-mocked test for "strong match" case + """ + + r1 = ReleaseEntity( + title="example title: novel work", + contribs=[ReleaseContrib(raw_name="robin hood")], + ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"), + ) + r2 = ReleaseEntity( + title="Example Title: Novel Work?", + contribs=[ReleaseContrib(raw_name="robin hood")], + ext_ids=ReleaseExtIds(), + ) + r3 = ReleaseEntity( + title="entirely different", + contribs=[ReleaseContrib(raw_name="king tut")], + ext_ids=ReleaseExtIds(), + ) + + match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy') + match_raw.side_effect = [[r3, r2, r3, r2]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == ("STRONG", r2) + + match_raw.side_effect = [[r2, r2, r3, r1]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == ("EXACT", r1) + + match_raw.side_effect = [[r3]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == None + + match_raw.side_effect = [[]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == None -- cgit v1.2.3 From 6d5811693c36b9e73dedf0205c40f2aed63e2870 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Dec 2020 19:56:01 -0800 Subject: add fuzzy match filtering to DOAJ importer In this default configuration, any entities with a fuzzy match (even "ambiguous") will be skipped at import time, to prevent creating duplicates. This is conservative towards not creating new/duplicate entities. In the future, as we get more confidence in fuzzy match/verification, we can start to ignore AMBIGUOUS, handle EXACT as same release, and merge STRONG (and WEAK?) matches under the same work entity. --- python/fatcat_tools/importers/doaj_article.py | 11 +++++++++-- python/tests/import_doaj.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'python/tests') diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 03752484..191a65d8 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter): return False break - # TODO: in the future could do fuzzy match here, eg using elasticsearch + if not existing and self.do_fuzzy_match: + fuzzy_result = self.match_existing_release_fuzzy(re) + # TODO: in the future, could assign work_id for clustering, or for + # "EXACT" match, set existing and allow (optional) update code path + # to run + if fuzzy_result is not None: + self.counts["exists-fuzzy"] += 1 + return False - # create entity + # if no fuzzy existing match, create entity if not existing: return True diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py index d69aebd7..17a23257 100644 --- a/python/tests/import_doaj.py +++ b/python/tests/import_doaj.py @@ -3,6 +3,7 @@ import json import datetime import pytest +import elasticsearch import fatcat_openapi_client from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher @@ -11,14 +12,22 @@ from fixtures import * @pytest.fixture(scope="function") -def doaj_importer(api): +def doaj_importer(api, mocker): + es_client = elasticsearch.Elasticsearch("mockbackend") + mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield DoajArticleImporter(api, issn_file, bezerk_mode=True) + yield DoajArticleImporter( + api, + issn_file, + bezerk_mode=True, + es_client=es_client, + ) def test_doaj_importer(doaj_importer): last_index = doaj_importer.api.get_changelog(limit=1)[0].index with open("tests/files/example_doaj_articles.json", "r") as f: doaj_importer.bezerk_mode = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() assert counts["insert"] == 5 assert counts["exists"] == 0 @@ -60,6 +69,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = False + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 4 @@ -72,6 +82,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 @@ -84,6 +95,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 -- cgit v1.2.3 From 5eeb7a9d61beb8cb40fd89bd91fcd9dd820035aa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 17 Dec 2020 16:01:04 -0800 Subject: update fuzzy helper to pass 'reason' through to import code The motivation for this change is to enable passing the 'reason' through to edit extra metadata, in cases where we merge or cluster releases. --- python/fatcat_tools/importers/common.py | 6 +++--- python/tests/import_common.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'python/tests') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 1cce5fd0..6dc2ab9e 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -446,13 +446,13 @@ class EntityImporter: existing.urls = [u for u in existing.urls if u.url not in redundant_urls] return existing - def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]: + def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for existing release entities with similar metadata. Returns None if there was no match of any kind, or a single tuple - (status: str, existing: ReleaseEntity) if there was a match. + (status: str, reason: str, existing: ReleaseEntity) if there was a match. Status string is one of the fuzzycat.common.Status, with "strongest match" in this sorted order: @@ -491,7 +491,7 @@ class EntityImporter: elif closest[0].status == fuzzycat.common.Status.TODO: raise NotImplementedError("fuzzycat verify hit a Status.TODO") else: - return (closest[0].status.name, closest[1]) + return (closest[0].status.name, closest[0].reason.value, closest[1]) class RecordPusher: diff --git a/python/tests/import_common.py b/python/tests/import_common.py index 9f04ebe0..d0db014e 100644 --- a/python/tests/import_common.py +++ b/python/tests/import_common.py @@ -63,11 +63,11 @@ def test_fuzzy_match_different(entity_importer, mocker) -> None: match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy') match_raw.side_effect = [[r3, r2, r3, r2]] resp = entity_importer.match_existing_release_fuzzy(r1) - assert resp == ("STRONG", r2) + assert (resp[0], resp[2]) == ("STRONG", r2) match_raw.side_effect = [[r2, r2, r3, r1]] resp = entity_importer.match_existing_release_fuzzy(r1) - assert resp == ("EXACT", r1) + assert (resp[0], resp[2]) == ("EXACT", r1) match_raw.side_effect = [[r3]] resp = entity_importer.match_existing_release_fuzzy(r1) -- cgit v1.2.3