diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 19:54:47 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 20:16:09 -0800 |
commit | 38328c25674fee7781a8d8601e1d47de04186f19 (patch) | |
tree | 32d834fb16bb93ad493a053a9b797d9da1e04ac5 | |
parent | 20f27677aff762822bbd3aa944caf430c089ab4b (diff) | |
download | fatcat-38328c25674fee7781a8d8601e1d47de04186f19.tar.gz fatcat-38328c25674fee7781a8d8601e1d47de04186f19.zip |
add fuzzy matching helper to importer base class
Using fuzzycat. Add basic test coverage.
-rw-r--r-- | python/fatcat_tools/importers/common.py | 64 | ||||
-rw-r--r-- | python/tests/fixtures.py | 7 | ||||
-rw-r--r-- | python/tests/import_common.py | 78 |
3 files changed, 147 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 3c810391..1cce5fd0 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,16 +7,23 @@ import sqlite3 import datetime import subprocess from collections import Counter -from confluent_kafka import Consumer, KafkaException +from typing import Optional, Tuple import xml.etree.ElementTree as ET +import elasticsearch from bs4 import BeautifulSoup +from confluent_kafka import Consumer, KafkaException import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity from fatcat_openapi_client.rest import ApiException +from fuzzycat.matching import match_release_fuzzy +import fuzzycat.common +import fuzzycat.verify # TODO: refactor so remove need for this (re-imports for backwards compatibility) from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.transforms import entity_to_dict DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 @@ -145,12 +152,16 @@ class EntityImporter: self.api = api self.do_updates = bool(kwargs.get('do_updates', True)) + self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) self.editgroup_description = kwargs.get('editgroup_description') self.editgroup_extra = eg_extra - self.reset() + + self.es_client = kwargs.get('es_client') + if not self.es_client: + self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") self._issnl_id_map = dict() self._orcid_id_map = dict() @@ -158,6 +169,8 @@ class EntityImporter: self._doi_id_map = dict() self._pmid_id_map = dict() + self.reset() + def reset(self): self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) self._edit_count = 0 @@ -433,6 +446,53 @@ class EntityImporter: existing.urls 
= [u for u in existing.urls if u.url not in redundant_urls] return existing + def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]: + """ + This helper function uses fuzzycat (and elasticsearch) to look for + existing release entities with similar metadata. + + Returns None if there was no match of any kind, or a single tuple + (status: str, existing: ReleaseEntity) if there was a match. + + Status string is one of the fuzzycat.common.Status, with "strongest + match" in this sorted order: + + - EXACT + - STRONG + - WEAK + - AMBIGUOUS + + E.g., if there is any EXACT match, that is always returned; an AMBIGUOUS + result is only returned if all the candidate matches were ambiguous. + """ + + # this map is used to establish priority order of verified matches + STATUS_SORT = { + fuzzycat.common.Status.TODO: 0, + fuzzycat.common.Status.EXACT: 10, + fuzzycat.common.Status.STRONG: 20, + fuzzycat.common.Status.WEAK: 30, + fuzzycat.common.Status.AMBIGUOUS: 40, + fuzzycat.common.Status.DIFFERENT: 60, + } + + # TODO: the size here is a first guess; what should it really be? 
+ candidates = match_release_fuzzy(release, size=10, es=self.es_client) + if not candidates: + return None + + release_dict = entity_to_dict(release, api_client=self.api.api_client) + verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + + # choose the "closest" match + closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] + if closest[0].status == fuzzycat.common.Status.DIFFERENT: + return None + elif closest[0].status == fuzzycat.common.Status.TODO: + raise NotImplementedError("fuzzycat verify hit a Status.TODO") + else: + return (closest[0].status.name, closest[1]) + class RecordPusher: """ diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index d71ac21d..e0bb1aae 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -45,6 +45,13 @@ ES_CONTAINER_RANDOM_RESP = { 'took': 50 } +ES_RELEASE_EMPTY_RESP = { + 'timed_out': False, + 'hits': {'total': 0, 'hits': [], 'max_score': 0.0}, + '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, + 'took': 50 +} + @pytest.fixture def full_app(mocker): load_dotenv(dotenv_path="./example.env") diff --git a/python/tests/import_common.py b/python/tests/import_common.py new file mode 100644 index 00000000..9f04ebe0 --- /dev/null +++ b/python/tests/import_common.py @@ -0,0 +1,78 @@ + +import json +import datetime +from typing import Any + +import pytest +import elasticsearch +import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds +import fuzzycat.matching + +from fatcat_tools.importers import EntityImporter +from fatcat_tools.transforms import entity_to_dict +from fixtures import * + + +@pytest.fixture(scope="function") +def entity_importer(api, mocker) -> Any: + es_client = elasticsearch.Elasticsearch("mockbackend") + mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + yield EntityImporter(api, es_client=es_client) + +def 
test_fuzzy_match_none(entity_importer, mocker) -> None: + """ + Simple ES-mocked test for "no search results" case + """ + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), + (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), + ] + + release = ReleaseEntity( + title="some long title which should not match anything because it is for testing", + ext_ids=ReleaseExtIds(), + ) + + resp = entity_importer.match_existing_release_fuzzy(release) + assert resp == None + +def test_fuzzy_match_different(entity_importer, mocker) -> None: + """ + Simple fuzzycat-mocked test covering the "strong match", "exact match", + and no-match (None result) cases + """ + + r1 = ReleaseEntity( + title="example title: novel work", + contribs=[ReleaseContrib(raw_name="robin hood")], + ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"), + ) + r2 = ReleaseEntity( + title="Example Title: Novel Work?", + contribs=[ReleaseContrib(raw_name="robin hood")], + ext_ids=ReleaseExtIds(), + ) + r3 = ReleaseEntity( + title="entirely different", + contribs=[ReleaseContrib(raw_name="king tut")], + ext_ids=ReleaseExtIds(), + ) + + match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy') + match_raw.side_effect = [[r3, r2, r3, r2]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == ("STRONG", r2) + + match_raw.side_effect = [[r2, r2, r3, r1]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == ("EXACT", r1) + + match_raw.side_effect = [[r3]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == None + + match_raw.side_effect = [[]] + resp = entity_importer.match_existing_release_fuzzy(r1) + assert resp == None |