diff options
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 64 | ||||
| -rw-r--r-- | python/tests/fixtures.py | 7 | ||||
| -rw-r--r-- | python/tests/import_common.py | 78 | 
3 files changed, 147 insertions, 2 deletions
| diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 3c810391..1cce5fd0 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,16 +7,23 @@ import sqlite3  import datetime  import subprocess  from collections import Counter -from confluent_kafka import Consumer, KafkaException +from typing import Optional, Tuple  import xml.etree.ElementTree as ET +import elasticsearch  from bs4 import BeautifulSoup +from confluent_kafka import Consumer, KafkaException  import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity  from fatcat_openapi_client.rest import ApiException +from fuzzycat.matching import match_release_fuzzy +import fuzzycat.common +import fuzzycat.verify  # TODO: refactor so remove need for this (re-imports for backwards compatibility)  from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.transforms import entity_to_dict  DATE_FMT = "%Y-%m-%d"  SANE_MAX_RELEASES = 200 @@ -145,12 +152,16 @@ class EntityImporter:          self.api = api          self.do_updates = bool(kwargs.get('do_updates', True)) +        self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True)          self.bezerk_mode = kwargs.get('bezerk_mode', False)          self.submit_mode = kwargs.get('submit_mode', False)          self.edit_batch_size = kwargs.get('edit_batch_size', 100)          self.editgroup_description = kwargs.get('editgroup_description')          self.editgroup_extra = eg_extra -        self.reset() + +        self.es_client = kwargs.get('es_client') +        if not self.es_client: +            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")          self._issnl_id_map = dict()          self._orcid_id_map = dict() @@ -158,6 +169,8 @@ class EntityImporter:          self._doi_id_map = dict()          self._pmid_id_map = dict() +        self.reset() +      def reset(self):   
       self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})          self._edit_count = 0 @@ -433,6 +446,53 @@ class EntityImporter:          existing.urls = [u for u in existing.urls if u.url not in redundant_urls]          return existing +    def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]: +        """ +        This helper function uses fuzzycat (and elasticsearch) to look for +        existing release entities with similar metadata. + +        Returns None if there was no match of any kind, or a single tuple +        (status: str, existing: ReleaseEntity) if there was a match. + +        Status string is one of the fuzzycat.common.Status, with "strongest +        match" in this sorted order: + +        - EXACT +        - STRONG +        - WEAK +        - AMBIGUOUS + +        E.g., if there is any EXACT match, that is always returned; an AMBIGUOUS +        result is only returned if all the candidate matches were ambiguous. +        """ + +        # this map is used to establish priority order of verified matches +        STATUS_SORT = { +            fuzzycat.common.Status.TODO: 0, +            fuzzycat.common.Status.EXACT: 10, +            fuzzycat.common.Status.STRONG: 20, +            fuzzycat.common.Status.WEAK: 30, +            fuzzycat.common.Status.AMBIGUOUS: 40, +            fuzzycat.common.Status.DIFFERENT: 60, +        } + +        # TODO: the size here is a first guess; what should it really be? 
+        candidates = match_release_fuzzy(release, size=10, es=self.es_client) +        if not candidates: +            return None + +        release_dict = entity_to_dict(release, api_client=self.api.api_client) +        verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + +        # choose the "closest" match +        closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] +        if closest[0].status == fuzzycat.common.Status.DIFFERENT: +            return None +        elif closest[0].status == fuzzycat.common.Status.TODO: +            raise NotImplementedError("fuzzycat verify hit a Status.TODO") +        else: +            return (closest[0].status.name, closest[1]) +  class RecordPusher:      """ diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index d71ac21d..e0bb1aae 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -45,6 +45,13 @@ ES_CONTAINER_RANDOM_RESP = {      'took': 50  } +ES_RELEASE_EMPTY_RESP = { +    'timed_out': False, +    'hits': {'total': 0, 'hits': [], 'max_score': 0.0}, +    '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, +    'took': 50 +} +  @pytest.fixture  def full_app(mocker):      load_dotenv(dotenv_path="./example.env") diff --git a/python/tests/import_common.py b/python/tests/import_common.py new file mode 100644 index 00000000..9f04ebe0 --- /dev/null +++ b/python/tests/import_common.py @@ -0,0 +1,78 @@ + +import json +import datetime +from typing import Any + +import pytest +import elasticsearch +import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds +import fuzzycat.matching + +from fatcat_tools.importers import EntityImporter +from fatcat_tools.transforms import entity_to_dict +from fixtures import * + + +@pytest.fixture(scope="function") +def entity_importer(api, mocker) -> Any: +    es_client = elasticsearch.Elasticsearch("mockbackend") +    
mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    yield EntityImporter(api, es_client=es_client) + +def test_fuzzy_match_none(entity_importer, mocker) -> None: +    """ +    Simple ES-mocked test for "no search results" case +    """ + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), +        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)), +    ] + +    release = ReleaseEntity( +        title="some long title which should not match anything because it is for testing", +        ext_ids=ReleaseExtIds(), +    ) + +    resp = entity_importer.match_existing_release_fuzzy(release) +    assert resp == None + +def test_fuzzy_match_different(entity_importer, mocker) -> None: +    """ +    Simple fuzzycat-mocked test for the "strong match", "exact match", "different", and "no candidates" cases +    """ + +    r1 = ReleaseEntity( +        title="example title: novel work", +        contribs=[ReleaseContrib(raw_name="robin hood")], +        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"), +    ) +    r2 = ReleaseEntity( +        title="Example Title: Novel Work?", +        contribs=[ReleaseContrib(raw_name="robin hood")], +        ext_ids=ReleaseExtIds(), +    ) +    r3 = ReleaseEntity( +        title="entirely different", +        contribs=[ReleaseContrib(raw_name="king tut")], +        ext_ids=ReleaseExtIds(), +    ) + +    match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy') +    match_raw.side_effect = [[r3, r2, r3, r2]] +    resp = entity_importer.match_existing_release_fuzzy(r1) +    assert resp == ("STRONG", r2) + +    match_raw.side_effect = [[r2, r2, r3, r1]] +    resp = entity_importer.match_existing_release_fuzzy(r1) +    assert resp == ("EXACT", r1) + +    match_raw.side_effect = [[r3]] +    resp = entity_importer.match_existing_release_fuzzy(r1) +    assert resp == None + +    match_raw.side_effect = [[]] +    resp = 
entity_importer.match_existing_release_fuzzy(r1) +    assert resp == None | 
