aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-12-16 19:54:47 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-12-16 20:16:09 -0800
commit38328c25674fee7781a8d8601e1d47de04186f19 (patch)
tree32d834fb16bb93ad493a053a9b797d9da1e04ac5
parent20f27677aff762822bbd3aa944caf430c089ab4b (diff)
downloadfatcat-38328c25674fee7781a8d8601e1d47de04186f19.tar.gz
fatcat-38328c25674fee7781a8d8601e1d47de04186f19.zip
add fuzzy matching helper to importer base class
Using fuzzycat. Add basic test coverage.
-rw-r--r--python/fatcat_tools/importers/common.py64
-rw-r--r--python/tests/fixtures.py7
-rw-r--r--python/tests/import_common.py78
3 files changed, 147 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 3c810391..1cce5fd0 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,16 +7,23 @@ import sqlite3
import datetime
import subprocess
from collections import Counter
-from confluent_kafka import Consumer, KafkaException
+from typing import Optional, Tuple
import xml.etree.ElementTree as ET
+import elasticsearch
from bs4 import BeautifulSoup
+from confluent_kafka import Consumer, KafkaException
import fatcat_openapi_client
+from fatcat_openapi_client import ReleaseEntity
from fatcat_openapi_client.rest import ApiException
+from fuzzycat.matching import match_release_fuzzy
+import fuzzycat.common
+import fuzzycat.verify
# TODO: refactor so remove need for this (re-imports for backwards compatibility)
from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
+from fatcat_tools.transforms import entity_to_dict
DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
@@ -145,12 +152,16 @@ class EntityImporter:
self.api = api
self.do_updates = bool(kwargs.get('do_updates', True))
+ self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True)
self.bezerk_mode = kwargs.get('bezerk_mode', False)
self.submit_mode = kwargs.get('submit_mode', False)
self.edit_batch_size = kwargs.get('edit_batch_size', 100)
self.editgroup_description = kwargs.get('editgroup_description')
self.editgroup_extra = eg_extra
- self.reset()
+
+ self.es_client = kwargs.get('es_client')
+ if not self.es_client:
+ self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
self._issnl_id_map = dict()
self._orcid_id_map = dict()
@@ -158,6 +169,8 @@ class EntityImporter:
self._doi_id_map = dict()
self._pmid_id_map = dict()
+ self.reset()
+
def reset(self):
self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
self._edit_count = 0
@@ -433,6 +446,53 @@ class EntityImporter:
existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
return existing
+    def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, ReleaseEntity]]:
+        """
+        This helper function uses fuzzycat (and elasticsearch) to look for
+        existing release entities with similar metadata.
+
+        Returns None if there was no match of any kind, or a single tuple
+        (status: str, existing: ReleaseEntity) if there was a match.
+
+        Status string is the name of one of the fuzzycat.common.Status
+        values, with "strongest match" in this sorted order:
+
+        - EXACT
+        - STRONG
+        - WEAK
+        - AMBIGUOUS
+
+        Eg, if there is any EXACT match that is always returned; an AMBIGUOUS
+        result is only returned if no candidate verified as EXACT, STRONG, or
+        WEAK. If every candidate verified as DIFFERENT, None is returned.
+
+        Raises NotImplementedError if verification of any candidate came back
+        as Status.TODO (that status sorts ahead of all the others).
+        """
+
+        # this map used to establish priority order of verified matches;
+        # a lower number means a "stronger" (preferred) status
+        STATUS_SORT = {
+            fuzzycat.common.Status.TODO: 0,
+            fuzzycat.common.Status.EXACT: 10,
+            fuzzycat.common.Status.STRONG: 20,
+            fuzzycat.common.Status.WEAK: 30,
+            fuzzycat.common.Status.AMBIGUOUS: 40,
+            fuzzycat.common.Status.DIFFERENT: 60,
+        }
+
+        # TODO: the size here is a first guess; what should it really be?
+        candidates = match_release_fuzzy(release, size=10, es=self.es_client)
+        if not candidates:
+            return None
+
+        # verify each candidate against the dict form of the input release
+        release_dict = entity_to_dict(release, api_client=self.api.api_client)
+        verified = [
+            (fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c)
+            for c in candidates
+        ]
+
+        # choose the "closest" match. min() returns the first of any tied
+        # entries, so earlier candidates win ties (same as a stable sort)
+        closest_result, closest_release = min(verified, key=lambda v: STATUS_SORT[v[0].status])
+        closest_status = closest_result.status
+        if closest_status == fuzzycat.common.Status.DIFFERENT:
+            return None
+        if closest_status == fuzzycat.common.Status.TODO:
+            raise NotImplementedError("fuzzycat verify hit a Status.TODO")
+        return (closest_status.name, closest_release)
+
class RecordPusher:
"""
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index d71ac21d..e0bb1aae 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -45,6 +45,13 @@ ES_CONTAINER_RANDOM_RESP = {
'took': 50
}
+# Elasticsearch search response body containing zero hits (empty 'hits' list,
+# 'total' of 0). Tests serialize it with json.dumps() to mock a release search
+# that returns no results.
+ES_RELEASE_EMPTY_RESP = {
+ 'timed_out': False,
+ 'hits': {'total': 0, 'hits': [], 'max_score': 0.0},
+ '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0},
+ 'took': 50
+}
+
@pytest.fixture
def full_app(mocker):
load_dotenv(dotenv_path="./example.env")
diff --git a/python/tests/import_common.py b/python/tests/import_common.py
new file mode 100644
index 00000000..9f04ebe0
--- /dev/null
+++ b/python/tests/import_common.py
@@ -0,0 +1,78 @@
+
+import json
+import datetime
+from typing import Any
+
+import pytest
+import elasticsearch
+import fatcat_openapi_client
+from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds
+import fuzzycat.matching
+
+from fatcat_tools.importers import EntityImporter
+from fatcat_tools.transforms import entity_to_dict
+from fixtures import *
+
+
+@pytest.fixture(scope="function")
+def entity_importer(api, mocker) -> Any:
+    """
+    Yields an EntityImporter wired to an elasticsearch client whose
+    low-level HTTP request method has been patched out with a mock.
+    """
+    mock_es = elasticsearch.Elasticsearch("mockbackend")
+    mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    importer = EntityImporter(api, es_client=mock_es)
+    yield importer
+
+def test_fuzzy_match_none(entity_importer, mocker) -> None:
+    """
+    Simple ES-mocked test for "no search results" case
+    """
+
+    # two canned empty responses are queued on the patched HTTP layer
+    # NOTE(review): presumably the fuzzy matcher may issue more than one
+    # elasticsearch request per lookup -- confirm against fuzzycat
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
+        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
+    ]
+
+    release = ReleaseEntity(
+        title="some long title which should not match anything because it is for testing",
+        ext_ids=ReleaseExtIds(),
+    )
+
+    resp = entity_importer.match_existing_release_fuzzy(release)
+    # identity check: None is a singleton, and '==' could be fooled by a custom __eq__
+    assert resp is None
+
+def test_fuzzy_match_different(entity_importer, mocker) -> None:
+    """
+    Simple fuzzycat-mocked test of match selection: candidate lookup is
+    mocked, and the helper is checked for the "strong match", "exact match",
+    "only different candidates", and "no candidates" cases.
+    """
+
+    r1 = ReleaseEntity(
+        title="example title: novel work",
+        contribs=[ReleaseContrib(raw_name="robin hood")],
+        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"),
+    )
+    r2 = ReleaseEntity(
+        title="Example Title: Novel Work?",
+        contribs=[ReleaseContrib(raw_name="robin hood")],
+        ext_ids=ReleaseExtIds(),
+    )
+    r3 = ReleaseEntity(
+        title="entirely different",
+        contribs=[ReleaseContrib(raw_name="king tut")],
+        ext_ids=ReleaseExtIds(),
+    )
+
+    # patch the name as imported into the importer module, so the helper
+    # under test sees the mocked candidate list
+    match_raw = mocker.patch('fatcat_tools.importers.common.match_release_fuzzy')
+
+    # best candidate is a near-duplicate title: expect STRONG
+    match_raw.side_effect = [[r3, r2, r3, r2]]
+    resp = entity_importer.match_existing_release_fuzzy(r1)
+    assert resp == ("STRONG", r2)
+
+    # the release itself is among the candidates: expect EXACT to win
+    match_raw.side_effect = [[r2, r2, r3, r1]]
+    resp = entity_importer.match_existing_release_fuzzy(r1)
+    assert resp == ("EXACT", r1)
+
+    # only an unrelated candidate: no match is reported
+    match_raw.side_effect = [[r3]]
+    resp = entity_importer.match_existing_release_fuzzy(r1)
+    assert resp is None
+
+    # no candidates at all: no match is reported
+    match_raw.side_effect = [[]]
+    resp = entity_importer.match_existing_release_fuzzy(r1)
+    assert resp is None