From 0c84af603894049dd8edd95da18d8990ab0516d1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 5 Nov 2021 17:19:07 +0100 Subject: turn "match_release_fuzzy" into a class Goal of this refactoring was to make the matching process a bit more configurable by using a class and a cascade of queries. For a limited test set: `FuzzyReleaseMatcher.match` is works the same as `match_release_fuzzy`. --- TODO.md | 34 +- fuzzycat/contrib.py | 453 +++++++++++++++++ fuzzycat/grobid_unstructured.py | 2 +- fuzzycat/matching.py | 559 ++++++++++++++++++--- fuzzycat/sandcrawler.py | 5 +- fuzzycat/verify.py | 7 +- notes/2021_11_fuzzycat_refactoring.md | 87 ++++ tests/files/README.md | 5 + .../0.yaml | 13 + .../1.yaml | 13 + .../2.yaml | 16 + .../3.yaml | 16 + .../4.yaml | 16 + .../0.yaml | 14 + .../1.yaml | 14 + .../2.yaml | 17 + .../3.yaml | 17 + .../4.yaml | 17 + .../5.yaml | 17 + .../6.yaml | 14 + .../7.yaml | 17 + tests/test_grobid_unstructured.py | 6 +- tests/test_matching.py | 123 ++++- 23 files changed, 1371 insertions(+), 111 deletions(-) create mode 100644 fuzzycat/contrib.py create mode 100644 notes/2021_11_fuzzycat_refactoring.md create mode 100644 tests/files/README.md create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml diff --git a/TODO.md b/TODO.md index 5666bc0..9241b60 100644 --- a/TODO.md +++ b/TODO.md @@ -1,28 +1,32 @@ # TODO * [ ] clustering should be broken up, e.g. into "map" and "sort" +* [ ] match release fuzzy should work not just with title +* [ ] match container name functions (maybe also with abbreviations, etc) +* [ ] better documentation, more examples +* [ ] shiv based packaging +* [ ] author similarity should be broken up; easier to tweak +* [ ] split up `verify` +* [ ] configurable `verify` + +Other repos: -In -[refcat/skate](https://gitlab.com/internetarchive/refcat/-/tree/master/skate) -we have one simple operation: extract a list of fields from blob of bytes. We -use [16 -mappers](https://gitlab.com/internetarchive/refcat/-/blob/f33e586d11f5f575f71ad209608ac9ba74fad2e5/skate/cmd/skate-map/main.go#L70-86) -currently, they are easy to write. +* [refcat/skate](https://gitlab.com/internetarchive/refcat/-/tree/master/skate) + +In refcat we have one simple operation: extract a list of fields from blob of +bytes. We use [16 mappers](https://is.gd/E0NEXj) currently, they are easy to +write. 
In refcat, we use GNU sort, and just when we need it, e.g. -[skate-map](https://gitlab.com/internetarchive/refcat/-/blob/f33e586d11f5f575f71ad209608ac9ba74fad2e5/python/refcat/tasks.py#L531-534). +[skate-map](https://is.gd/Kt9hvL). The `Cluster` class bundles, iteration, key extraction, sorting and group by operation into a single entity. Also in refcat, we do not work on a single file with clusters any more, but -mostly with two sorted streams, which are iterated over "comm" style. This -spares us an extra step of generating the cluster documents, but requires an -extra component, that allows to plug in various "reduce" functions. In refcat, -this component is called "zipkey", which is support batching, too. +mostly with two sorted streams, which are iterated over "mergesort/comm" style. -* [ ] match release fuzzy should work not just with title -* [ ] match container name functions (maybe also with abbreviations, etc) -* [ ] better documentation, more examples -* [ ] shiv based packaging +This spares us an extra step of generating the cluster documents, but requires +an extra component, that allows to plug in various "reduce" functions. In +refcat, this component is called "zipkey", which is support batching, too. diff --git a/fuzzycat/contrib.py b/fuzzycat/contrib.py new file mode 100644 index 0000000..93753ab --- /dev/null +++ b/fuzzycat/contrib.py @@ -0,0 +1,453 @@ +""" +Contrib related comparisons. + +Example: NgramContribMatcher, which compares two normalized raw name tokens +with a jaccard index. + + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + Ngram(n=3), + set, + ])) + result = matcher.compare(a, b) + ... + +Some notes from the dataset. + +* 692,893,828 contribs +* 64,680,311 uniq + +Top contrib names, many have their name on datasets, which explains the high +number. + +3069752 Kessy Abarenkov +2383819 Leho Tedersoo +2383748 Karl-Henrik Larsson +2383745 Urmas Kõljalg +2383699 Mohammad Bahram +2383692 Martin Ryberg +2382832 R. Henrik Nilsson +1702455 Markus Döring +1702427 Tom May +1702415 Dmitry Schigel +1702391 Santiago Sánchez-Ramírez + 841475 GLIS Of The ITPGRFA + 682144 James Scott + 682053 Michael Weiss + 681404 Susumu Takamatsu + 681388 A. Elizabeth Arnold + 681347 Artur Alves + 681341 Ellen Larsson + 681338 Maarja Öpik + 681335 Ursula Eberhardt + 681324 Nhu Nguyen + 681293 Otto Miettinen + 681292 Viacheslav Spirin + 681287 Gareth W. Griffith + 681283 Bálint Dima + 681278 Ursula Peintner + 681276 Tuula Niskanen + 681276 Olinto Liparini Pereira + 681275 Kare Liimatainen +""" + +import collections +import functools +import itertools +import logging +import operator +import re +import string +from typing import Any, Callable, List, Optional, Set + +import jellyfish +import thefuzz +from fatcat_openapi_client import ReleaseContrib + +logger = logging.getLogger("fuzzycat") + + +class Ngram: + """ + Turn a string into a list of overlapping tokens. + """ + def __init__(self, n: int = 3): + if n < 1: + raise ValueError("positive n required") + self.n = n + + def __call__(self, s: str) -> List[str]: + if 0 < len(s) < self.n: + return [s] + return [s[i:i + self.n] for i in range(len(s) - self.n + 1)] + + +class JaccardIndexThreshold: + """ + A Jaccard index threshold that can be used to compare two sets. Two empty + sets are equal. 
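+
+    A sketch with hypothetical sets (the index is len(a & b) / len(a | b)):
+
+        jit = JaccardIndexThreshold(threshold=0.5)
+        jit({"a", "b"}, {"b", "c"})       # 1/3 < 0.5, hence False
+        jit({"a", "b"}, {"a", "b", "c"})  # 2/3 >= 0.5, hence True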
+ """ + def __init__(self, threshold: float = 0.5, verbose=False): + self.threshold = threshold + self.verbose = verbose + + def __call__(self, a: Set, b: Set) -> bool: + if len(a) == 0 and len(b) == 0: + return True + index = len(a & b) / len(a | b) + if self.verbose: + logger.debug("[jaccard] {}".format(index)) + return index >= self.threshold + + +class FuzzyStringSimilarity: + """ + For two sets of strings, run fuzzy matching with "thefuzz" - + https://github.com/seatgeek/thefuzz, which among other things uses + Levenshtein distance. + + The min ratio can range from 0 to 100 (with 100 allowing exact matches + only). + """ + def __init__(self, min_ratio=75): + self.min_ratio = min_ratio + + def __call__(self, a: Set, b: Set) -> bool: + agg = 0 + for v in a: + match, score = thefuzz.exctractOne(v, b) + agg += score + return score > self.min_ratio + + +class Pipeline: + """ + A list of functions to execute, f -> g -> h, etc. Note that the output + type of f needs to match the input type of g, etc. + """ + def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False): + self.verbose = verbose + if pipeline is None: + self.pipeline = [ + lambda v: v, + ] + else: + self.pipeline = pipeline + + def run(self, value: Any) -> Any: + v = value + for i, f in enumerate(self.pipeline, start=1): + v = f(v) + if self.verbose: + logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v)) + return v + + def __call__(self, value: Any, verbose: bool = False) -> Any: + self.verbose = verbose + return self.run(value) + + +# default_release_contrib_pipeline normalizes the raw name. +default_release_contrib_pipeline = Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, +]) + +# default_release_contrib_list_pipeline turns contribs list into a contrib set. +default_release_contrib_list_pipeline = Pipeline([ + lambda seq: set((c.raw_name for c in seq)), +]) + + +class ContribMatcher: + """ + Compare two contrib entities and determine a match status, based on some + configuration. The final values of the `pipeline` will be compared with + `cmp`, which by default is equality. + + Other `cmp` options may generate ngrams and use jaccard index with some + threshold or decide on a string similarity metric. + + This is essentially just a shell, the various comparison methods live in + the tuple (pipeline, cmp). + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = operator.__eq__): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool: + """ + Compare returns True, if a and b are considered the same, given a + transformation pipeline and a comparison operation. + """ + u = self.pipeline(a) + v = self.pipeline(b) + return self.cmp(u, v) + + +class ContribListMatcher: + """ + Compare two lists of contribs. Each contrib entry is passed through the + same pipeline. + + Often two issues (separate or combined). + + - contrib missing, e.g. + - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ... + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, + a: List[ReleaseContrib], + b: List[ReleaseContrib], + verbose: bool = False) -> bool: + """ + Compare two lists of contribs, pass each one through the pipeline. The + result may be a list or any other type. 
The comparison function needs + to be compatible. + """ + u = self.pipeline(a, verbose=verbose) + v = self.pipeline(b, verbose=verbose) + return self.cmp(u, v) + + +def cleanup_single_ws(s: str) -> str: + return re.sub(r"[ ]{2,}", " ", s) + + +def cleanup_remove_ws(s: str) -> str: + return re.sub(r"[\n\r\t\s]*", '', s) + + +def cleanup_keep_letters_digits_ws(s: str) -> str: + return ''.join((c for c in s if c in string.ascii_letters + string.digits + " ")) + + +def test_cleanup_single_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abc abc"), + Case("abc abc", "abc abc"), + Case(" abc abc", " abc abc"), + Case(" abc abc", " abc abc"), + ) + for c in cases: + assert c.result == cleanup_single_ws(c.s) + + +def test_cleanup_remove_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abcabc"), + Case("abc abc", "abcabc"), + Case(" abc abc", "abcabc"), + ) + for c in cases: + assert c.result == cleanup_remove_ws(c.s), c + + +def test_ngram(): + Case = collections.namedtuple("Case", "s n result") + cases = ( + Case("", 1, []), + Case("", 2, []), + Case("a", 2, ["a"]), + Case("ab", 2, ["ab"]), + Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']), + Case("abcdef", 4, ['abcd', 'bcde', 'cdef']), + Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]), + ) + for c in cases: + ngram = Ngram(n=c.n) + assert ngram(c.s) == c.result + + +def test_pipeline(): + Case = collections.namedtuple("Case", "pipeline input result") + cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower, + Ngram(n=3), set]), {"a": " X123 "}, {'123', 'x12'})), + for c in cases: + result = c.pipeline(c.input) + assert result == c.result + + +def test_jaccard_index_threshold(): + Case = collections.namedtuple("Case", "a b threshold result") + cases = ( + Case(set(), set(), 1.0, True), + Case(set(), set(["a"]), 1.0, False), + Case(set(["a"]), set(["a"]), 1.0, True), + Case(set(["a"]), set(["a", "b"]), 1.0, False), + Case(set(["a"]), set(["a", "b"]), 0.5, True), + Case(set(["a"]), set(["a", "b", "c"]), 0.5, False), + ) + for c in cases: + jit = JaccardIndexThreshold(threshold=c.threshold) + result = jit(c.a, c.b) + assert result == c.result + + +def test_ngram_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. 
Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + cleanup_remove_ws, + cleanup_keep_letters_digits_ws, + Ngram(n=3), + set, + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_soundex_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.soundex(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_nysiis_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.nysiis(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_default_contrib_list_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + [], + [], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="Michael Jordan")], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="michael jordan")], + False, + ), + Case( + [ReleaseContrib(raw_name="Amadeu Llebaria")], + [ReleaseContrib(raw_name="A. 
Llebaria")], + False, + ), + ) + matcher = ContribListMatcher() + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b, verbose=True) + assert result == c.result diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py index 7470cd7..1765f42 100644 --- a/fuzzycat/grobid_unstructured.py +++ b/fuzzycat/grobid_unstructured.py @@ -15,7 +15,7 @@ from typing import Optional import requests from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds -from grobid_tei_xml import parse_citation_xml, GrobidBiblio +from grobid_tei_xml import GrobidBiblio, parse_citation_xml from fuzzycat.config import settings from fuzzycat.utils import clean_doi diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b358899..1531ac6 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,22 +1,430 @@ +import logging import os import re import sys -from typing import List, Optional, Type, Union +from typing import Any, List, Optional, Type, Union import elasticsearch import elasticsearch_dsl import fatcat_openapi_client import requests -from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity +from fatcat_openapi_client import (ContainerEntity, DefaultApi, ReleaseContrib, ReleaseEntity) from fatcat_openapi_client.rest import ApiException from fuzzycat.config import settings +from fuzzycat.contrib import (ContribListMatcher, FuzzyStringSimilarity, JaccardIndexThreshold, + Pipeline) from fuzzycat.entities import entity_from_dict, entity_from_json from fuzzycat.utils import es_compat_hits_total FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0") +class FuzzyReleaseMatcher: + """ + FuzzyReleaseMatcher tries to find similar items to a given release in + elasticsearch. Exact matches first, then fuzzy. + + In the best case, elasticsearch would automatically rank the most relevant + docs first, even with partial data. We still try to steer the matches by + using a query cascade. This is configurable. The last query should be a + generic. + + The goal here is to get a set of potential matches; verification has to. + happen separately. + + TODO: + + Example case not yet working well ("Stuehrenberg" vs "Stührenberg"): + + >>> result = matcher.match(entity_from_dict({"title": "internet archive", + "contribs": [{"raw_name": + "Stührenberg"}], + "ext_ids": {}}, + ReleaseEntity)) + + > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy, + https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1 + (not returning anything via frontend either) + + Make sure we can switch from function to class: + + * [ ] 5 test cases for both + + """ + def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10): + if isinstance(es, str): + self.es = elasticsearch.Elasticsearch([es]) + else: + self.es = es if es else elasticsearch.Elasticsearch() + self.api = api if api else public_api(FATCAT_API_URL) + self.index = index + self.size = size + self.logger = logging.getLogger("fuzzy") + + def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]: + """ + Check for exact matches by identifier. 
+ """ + ext_ids = release.ext_ids + attrs = ( + "doi", + "pmid", + "wikidata_qid", + "core", + "pmcid", + "arxiv", + "dblp", + "doaj", + "jstor", + "isbn13", + "ark", + "mag", + "oai", + ) + for attr in attrs: + value = getattr(ext_ids, attr) + if not value: + continue + try: + r = self.api.lookup_release(**{attr: value}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status in [404, 400]: + r = None + else: + raise err + if r: + return [r] + return [] + + def match_release_exact_title_exact_contrib(self, release): + """ + Match exact title and exact contrib names. Case insensitive, order of + contribs does not matter. + """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + hits_total = es_compat_hits_total(resp) + if hits_total == 0: + return result + if hits_total > self.size: + self.logger.warn('more than {} hits: {}'.format(self.size, hits_total)) + + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require overlap of contrib. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(1.0), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_partial_contrib(self, release): + """ + Allow for exact authors, but ok, if some are missing. + """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('result set too large: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require at least half the contribs to be shared. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(0.5), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_fuzzy_contrib(self, release): + """ + Exact title but ok it authors differ (slightly). 
+ """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + matcher = ContribListMatcher( + cmp=FuzzyStringSimilarity(min_ratio=60), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title(self, release): + """ + Exact title, but any author. For common titles, this will yield 100s or + 1000s or results. + """ + if release.title is None: + return [] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + result.append(re) + return result + + def match_release_fuzzy_title_fuzzy_contrib(self, release): + """ + Using elasticsearch fuzziness option (which is not that fuzzy). + """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + ] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic(self, release): + """ + Final catch all variant via title. 
+ """ + if release.title is None: + return [] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "OR", + "fuzziness": "AUTO", + }, + } + }, + ], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic_fuzzy_contrib(self, release): + """ + Only match contribs, if they exist. + """ + if release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_cascade(self, release, *qs, **kwargs): + """ + Returns the result from the first query that returns a result. All query + functions need to be defined on this class (for now). + """ + for q in qs: + self.logger.debug("[cascade] {}".format(q)) + result = q(release, **kwargs) + if len(result) > 0: + return result + return [] + + def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: + """ + Match returns a list of match candidates given a release entity. + """ + if not release: + return [] + return self.match_cascade( + release, self.match_release_by_id, self.match_release_exact_title_exact_contrib, + self.match_release_exact_title_partial_contrib, + self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title, + self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic, + self.match_release_generic_fuzzy_contrib) + + def match_release_fuzzy( release: ReleaseEntity, size: int = 5, @@ -28,6 +436,8 @@ def match_release_fuzzy( fatcat using Elasticsearch. TODO: rename "es" parameter to "es_client", which would be clearer + + This is deprecated, move to matcher class. """ assert isinstance(release, ReleaseEntity) @@ -41,6 +451,17 @@ def match_release_fuzzy( if api is None: api = public_api(FATCAT_API_URL) + # > query cascade + # + # [x] 1 exact ids + # [ ] 2 exact title and exact contrib + # [ ] 3 exact title and fuzzy contrib + # [ ] 4 exact title + # [ ] 5 title w/o stopwords, fuzzy contrib + # [ ] 6 title w/o stopwords + # [ ] 7 fuzzy title and fuzzy contrib + # [ ] 8 fuzzy whole document + # Try to match by external identifier. 
# TODO: use api, ability to disable; benchmark ext_ids = release.ext_ids @@ -75,105 +496,89 @@ def match_release_fuzzy( if release.title is not None and release.contribs is not None: names = " ".join([c.raw_name for c in release.contribs]) - body = { - "track_total_hits": True, - "query": { - "bool": { - "must": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", } - }, - ], - }, + } + }, + ], }, - "size": size, } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - body = { - "track_total_hits": True, - "query": { - "bool": { - "should": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, + query = { + "bool": { + "should": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, + } + }, + ], }, - "size": size, } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) # Note: If the title is short, we will get lots of results here; do we need # to check for title length or result set length length or result set # length here? - body = { - "track_total_hits": True, - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND", - } + query = { + "match": { + "title": { + "query": release.title, + "operator": "AND", } - }, - "size": size, + } } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) # Get fuzzy. 
# https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - body = { - "track_total_hits": True, - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - } + query = { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", } - }, - "size": size, + } } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py index 958756a..63b85e6 100644 --- a/fuzzycat/sandcrawler.py +++ b/fuzzycat/sandcrawler.py @@ -1,6 +1,7 @@ -import regex import unicodedata +import regex + # from http://zderadicka.eu/removing-diacritics-marks-from-strings/ SANDCRAWLER_CHAR_MAP = { '\N{Latin capital letter AE}': 'AE', @@ -63,6 +64,7 @@ SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" ) + def sandcrawler_slugify(raw: str) -> str: """ Python re-implementation of sandcrawler Scala code for string comparison @@ -155,4 +157,3 @@ def test_sandcrawler_slugify() -> None: print(unicodedata.name(c)) print(in_str) assert sandcrawler_slugify(in_str) == out_str - diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 9eb808b..f570511 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -233,10 +233,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): if a_doi + "u" == b_doi or b_doi + "u" == a_doi: return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED) - if a_title == b_title and ((dict_has_key(a, "extra.subtitle") - and not dict_has_key(b, "extra.subtitle")) or - (dict_has_key(b, "extra.subtitle") - and not dict_has_key(a, "extra.subtitle"))): + if a_title == b_title and ( + (dict_has_key(a, "extra.subtitle") and not dict_has_key(b, "extra.subtitle")) or + (dict_has_key(b, "extra.subtitle") and not dict_has_key(a, "extra.subtitle"))): return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC) except PathAccessError: pass diff --git a/notes/2021_11_fuzzycat_refactoring.md b/notes/2021_11_fuzzycat_refactoring.md new file mode 100644 index 0000000..171cee3 --- /dev/null +++ b/notes/2021_11_fuzzycat_refactoring.md @@ -0,0 +1,87 @@ +# Proposal: Fuzzycat Refactoring + +* Goal: Refactor fuzzycat to make matching and verification more composable, + configurable and testable. +* Status: wip + +A better design. + +* it has a correct scope (e.g. match X; very Y) +* it has good defaults, but allows configuration +* it is clear how and where to extend functionality +* it is easy to add one new test for a case + +## Matching + +* fuzzy matching will be a cascade of queries, until a result is returned +* there is an order of queries from exact to very fuzzy +* alternatively, we could use "ensemble matching", that takes the intersection of a couple of queries +* ES queries cannot cover all cases, we need to add additional checks; e.g. author list comparison + +Example + + FuzzyReleaseMatcher + match_release_id + match_release_exact_title_exact_contrib + match_release_... + + match_release_fuzzy (runs a cascade of queries) + +Each function is testable on its own. The class keeps the es client and other +global config around. 
It's scope is clear: given a "release" (or maybe just a +title string), generate a list of potentially related releases. + +Other entities follow the same pattern. + + FuzzyContainerMatcher + match_container_id + match_container_issn + match_container_abbreviation + match_container_... + + match_container_fuzzy (runs a cascade of queries) + +A helper object (not exactly the entity) for matching list of authors. Allows +to match by various means, e.g. exact, short names, partial lists, etc. Should +account for case, order, etc. + + FuzzyContribsMatcher + match_exact + match_short_names + match_partial_list + + match_fuzzy + +For each method in each matcher class, we can construct a test case only for +one particular method. A new method can be added with easy and tested separately. + +Don't know how yet, but we can create some "profiles" that allow for a matching +by a set of methods. Or use good defaults on the higher level `_fuzzy(...)` method. + +NOTE: the matcher classes could use the verification code internally; generate +a list of matches with an es query, then use a configured verifier to generate +verified matches; only put comparison code into verification module. + +## Verification (comparison) + +Verification works similarly. For each entity we define a set of methods, verifying a specific aspect. + + FuzzyReleaseVerifier + verify_release_id + verify_release_ext_id + verify_release_title_exact_match + verify_release_title_contrib_exact_match + verify_release_... + + verify(a, b) -> (Status, Reason) + +A large number of test cases are already there, may need a bit better structure +to relate cases to methods. The class can hold global configuration, maybe some +cached computed properties, if that helps. + + + FuzzyContainerVerifier + verify_container_id + ... + + diff --git a/tests/files/README.md b/tests/files/README.md new file mode 100644 index 0000000..ef674d6 --- /dev/null +++ b/tests/files/README.md @@ -0,0 +1,5 @@ +# Matcher Test Files + +The goal here is to have a mostly language-independent test cases for matching. + +Each subdirectory corresponds to a test function and contains examples for it. 
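+
+A case file is a small YAML document, roughly of this shape (the values below
+are illustrative, see the actual files in the subdirectories):
+
+    title: short description of the case
+    date: 2021-11-08
+    input: >
+      {"title": "...", "contribs": [{"raw_name": "..."}], "ext_ids": {}}
+    jaccard_index_threshold: 0.5  # only used by the partial contrib cases
+    expected: 2  # number of expected matches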
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml new file mode 100644 index 0000000..2df8d9a --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml @@ -0,0 +1,13 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml new file mode 100644 index 0000000..1070408 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml @@ -0,0 +1,13 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml new file mode 100644 index 0000000..882e746 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml new file mode 100644 index 0000000..0a2ad12 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml new file mode 100644 index 0000000..36ea0fe --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml @@ -0,0 +1,16 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. 
Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml new file mode 100644 index 0000000..07230e8 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml @@ -0,0 +1,14 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml new file mode 100644 index 0000000..62e9586 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml @@ -0,0 +1,14 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml new file mode 100644 index 0000000..b89e825 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml new file mode 100644 index 0000000..3de7262 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml new file mode 100644 index 0000000..39fb065 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml @@ -0,0 +1,17 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. 
Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml new file mode 100644 index 0000000..fff19fa --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml @@ -0,0 +1,17 @@ +title: here, Iz Beltagy is missing from author, but still retrieved +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml new file mode 100644 index 0000000..d4e0025 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml @@ -0,0 +1,14 @@ +title: here, 2/3 authors are missing, we fail with jaccard index 0.5 +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml new file mode 100644 index 0000000..23d5a8d --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml @@ -0,0 +1,17 @@ +title: match, despite trailing whitespace +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text ", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index cf71f91..f36f9a4 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -18,11 +18,7 @@ def test_grobid_ref_to_release(): given_name='ahab', surname='sailor', ), - GrobidAuthor( - full_name='mary jane', - given_name='mary', - surname='jane' - ), + GrobidAuthor(full_name='mary jane', given_name='mary', surname='jane'), ], ) r = grobid_ref_to_release(d) diff --git a/tests/test_matching.py b/tests/test_matching.py index ad971a5..ca94c2a 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,13 +1,14 @@ +import collections import logging import warnings import elasticsearch import pytest import requests -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ReleaseEntity, ReleaseContrib -from fuzzycat.entities import entity_from_dict -from fuzzycat.matching import anything_to_entity, match_release_fuzzy +from fuzzycat.entities import entity_from_dict, entity_from_json +from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher warnings.filterwarnings( "ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ... 
@@ -18,6 +19,9 @@ from fatcat_openapi_client import ReleaseEntity import pytest import elasticsearch import logging +import yaml +import glob +import json logger = logging.getLogger('test_matching') logger.setLevel(logging.DEBUG) @@ -40,19 +44,35 @@ def is_reachable(url, timeout=3): return False +def yaml_to_cases(klass, + files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"): + """ + Turn yaml files into a collection of named tuple test cases. The glob is + relative to the project root (i.e. where you usually run `pytest` from). + """ + cases = [] + for path in glob.glob(files): + with open(path) as f: + doc = yaml.load(f, Loader=yaml.Loader) + cases.append(klass(**doc)) + return cases + + @pytest.fixture def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -@pytest.mark.skipif( - is_not_reachable(FATCAT_SEARCH_URL), - reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". - format(FATCAT_SEARCH_URL)) +# @pytest.mark.skipif( +# is_not_reachable(FATCAT_SEARCH_URL), +# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". +# format(FATCAT_SEARCH_URL)) def test_match_release_fuzzy(es_client, caplog): """ This test is tied to the current index contents, so if that changes, this test may fail as well. + + Note: Deprecated. We want to get rid of this. """ cases = ( ("wtv64ahbdzgwnan7rllwr3nurm", 1), @@ -106,3 +126,92 @@ def test_match_release_fuzzy(es_client, caplog): logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), [v.title for v in result])) assert len(result) == count, doc + + +def test_matcher_match_release(es_client, caplog): + cases = ( + ("wtv64ahbdzgwnan7rllwr3nurm", 1), + ("eqcgtpav3na5jh56o5vjsvb4ei", 1), + ) + matcher = FuzzyReleaseMatcher(es=es_client, size=5) + for i, (ident, count) in enumerate(cases): + entity = anything_to_entity(ident, ReleaseEntity) + result = matcher.match(entity) + logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) + assert len(result) == count + + # Partial data. 
+ cases = ( + ({ + "title": "digital libraries", + "ext_ids": {} + }, 5), + ({ + "title": "unlikelytitle", + "ext_ids": {} + }, 0), + ({ + "title": "Imminent dystopia", + "ext_ids": {} + }, 5), + ({ + "title": "", + "contribs": [{ + "raw_name": "Aristoteles" + }], + "ext_ids": {} + }, 5), + # ({ + # "title": "Letter", + # "contribs": [{"raw_name": "Claudel"}], + # "ext_ids": {} + # }, 1), + # ({ + # "title": "The Future of Digital Scholarship", + # "contribs": [{ + # "raw_name": "Costantino Thanos" + # }], + # "ext_ids": {} + # }, 5), + ) + for i, (doc, count) in enumerate(cases): + entity = entity_from_dict(doc, ReleaseEntity) + result = matcher.match(entity) + with caplog.at_level(logging.INFO): + logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), + [v.title for v in result])) + assert len(result) == count, doc + + +def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + cases = ( + ("wtv64ahbdzgwnan7rllwr3nurm", 1), + ("eqcgtpav3na5jh56o5vjsvb4ei", 1), + ) + for i, (ident, count) in enumerate(cases): + entity = anything_to_entity(ident, ReleaseEntity) + result = matcher.match_release_by_id(entity) + assert len(result) == count + + +def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + Case = collections.namedtuple("Case", "title date input expected") + cases = yaml_to_cases( + Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") + for i, c in enumerate(cases): + entity = entity_from_json(c.input, ReleaseEntity) + result = matcher.match_release_exact_title_exact_contrib(entity) + assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) + + +def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") + cases = yaml_to_cases( + Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") + for i, c in enumerate(cases): + entity = entity_from_json(c.input, ReleaseEntity) + result = matcher.match_release_exact_title_partial_contrib(entity) + assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) -- cgit v1.2.3
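A minimal usage sketch of the refactored matcher, mirroring the new tests above.
The search endpoint and the example document are illustrative (they match the
defaults and fixtures in this patch); a reachable elasticsearch index and fatcat
API are assumed:

    from fatcat_openapi_client import ReleaseEntity

    from fuzzycat.entities import entity_from_dict
    from fuzzycat.matching import FuzzyReleaseMatcher

    matcher = FuzzyReleaseMatcher(es="https://search.fatcat.wiki", size=10)
    release = entity_from_dict(
        {
            "title": "digital libraries",
            "contribs": [{"raw_name": "Michael Adams"}],
            "ext_ids": {},
        },
        ReleaseEntity,
    )
    # match() runs the query cascade: ids first, then exact title and contrib
    # variants, down to the generic fuzzy title query.
    candidates = matcher.match(release)
    for c in candidates:
        print(c.ident, c.title)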