aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin@archive.org>2021-11-16 19:06:26 +0000
committerMartin Czygan <martin@archive.org>2021-11-16 19:06:26 +0000
commit24dcddc4e4cff744e7c0a964856329d2ac69601d (patch)
treead8650892805e55ec4a6958f9e1539eb675332b8
parent282f315c6ba3643c8c614220ab2f7e1d55de3658 (diff)
parent409392d66c3a6debe5bc69c0e2308209ac74ee35 (diff)
downloadfuzzycat-24dcddc4e4cff744e7c0a964856329d2ac69601d.tar.gz
fuzzycat-24dcddc4e4cff744e7c0a964856329d2ac69601d.zip
Merge branch 'martin-matcher-class' into 'master'
turn "match_release_fuzzy" into a class See merge request webgroup/fuzzycat!10
-rw-r--r--TODO.md34
-rw-r--r--fuzzycat/contrib.py453
-rw-r--r--fuzzycat/grobid_unstructured.py2
-rw-r--r--fuzzycat/matching.py595
-rw-r--r--fuzzycat/sandcrawler.py5
-rw-r--r--fuzzycat/verify.py7
-rw-r--r--notes/2021_11_fuzzycat_refactoring.md87
-rw-r--r--setup.py4
-rw-r--r--tests/files/README.md5
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml13
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml13
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml17
-rw-r--r--tests/test_grobid_unstructured.py6
-rw-r--r--tests/test_matching.py123
24 files changed, 1410 insertions, 112 deletions
diff --git a/TODO.md b/TODO.md
index 5666bc0..9241b60 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,28 +1,32 @@
# TODO
* [ ] clustering should be broken up, e.g. into "map" and "sort"
+* [ ] match release fuzzy should work not just with title
+* [ ] match container name functions (maybe also with abbreviations, etc)
+* [ ] better documentation, more examples
+* [ ] shiv based packaging
+* [ ] author similarity should be broken up; easier to tweak
+* [ ] split up `verify`
+* [ ] configurable `verify`
+
+Other repos:
-In
-[refcat/skate](https://gitlab.com/internetarchive/refcat/-/tree/master/skate)
-we have one simple operation: extract a list of fields from blob of bytes. We
-use [16
-mappers](https://gitlab.com/internetarchive/refcat/-/blob/f33e586d11f5f575f71ad209608ac9ba74fad2e5/skate/cmd/skate-map/main.go#L70-86)
-currently, they are easy to write.
+* [refcat/skate](https://gitlab.com/internetarchive/refcat/-/tree/master/skate)
+
+In refcat we have one simple operation: extract a list of fields from blob of
+bytes. We use [16 mappers](https://is.gd/E0NEXj) currently, they are easy to
+write.
In refcat, we use GNU sort, and just when we need it, e.g.
-[skate-map](https://gitlab.com/internetarchive/refcat/-/blob/f33e586d11f5f575f71ad209608ac9ba74fad2e5/python/refcat/tasks.py#L531-534).
+[skate-map](https://is.gd/Kt9hvL).
The `Cluster` class bundles, iteration, key extraction, sorting and group by
operation into a single entity.
Also in refcat, we do not work on a single file with clusters any more, but
-mostly with two sorted streams, which are iterated over "comm" style. This
-spares us an extra step of generating the cluster documents, but requires an
-extra component, that allows to plug in various "reduce" functions. In refcat,
-this component is called "zipkey", which is support batching, too.
+mostly with two sorted streams, which are iterated over "mergesort/comm" style.
-* [ ] match release fuzzy should work not just with title
-* [ ] match container name functions (maybe also with abbreviations, etc)
-* [ ] better documentation, more examples
-* [ ] shiv based packaging
+This spares us an extra step of generating the cluster documents, but requires
+an extra component that allows plugging in various "reduce" functions. In
+refcat, this component is called "zipkey", which supports batching, too.
diff --git a/fuzzycat/contrib.py b/fuzzycat/contrib.py
new file mode 100644
index 0000000..93753ab
--- /dev/null
+++ b/fuzzycat/contrib.py
@@ -0,0 +1,453 @@
+"""
+Contrib related comparisons.
+
+Example: NgramContribMatcher, which compares two normalized raw name tokens
+with a jaccard index.
+
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ Ngram(n=3),
+ set,
+ ]))
+ result = matcher.compare(a, b)
+ ...
+
+Some notes from the dataset.
+
+* 692,893,828 contribs
+* 64,680,311 uniq
+
+Top contrib names, many have their name on datasets, which explains the high
+number.
+
+3069752 Kessy Abarenkov
+2383819 Leho Tedersoo
+2383748 Karl-Henrik Larsson
+2383745 Urmas Kõljalg
+2383699 Mohammad Bahram
+2383692 Martin Ryberg
+2382832 R. Henrik Nilsson
+1702455 Markus Döring
+1702427 Tom May
+1702415 Dmitry Schigel
+1702391 Santiago Sánchez-Ramírez
+ 841475 GLIS Of The ITPGRFA
+ 682144 James Scott
+ 682053 Michael Weiss
+ 681404 Susumu Takamatsu
+ 681388 A. Elizabeth Arnold
+ 681347 Artur Alves
+ 681341 Ellen Larsson
+ 681338 Maarja Öpik
+ 681335 Ursula Eberhardt
+ 681324 Nhu Nguyen
+ 681293 Otto Miettinen
+ 681292 Viacheslav Spirin
+ 681287 Gareth W. Griffith
+ 681283 Bálint Dima
+ 681278 Ursula Peintner
+ 681276 Tuula Niskanen
+ 681276 Olinto Liparini Pereira
+ 681275 Kare Liimatainen
+"""
+
+import collections
+import functools
+import itertools
+import logging
+import operator
+import re
+import string
+from typing import Any, Callable, List, Optional, Set
+
+import jellyfish
+import thefuzz
+from fatcat_openapi_client import ReleaseContrib
+
+logger = logging.getLogger("fuzzycat")
+
+
+class Ngram:
+ """
+ Turn a string into a list of overlapping tokens.
+ """
+ def __init__(self, n: int = 3):
+ if n < 1:
+ raise ValueError("positive n required")
+ self.n = n
+
+ def __call__(self, s: str) -> List[str]:
+ if 0 < len(s) < self.n:
+ return [s]
+ return [s[i:i + self.n] for i in range(len(s) - self.n + 1)]
+
+
+class JaccardIndexThreshold:
+ """
+ A Jaccard index threshold that can be used to compare two sets. Two empty
+ sets are equal.
+ """
+ def __init__(self, threshold: float = 0.5, verbose=False):
+ self.threshold = threshold
+ self.verbose = verbose
+
+ def __call__(self, a: Set, b: Set) -> bool:
+ if len(a) == 0 and len(b) == 0:
+ return True
+ index = len(a & b) / len(a | b)
+ if self.verbose:
+ logger.debug("[jaccard] {}".format(index))
+ return index >= self.threshold
+
+
+class FuzzyStringSimilarity:
+ """
+ For two sets of strings, run fuzzy matching with "thefuzz" -
+ https://github.com/seatgeek/thefuzz, which among other things uses
+ Levenshtein distance.
+
+ The min ratio can range from 0 to 100 (with 100 allowing exact matches
+ only).
+ """
+ def __init__(self, min_ratio=75):
+ self.min_ratio = min_ratio
+
+    def __call__(self, a: Set, b: Set) -> bool:
+        # NOTE: "thefuzz.exctractOne" was a typo (AttributeError at runtime);
+        # the function lives in the thefuzz.process submodule.
+        from thefuzz import process
+        if not a:
+            return False
+        agg = 0
+        for v in a:
+            _, score = process.extractOne(v, b)
+            agg += score
+        # Compare the average score across all elements, not just the score of
+        # the last element of the iteration.
+        return agg / len(a) > self.min_ratio
+
+
+class Pipeline:
+ """
+ A list of functions to execute, f -> g -> h, etc. Note that the output
+ type of f needs to match the input type of g, etc.
+ """
+ def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False):
+ self.verbose = verbose
+ if pipeline is None:
+ self.pipeline = [
+ lambda v: v,
+ ]
+ else:
+ self.pipeline = pipeline
+
+ def run(self, value: Any) -> Any:
+ v = value
+ for i, f in enumerate(self.pipeline, start=1):
+ v = f(v)
+ if self.verbose:
+ logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v))
+ return v
+
+ def __call__(self, value: Any, verbose: bool = False) -> Any:
+ self.verbose = verbose
+ return self.run(value)
+
+
+# default_release_contrib_pipeline normalizes the raw name.
+default_release_contrib_pipeline = Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+])
+
+# default_release_contrib_list_pipeline turns contribs list into a contrib set.
+default_release_contrib_list_pipeline = Pipeline([
+ lambda seq: set((c.raw_name for c in seq)),
+])
+
+
+class ContribMatcher:
+ """
+ Compare two contrib entities and determine a match status, based on some
+ configuration. The final values of the `pipeline` will be compared with
+ `cmp`, which by default is equality.
+
+ Other `cmp` options may generate ngrams and use jaccard index with some
+ threshold or decide on a string similarity metric.
+
+ This is essentially just a shell, the various comparison methods live in
+ the tuple (pipeline, cmp).
+ """
+ def __init__(self,
+ pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
+ cmp: Callable[[Any, Any], bool] = operator.__eq__):
+ self.pipeline = pipeline
+ self.cmp = cmp
+
+ def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool:
+ """
+ Compare returns True, if a and b are considered the same, given a
+ transformation pipeline and a comparison operation.
+ """
+ u = self.pipeline(a)
+ v = self.pipeline(b)
+ return self.cmp(u, v)
+
+
+class ContribListMatcher:
+ """
+ Compare two lists of contribs. Each contrib entry is passed through the
+ same pipeline.
+
+ Often two issues (separate or combined).
+
+ - contrib missing, e.g.
+ - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ...
+ """
+ def __init__(self,
+ pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
+ cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)):
+ self.pipeline = pipeline
+ self.cmp = cmp
+
+ def compare(self,
+ a: List[ReleaseContrib],
+ b: List[ReleaseContrib],
+ verbose: bool = False) -> bool:
+ """
+ Compare two lists of contribs, pass each one through the pipeline. The
+ result may be a list or any other type. The comparison function needs
+ to be compatible.
+ """
+ u = self.pipeline(a, verbose=verbose)
+ v = self.pipeline(b, verbose=verbose)
+ return self.cmp(u, v)
+
+
+def cleanup_single_ws(s: str) -> str:
+ return re.sub(r"[ ]{2,}", " ", s)
+
+
+def cleanup_remove_ws(s: str) -> str:
+ return re.sub(r"[\n\r\t\s]*", '', s)
+
+
+def cleanup_keep_letters_digits_ws(s: str) -> str:
+ return ''.join((c for c in s if c in string.ascii_letters + string.digits + " "))
+
+
+def test_cleanup_single_ws():
+ Case = collections.namedtuple("Case", "s result")
+ cases = (
+ Case("", ""),
+ Case("abc", "abc"),
+ Case("abc abc", "abc abc"),
+ Case("abc abc", "abc abc"),
+ Case(" abc abc", " abc abc"),
+ Case(" abc abc", " abc abc"),
+ )
+ for c in cases:
+ assert c.result == cleanup_single_ws(c.s)
+
+
+def test_cleanup_remove_ws():
+ Case = collections.namedtuple("Case", "s result")
+ cases = (
+ Case("", ""),
+ Case("abc", "abc"),
+ Case("abc abc", "abcabc"),
+ Case("abc abc", "abcabc"),
+ Case(" abc abc", "abcabc"),
+ )
+ for c in cases:
+ assert c.result == cleanup_remove_ws(c.s), c
+
+
+def test_ngram():
+ Case = collections.namedtuple("Case", "s n result")
+ cases = (
+ Case("", 1, []),
+ Case("", 2, []),
+ Case("a", 2, ["a"]),
+ Case("ab", 2, ["ab"]),
+ Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']),
+ Case("abcdef", 4, ['abcd', 'bcde', 'cdef']),
+ Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]),
+ )
+ for c in cases:
+ ngram = Ngram(n=c.n)
+ assert ngram(c.s) == c.result
+
+
+def test_pipeline():
+ Case = collections.namedtuple("Case", "pipeline input result")
+ cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower,
+ Ngram(n=3), set]), {"a": " X123 "}, {'123', 'x12'})),
+ for c in cases:
+ result = c.pipeline(c.input)
+ assert result == c.result
+
+
+def test_jaccard_index_threshold():
+ Case = collections.namedtuple("Case", "a b threshold result")
+ cases = (
+ Case(set(), set(), 1.0, True),
+ Case(set(), set(["a"]), 1.0, False),
+ Case(set(["a"]), set(["a"]), 1.0, True),
+ Case(set(["a"]), set(["a", "b"]), 1.0, False),
+ Case(set(["a"]), set(["a", "b"]), 0.5, True),
+ Case(set(["a"]), set(["a", "b", "c"]), 0.5, False),
+ )
+ for c in cases:
+ jit = JaccardIndexThreshold(threshold=c.threshold)
+ result = jit(c.a, c.b)
+ assert result == c.result
+
+
+def test_ngram_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ cleanup_remove_ws,
+ cleanup_keep_letters_digits_ws,
+ Ngram(n=3),
+ set,
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_jellyfish_soundex_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ functools.partial(re.sub, r"[.;]", " "),
+ cleanup_keep_letters_digits_ws,
+ lambda s: set((jellyfish.soundex(v) for v in s.split())),
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_jellyfish_nysiis_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ functools.partial(re.sub, r"[.;]", " "),
+ cleanup_keep_letters_digits_ws,
+ lambda s: set((jellyfish.nysiis(v) for v in s.split())),
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_default_contrib_list_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ [],
+ [],
+ True,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ True,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ [ReleaseContrib(raw_name="michael jordan")],
+ False,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Amadeu Llebaria")],
+ [ReleaseContrib(raw_name="A. Llebaria")],
+ False,
+ ),
+ )
+ matcher = ContribListMatcher()
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b, verbose=True)
+ assert result == c.result
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
index 7470cd7..1765f42 100644
--- a/fuzzycat/grobid_unstructured.py
+++ b/fuzzycat/grobid_unstructured.py
@@ -15,7 +15,7 @@ from typing import Optional
import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
-from grobid_tei_xml import parse_citation_xml, GrobidBiblio
+from grobid_tei_xml import GrobidBiblio, parse_citation_xml
from fuzzycat.config import settings
from fuzzycat.utils import clean_doi
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b358899..c83e48c 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,33 +1,479 @@
+import logging
import os
import re
import sys
-from typing import List, Optional, Type, Union
+from typing import Any, List, Optional, Type, Union
import elasticsearch
import elasticsearch_dsl
import fatcat_openapi_client
import requests
-from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity
+from fatcat_openapi_client import (ContainerEntity, DefaultApi, ReleaseContrib, ReleaseEntity)
from fatcat_openapi_client.rest import ApiException
from fuzzycat.config import settings
+from fuzzycat.contrib import (ContribListMatcher, FuzzyStringSimilarity, JaccardIndexThreshold,
+ Pipeline)
from fuzzycat.entities import entity_from_dict, entity_from_json
from fuzzycat.utils import es_compat_hits_total
FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
+class FuzzyReleaseMatcher:
+ """
+ FuzzyReleaseMatcher tries to find similar items to a given release in
+ elasticsearch. Exact matches first, then fuzzy.
+
+ In the best case, elasticsearch would automatically rank the most relevant
+ docs first, even with partial data. We still try to steer the matches by
+    using a query cascade. This is configurable. The last query should be a
+    generic one.
+
+    The goal here is to get a set of potential matches; verification has to
+    happen separately.
+
+ TODO:
+
+ Example case not yet working well ("Stuehrenberg" vs "Stührenberg"):
+
+ >>> result = matcher.match(entity_from_dict({"title": "internet archive",
+ "contribs": [{"raw_name":
+ "Stührenberg"}],
+ "ext_ids": {}},
+ ReleaseEntity))
+
+ > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy,
+ https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1
+ (not returning anything via frontend either)
+
+ Make sure we can switch from function to class:
+
+ * [ ] 5 test cases for both
+
+ """
+ def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10):
+ if isinstance(es, str):
+ self.es = elasticsearch.Elasticsearch([es])
+ else:
+ self.es = es if es else elasticsearch.Elasticsearch()
+ self.api = api if api else public_api(FATCAT_API_URL)
+ self.index = index
+ self.size = size
+ self.logger = logging.getLogger("fuzzy")
+
+ def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]:
+ """
+ Check for exact matches by identifier.
+ """
+ ext_ids = release.ext_ids
+ attrs = (
+ "doi",
+ "pmid",
+ "wikidata_qid",
+ "core",
+ "pmcid",
+ "arxiv",
+ "dblp",
+ "doaj",
+ "jstor",
+ "isbn13",
+ "ark",
+ "mag",
+ "oai",
+ )
+ for attr in attrs:
+ value = getattr(ext_ids, attr)
+ if not value:
+ continue
+ try:
+ r = self.api.lookup_release(**{attr: value})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status in [404, 400]:
+ r = None
+ else:
+ raise err
+ if r:
+ return [r]
+ return []
+
+ def match_release_exact_title_exact_contrib(self, release):
+ """
+ Match exact title and exact contrib names. Case insensitive, order of
+ contribs does not matter.
+ """
+ if release.title is None or release.contribs is None:
+ return []
+ contrib_queries = [{
+ "match": {
+ "contrib_names": {
+ "query": contrib.raw_name,
+ "operator": "AND",
+ }
+ }
+ } for contrib in release.contribs]
+ query = {
+ "bool": {
+ "must": [{
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ },
+ }
+ }] + contrib_queries,
+ },
+ }
+ result = []
+
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ hits_total = es_compat_hits_total(resp)
+ if hits_total == 0:
+ return result
+ if hits_total > self.size:
+            self.logger.warning('more than {} hits: {}'.format(self.size, hits_total))
+
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+
+ # Require overlap of contrib.
+ matcher = ContribListMatcher(
+ cmp=JaccardIndexThreshold(1.0),
+ pipeline=Pipeline([
+ lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
+ ]),
+ )
+
+        # "entity", not "re": the original loop variable shadowed the re module.
+        for entity in entities:
+            if entity.title.strip().lower() != release.title.strip().lower():
+                continue
+            if not matcher.compare(entity.contribs, release.contribs):
+                continue
+            result.append(entity)
+        return result
+
+ def match_release_exact_title_partial_contrib(self, release):
+ """
+ Allow for exact authors, but ok, if some are missing.
+ """
+ if release.title is None or release.contribs is None:
+ return []
+ contrib_queries = [{
+ "match": {
+ "contrib_names": {
+ "query": contrib.raw_name,
+ "operator": "AND",
+ }
+ }
+ } for contrib in release.contribs]
+ query = {
+ "bool": {
+ "must": [{
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ },
+ }
+ }] + contrib_queries,
+ },
+ }
+ result = []
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+            raise NotImplementedError('result set too large: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+
+ # Require at least half the contribs to be shared.
+ matcher = ContribListMatcher(
+ cmp=JaccardIndexThreshold(0.5),
+ pipeline=Pipeline([
+ lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
+ ]),
+ )
+
+        # "entity", not "re": the original loop variable shadowed the re module.
+        for entity in entities:
+            if entity.title.strip().lower() != release.title.strip().lower():
+                continue
+            if not matcher.compare(entity.contribs, release.contribs):
+                continue
+            result.append(entity)
+        return result
+
+ def match_release_exact_title_fuzzy_contrib(self, release):
+ """
+        Exact title, but ok if authors differ (slightly).
+ """
+ if release.title is None or release.contribs is None:
+ return []
+ contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+ contrib_queries = [{
+ "match": {
+ "contrib_names": {
+ "query": token,
+ }
+ }
+ } for token in contrib_tokens]
+ query = {
+ "bool": {
+ "must": [{
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ },
+ }
+ }] + contrib_queries,
+ },
+ }
+ result = []
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+            raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+
+ matcher = ContribListMatcher(
+ cmp=FuzzyStringSimilarity(min_ratio=60),
+ pipeline=Pipeline([
+ lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
+ ]),
+ )
+
+        # "entity", not "re": the original loop variable shadowed the re module.
+        for entity in entities:
+            if entity.title.strip().lower() != release.title.strip().lower():
+                continue
+            if not matcher.compare(entity.contribs, release.contribs):
+                continue
+            result.append(entity)
+        return result
+
+ def match_release_exact_title(self, release):
+ """
+ Exact title, but any author. For common titles, this will yield 100s or
+ 1000s or results.
+ """
+ if release.title is None:
+ return []
+ query = {
+ "bool": {
+ "must": [{
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ },
+ }
+ }],
+ },
+ }
+ result = []
+ resp = self.es.search(body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ },
+ index=self.index)
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+        # "entity", not "re": the original loop variable shadowed the re module.
+        for entity in entities:
+            if entity.title.strip().lower() != release.title.strip().lower():
+                continue
+            result.append(entity)
+        return result
+
+ def match_release_fuzzy_title_fuzzy_contrib(self, release):
+ """
+ Using elasticsearch fuzziness option (which is not that fuzzy).
+ """
+ if release.title is None or release.contribs is None:
+ return []
+ contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+ contrib_queries = [{
+ "match": {
+ "contrib_names": {
+ "query": token,
+ }
+ }
+ } for token in contrib_tokens]
+ query = {
+ "bool": {
+ "must": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ ] + contrib_queries,
+ },
+ }
+ result = []
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+ raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+ return entities
+
+ def match_release_generic(self, release):
+ """
+ Final catch all variant via title.
+ """
+ if release.title is None:
+ return []
+ query = {
+ "bool": {
+ "must": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "OR",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ ],
+ },
+ }
+ result = []
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+ return entities
+
+ def match_release_generic_fuzzy_contrib(self, release):
+ """
+ Only match contribs, if they exist.
+ """
+ if release.contribs is None:
+ return []
+ contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+ contrib_queries = [{
+ "match": {
+ "contrib_names": {
+ "query": token,
+ }
+ }
+ } for token in contrib_tokens]
+ query = {
+ "bool": {
+ "must": contrib_queries,
+ },
+ }
+ result = []
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+ return entities
+
+ def match_cascade(self, release, *qs, **kwargs):
+ """
+ Returns the result from the first query that returns a result. All query
+ functions need to be defined on this class (for now).
+ """
+ for q in qs:
+ self.logger.debug("[cascade] {}".format(q))
+ result = q(release, **kwargs)
+ if len(result) > 0:
+ return result
+ return []
+
+ def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
+ """
+ Match returns a list of match candidates given a release entity.
+ """
+ if not release:
+ return []
+ return self.match_cascade(
+ release, self.match_release_by_id, self.match_release_exact_title_exact_contrib,
+ self.match_release_exact_title_partial_contrib,
+ self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title,
+ self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic,
+ self.match_release_generic_fuzzy_contrib)
+
+
def match_release_fuzzy(
release: ReleaseEntity,
size: int = 5,
es: Optional[Union[str, Type[elasticsearch.client.Elasticsearch]]] = None,
api: DefaultApi = None,
+ index: str = "fatcat_release",
) -> List[ReleaseEntity]:
"""
Given a release entity, return a number similar release entities from
fatcat using Elasticsearch.
TODO: rename "es" parameter to "es_client", which would be clearer
+
+ This is deprecated, move to matcher class.
"""
assert isinstance(release, ReleaseEntity)
@@ -41,6 +487,17 @@ def match_release_fuzzy(
if api is None:
api = public_api(FATCAT_API_URL)
+ # > query cascade
+ #
+ # [x] 1 exact ids
+ # [ ] 2 exact title and exact contrib
+ # [ ] 3 exact title and fuzzy contrib
+ # [ ] 4 exact title
+ # [ ] 5 title w/o stopwords, fuzzy contrib
+ # [ ] 6 title w/o stopwords
+ # [ ] 7 fuzzy title and fuzzy contrib
+ # [ ] 8 fuzzy whole document
+
# Try to match by external identifier.
# TODO: use api, ability to disable; benchmark
ext_ids = release.ext_ids
@@ -75,105 +532,89 @@ def match_release_fuzzy(
if release.title is not None and release.contribs is not None:
names = " ".join([c.raw_name for c in release.contribs])
- body = {
- "track_total_hits": True,
- "query": {
- "bool": {
- "must": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
+ query = {
+ "bool": {
+ "must": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- },
- ],
- },
+ }
+ },
+ ],
},
- "size": size,
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
- body = {
- "track_total_hits": True,
- "query": {
- "bool": {
- "should": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
+ query = {
+ "bool": {
+ "should": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- },
- ],
- },
+ }
+ },
+ ],
},
- "size": size,
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
# Note: If the title is short, we will get lots of results here; do we need
# to check for title length or result set length length or result set
# length here?
- body = {
- "track_total_hits": True,
- "query": {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- }
+ query = {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
}
- },
- "size": size,
+ }
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
# Get fuzzy.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
- body = {
- "track_total_hits": True,
- "query": {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
+ query = {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- "size": size,
+ }
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py
index 958756a..63b85e6 100644
--- a/fuzzycat/sandcrawler.py
+++ b/fuzzycat/sandcrawler.py
@@ -1,6 +1,7 @@
-import regex
import unicodedata
+import regex
+
# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
SANDCRAWLER_CHAR_MAP = {
'\N{Latin capital letter AE}': 'AE',
@@ -63,6 +64,7 @@ SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
)
+
def sandcrawler_slugify(raw: str) -> str:
"""
Python re-implementation of sandcrawler Scala code for string comparison
@@ -155,4 +157,3 @@ def test_sandcrawler_slugify() -> None:
print(unicodedata.name(c))
print(in_str)
assert sandcrawler_slugify(in_str) == out_str
-
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9eb808b..f570511 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -233,10 +233,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
- if a_title == b_title and ((dict_has_key(a, "extra.subtitle")
- and not dict_has_key(b, "extra.subtitle")) or
- (dict_has_key(b, "extra.subtitle")
- and not dict_has_key(a, "extra.subtitle"))):
+ if a_title == b_title and (
+ (dict_has_key(a, "extra.subtitle") and not dict_has_key(b, "extra.subtitle")) or
+ (dict_has_key(b, "extra.subtitle") and not dict_has_key(a, "extra.subtitle"))):
return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)
except PathAccessError:
pass
diff --git a/notes/2021_11_fuzzycat_refactoring.md b/notes/2021_11_fuzzycat_refactoring.md
new file mode 100644
index 0000000..171cee3
--- /dev/null
+++ b/notes/2021_11_fuzzycat_refactoring.md
@@ -0,0 +1,87 @@
+# Proposal: Fuzzycat Refactoring
+
+* Goal: Refactor fuzzycat to make matching and verification more composable,
+ configurable and testable.
+* Status: wip
+
+A better design.
+
+* it has a correct scope (e.g. match X; verify Y)
+* it has good defaults, but allows configuration
+* it is clear how and where to extend functionality
+* it is easy to add one new test for a case
+
+## Matching
+
+* fuzzy matching will be a cascade of queries, until a result is returned
+* there is an order of queries from exact to very fuzzy
+* alternatively, we could use "ensemble matching", that takes the intersection of a couple of queries
+* ES queries cannot cover all cases, we need to add additional checks; e.g. author list comparison
+
+Example
+
+ FuzzyReleaseMatcher
+ match_release_id
+ match_release_exact_title_exact_contrib
+ match_release_...
+
+ match_release_fuzzy (runs a cascade of queries)
+
+Each function is testable on its own. The class keeps the es client and other
+global config around. Its scope is clear: given a "release" (or maybe just a
+title string), generate a list of potentially related releases.
+
+Other entities follow the same pattern.
+
+ FuzzyContainerMatcher
+ match_container_id
+ match_container_issn
+ match_container_abbreviation
+ match_container_...
+
+ match_container_fuzzy (runs a cascade of queries)
+
+A helper object (not exactly the entity) for matching lists of authors. Allows
+to match by various means, e.g. exact, short names, partial lists, etc. Should
+account for case, order, etc.
+
+ FuzzyContribsMatcher
+ match_exact
+ match_short_names
+ match_partial_list
+
+ match_fuzzy
+
+For each method in each matcher class, we can construct a test case only for
+one particular method. A new method can be added with ease and tested separately.
+
+Don't know how yet, but we can create some "profiles" that allow for a matching
+by a set of methods. Or use good defaults on the higher level `_fuzzy(...)` method.
+
+NOTE: the matcher classes could use the verification code internally; generate
+a list of matches with an es query, then use a configured verifier to generate
+verified matches; only put comparison code into verification module.
+
+## Verification (comparison)
+
+Verification works similarly. For each entity we define a set of methods, verifying a specific aspect.
+
+ FuzzyReleaseVerifier
+ verify_release_id
+ verify_release_ext_id
+ verify_release_title_exact_match
+ verify_release_title_contrib_exact_match
+ verify_release_...
+
+ verify(a, b) -> (Status, Reason)
+
+A large number of test cases are already there, may need a bit better structure
+to relate cases to methods. The class can hold global configuration, maybe some
+cached computed properties, if that helps.
+
+
+ FuzzyContainerVerifier
+ verify_container_id
+ ...
+
+
diff --git a/setup.py b/setup.py
index cb95ef4..ced393f 100644
--- a/setup.py
+++ b/setup.py
@@ -33,13 +33,15 @@ with open("README.md", "r") as fh:
"fatcat-openapi-client>=0.4.0", # https://pypi.org/project/fatcat-openapi-client/
"ftfy",
"glom",
+ "grobid_tei_xml==0.1.*",
"jellyfish",
+ "pyyaml",
"regex",
"requests>=2",
+ "thefuzz",
"toml",
"unidecode>=0.10",
"zstandard",
- "grobid_tei_xml==0.1.*",
],
extras_require={"dev": [
"ipython",
diff --git a/tests/files/README.md b/tests/files/README.md
new file mode 100644
index 0000000..ef674d6
--- /dev/null
+++ b/tests/files/README.md
@@ -0,0 +1,5 @@
+# Matcher Test Files
+
+The goal here is to have mostly language-independent test cases for matching.
+
+Each subdirectory corresponds to a test function and contains examples for it.
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml
new file mode 100644
index 0000000..2df8d9a
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml
@@ -0,0 +1,13 @@
+title: titles are case insensitive
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml
new file mode 100644
index 0000000..1070408
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml
@@ -0,0 +1,13 @@
+title: another vanilla query
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Poul-Henning Kamp"
+ }
+ ],
+ "title": "The hyperdimensional tar pit",
+ "ext_ids": {}
+ }
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml
new file mode 100644
index 0000000..882e746
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml
@@ -0,0 +1,16 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Maurice Florence"
+ },
+ {
+ "raw_name": "Tuomo Tiisala"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml
new file mode 100644
index 0000000..0a2ad12
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml
@@ -0,0 +1,16 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "Maurice Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml
new file mode 100644
index 0000000..36ea0fe
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml
@@ -0,0 +1,16 @@
+title: short version of name should not work
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "M. Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml
new file mode 100644
index 0000000..07230e8
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml
@@ -0,0 +1,14 @@
+title: titles are case insensitive
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml
new file mode 100644
index 0000000..62e9586
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml
@@ -0,0 +1,14 @@
+title: another vanilla query
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Poul-Henning Kamp"
+ }
+ ],
+ "title": "The hyperdimensional tar pit",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml
new file mode 100644
index 0000000..b89e825
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml
@@ -0,0 +1,17 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Maurice Florence"
+ },
+ {
+ "raw_name": "Tuomo Tiisala"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml
new file mode 100644
index 0000000..3de7262
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml
@@ -0,0 +1,17 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "Maurice Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml
new file mode 100644
index 0000000..39fb065
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml
@@ -0,0 +1,17 @@
+title: short version of name should not work
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "M. Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml
new file mode 100644
index 0000000..fff19fa
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml
@@ -0,0 +1,17 @@
+title: here, Iz Beltagy is missing from author, but still retrieved
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ },
+ {
+ "raw_name": "Kyle Lo"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 3
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml
new file mode 100644
index 0000000..d4e0025
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml
@@ -0,0 +1,14 @@
+title: here, 2/3 authors are missing, we fail with jaccard index 0.5
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml
new file mode 100644
index 0000000..23d5a8d
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml
@@ -0,0 +1,17 @@
+title: match, despite trailing whitespace
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ },
+ {
+ "raw_name": "Kyle Lo"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text ",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 3
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
index cf71f91..f36f9a4 100644
--- a/tests/test_grobid_unstructured.py
+++ b/tests/test_grobid_unstructured.py
@@ -18,11 +18,7 @@ def test_grobid_ref_to_release():
given_name='ahab',
surname='sailor',
),
- GrobidAuthor(
- full_name='mary jane',
- given_name='mary',
- surname='jane'
- ),
+ GrobidAuthor(full_name='mary jane', given_name='mary', surname='jane'),
],
)
r = grobid_ref_to_release(d)
diff --git a/tests/test_matching.py b/tests/test_matching.py
index ad971a5..ca94c2a 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -1,13 +1,14 @@
+import collections
import logging
import warnings
import elasticsearch
import pytest
import requests
-from fatcat_openapi_client import ReleaseEntity
+from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-from fuzzycat.entities import entity_from_dict
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fuzzycat.entities import entity_from_dict, entity_from_json
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher
warnings.filterwarnings(
"ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ...
@@ -18,6 +19,9 @@ from fatcat_openapi_client import ReleaseEntity
import pytest
import elasticsearch
import logging
+import yaml
+import glob
+import json
logger = logging.getLogger('test_matching')
logger.setLevel(logging.DEBUG)
@@ -40,19 +44,35 @@ def is_reachable(url, timeout=3):
return False
+def yaml_to_cases(klass,
+ files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"):
+ """
+ Turn yaml files into a collection of named tuple test cases. The glob is
+ relative to the project root (i.e. where you usually run `pytest` from).
+ """
+ cases = []
+ for path in glob.glob(files):
+ with open(path) as f:
+ doc = yaml.load(f, Loader=yaml.Loader)
+ cases.append(klass(**doc))
+ return cases
+
+
@pytest.fixture
def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-@pytest.mark.skipif(
- is_not_reachable(FATCAT_SEARCH_URL),
- reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".
- format(FATCAT_SEARCH_URL))
+# @pytest.mark.skipif(
+# is_not_reachable(FATCAT_SEARCH_URL),
+# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".
+# format(FATCAT_SEARCH_URL))
def test_match_release_fuzzy(es_client, caplog):
"""
This test is tied to the current index contents, so if that changes, this
test may fail as well.
+
+ Note: Deprecated. We want to get rid of this.
"""
cases = (
("wtv64ahbdzgwnan7rllwr3nurm", 1),
@@ -106,3 +126,92 @@ def test_match_release_fuzzy(es_client, caplog):
logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
[v.title for v in result]))
assert len(result) == count, doc
+
+
+def test_matcher_match_release(es_client, caplog):
+ cases = (
+ ("wtv64ahbdzgwnan7rllwr3nurm", 1),
+ ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
+ )
+ matcher = FuzzyReleaseMatcher(es=es_client, size=5)
+ for i, (ident, count) in enumerate(cases):
+ entity = anything_to_entity(ident, ReleaseEntity)
+ result = matcher.match(entity)
+ logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
+ assert len(result) == count
+
+ # Partial data.
+ cases = (
+ ({
+ "title": "digital libraries",
+ "ext_ids": {}
+ }, 5),
+ ({
+ "title": "unlikelytitle",
+ "ext_ids": {}
+ }, 0),
+ ({
+ "title": "Imminent dystopia",
+ "ext_ids": {}
+ }, 5),
+ ({
+ "title": "",
+ "contribs": [{
+ "raw_name": "Aristoteles"
+ }],
+ "ext_ids": {}
+ }, 5),
+ # ({
+ # "title": "Letter",
+ # "contribs": [{"raw_name": "Claudel"}],
+ # "ext_ids": {}
+ # }, 1),
+ # ({
+ # "title": "The Future of Digital Scholarship",
+ # "contribs": [{
+ # "raw_name": "Costantino Thanos"
+ # }],
+ # "ext_ids": {}
+ # }, 5),
+ )
+ for i, (doc, count) in enumerate(cases):
+ entity = entity_from_dict(doc, ReleaseEntity)
+ result = matcher.match(entity)
+ with caplog.at_level(logging.INFO):
+ logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
+ [v.title for v in result]))
+ assert len(result) == count, doc
+
+
+def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ cases = (
+ ("wtv64ahbdzgwnan7rllwr3nurm", 1),
+ ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
+ )
+ for i, (ident, count) in enumerate(cases):
+ entity = anything_to_entity(ident, ReleaseEntity)
+ result = matcher.match_release_by_id(entity)
+ assert len(result) == count
+
+
+def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ Case = collections.namedtuple("Case", "title date input expected")
+ cases = yaml_to_cases(
+ Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
+ for i, c in enumerate(cases):
+ entity = entity_from_json(c.input, ReleaseEntity)
+ result = matcher.match_release_exact_title_exact_contrib(entity)
+ assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+
+
+def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
+ cases = yaml_to_cases(
+ Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+ for i, c in enumerate(cases):
+ entity = entity_from_json(c.input, ReleaseEntity)
+ result = matcher.match_release_exact_title_partial_contrib(entity)
+ assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)