aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-11-17 14:51:50 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-12-06 19:53:30 +0100
commitdd6149140542585f2b0bfc3b334ec2b0a88b790e (patch)
tree6a11c228558cfbf73932bc828cda9be3735cfd78
parentd104f8d0ba8eef5563555de82be66bbf17f961db (diff)
downloadfuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz
fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip
complete FuzzyReleaseMatcher refactoring
We keep the name, since the api - "matcher.match(release)" - is the same; simplified queries; at most one query is performed against elasticsearch; parallel release retrieval from the API; optional support for release year windows; Test cases are expressed in yaml and will be auto-loaded from the specified directory; test work against the current search endpoint, which means the actual output may change on index updates; for the moment, we think this setup is relatively simple and not too unstable. about: title contrib, partial name input: > { "contribs": [ { "raw_name": "Adams" } ], "title": "digital libraries", "ext_ids": {} } release_year_padding: 1 expected: - 7rmvqtrb2jdyhcxxodihzzcugy - a2u6ougtsjcbvczou6sazsulcm - dy45vilej5diros6zmax46nm4e - exuwhhayird4fdjmmsiqpponlq - gqrj7jikezgcfpjfazhpf4e7c4 - mkmqt3453relbpuyktnmsg6hjq - t2g5sl3dgzchtnq7dynxyzje44 - t4tvenhrvzamraxrvvxivxmvga - wd3oeoi3bffknfbg2ymleqc4ja - y63a6dhrfnb7bltlxfynydbojy
-rw-r--r--TODO.md5
-rw-r--r--fuzzycat/matching.py479
-rw-r--r--notes/es_fuzzy_queries/README.md1
-rw-r--r--tests/files/simple_fuzzy_release_matcher/0.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/1.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/2.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/3.yaml19
-rw-r--r--tests/files/simple_fuzzy_release_matcher/4.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/5.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/6.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/7.yaml10
-rw-r--r--tests/files/simple_fuzzy_release_matcher/8.yaml139
-rw-r--r--tests/files/simple_fuzzy_release_matcher/9.yaml139
-rw-r--r--tests/test_matching.py94
14 files changed, 644 insertions, 362 deletions
diff --git a/TODO.md b/TODO.md
index d9d8b02..414c972 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,5 +1,10 @@
# TODO
+* [ ] match release with fewer requests (or do them in parallel)
+* [ ] de-clobber verify
+
+----
+
* [ ] clustering should be broken up, e.g. into "map" and "sort"
* [x] match release should be a class
* [x] match release fuzzy should work not just with title
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index 2984d9a..cb6acbb 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,7 +1,9 @@
+import collections
import logging
import os
import re
import sys
+from multiprocessing.dummy import Pool
from typing import Any, List, Optional, Type, Union
import elasticsearch
@@ -22,37 +24,24 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
class FuzzyReleaseMatcher:
"""
- FuzzyReleaseMatcher tries to find similar items to a given release in
- elasticsearch. Exact matches first, then fuzzy.
+ This is a helper class to fetch related documents to a given release
+ document from fatcat search (currently elasticsearch). Elasticsearch should
+ rank similar documents high itself, so all we try to do here is to tweak
+ the specific query a bit, depending on the completeness of the input
+ document, e.g. if the input has contrib and title, then use both, if it
+ only has a title, then use just that, etc.
- In the best case, elasticsearch would automatically rank the most relevant
- docs first, even with partial data. We still try to steer the matches by
- using a query cascade. This is configurable. The last query should be a
- generic.
-
- The goal here is to get a set of potential matches; verification has to.
- happen separately.
-
- TODO:
-
- Example case not yet working well ("Stuehrenberg" vs "Stührenberg"):
-
- >>> result = matcher.match(entity_from_dict({"title": "internet archive",
- "contribs": [{"raw_name":
- "Stührenberg"}],
- "ext_ids": {}},
- ReleaseEntity))
-
- > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy,
- https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1
- (not returning anything via frontend either)
-
- Make sure we can switch from function to class:
-
- * [ ] 5 test cases for both
+ We try to get the result in a single query.
+ TODO/Tweaks: e.g. if the document has a "release_year", add this as a "should" clause.
"""
- def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10):
+ def __init__(self,
+ es="https://search.fatcat.wiki",
+ api=None,
+ index="fatcat_release",
+ size=10,
+ min_token_length=3,
+ release_year_padding=1):
if isinstance(es, str):
self.es = elasticsearch.Elasticsearch([es])
else:
@@ -61,8 +50,10 @@ class FuzzyReleaseMatcher:
self.index = index
self.size = size
self.logger = logging.getLogger("fuzzy")
+ self.min_token_length = min_token_length
+ self.release_year_padding = 1
- def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]:
+ def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
Check for exact matches by identifier.
"""
@@ -97,229 +88,10 @@ class FuzzyReleaseMatcher:
return [r]
return []
- def match_release_exact_title_exact_contrib(self, release):
- """
- Match exact title and exact contrib names. Case insensitive, order of
- contribs does not matter.
- """
- if release.title is None or release.contribs is None:
- return []
- contrib_queries = [{
- "match": {
- "contrib_names": {
- "query": contrib.raw_name,
- "operator": "AND",
- }
- }
- } for contrib in release.contribs]
- query = {
- "bool": {
- "must": [{
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- },
- }
- }] + contrib_queries,
- },
- }
- result = []
-
- resp = self.es.search(index=self.index,
- body={
- "query": query,
- "size": self.size,
- "track_total_hits": True
- })
- hits_total = es_compat_hits_total(resp)
- if hits_total == 0:
- return result
- if hits_total > self.size:
- self.logger.warn('more than {} hits: {}'.format(self.size, hits_total))
-
- entities = response_to_entity_list(resp,
- entity_type=ReleaseEntity,
- size=self.size,
- api=self.api)
-
- # Require overlap of contrib.
- matcher = ContribListMatcher(
- cmp=JaccardIndexThreshold(1.0),
- pipeline=Pipeline([
- lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
- ]),
- )
-
- for re in entities:
- if re.title.strip().lower() != release.title.strip().lower():
- continue
- if not matcher.compare(re.contribs, release.contribs):
- continue
- result.append(re)
- return result
-
- def match_release_exact_title_partial_contrib(self, release):
- """
- Allow for exact authors, but ok, if some are missing.
- """
- if release.title is None or release.contribs is None:
- return []
- contrib_queries = [{
- "match": {
- "contrib_names": {
- "query": contrib.raw_name,
- "operator": "AND",
- }
- }
- } for contrib in release.contribs]
- query = {
- "bool": {
- "must": [{
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- },
- }
- }] + contrib_queries,
- },
- }
- result = []
- resp = self.es.search(index=self.index,
- body={
- "query": query,
- "size": self.size,
- "track_total_hits": True
- })
- if es_compat_hits_total(resp) == 0:
- return result
- if es_compat_hits_total(resp) > self.size:
- raise NotImplementedError('result set too large: {}'.format(es))
- entities = response_to_entity_list(resp,
- entity_type=ReleaseEntity,
- size=self.size,
- api=self.api)
-
- # Require at least half the contribs to be shared.
- matcher = ContribListMatcher(
- cmp=JaccardIndexThreshold(0.5),
- pipeline=Pipeline([
- lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
- ]),
- )
-
- for re in entities:
- if re.title.strip().lower() != release.title.strip().lower():
- continue
- if not matcher.compare(re.contribs, release.contribs):
- continue
- result.append(re)
- return result
-
- def match_release_exact_title_fuzzy_contrib(self, release):
- """
- Exact title but ok it authors differ (slightly).
- """
- if release.title is None or release.contribs is None:
- return []
- contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
- contrib_queries = [{
- "match": {
- "contrib_names": {
- "query": token,
- }
- }
- } for token in contrib_tokens]
- query = {
- "bool": {
- "must": [{
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- },
- }
- }] + contrib_queries,
- },
- }
- result = []
- resp = self.es.search(index=self.index,
- body={
- "query": query,
- "size": self.size,
- "track_total_hits": True
- })
- if es_compat_hits_total(resp) == 0:
- return result
- if es_compat_hits_total(resp) > self.size:
- raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es))
- entities = response_to_entity_list(resp,
- entity_type=ReleaseEntity,
- size=self.size,
- api=self.api)
-
- matcher = ContribListMatcher(
- cmp=FuzzyStringSimilarity(min_ratio=60),
- pipeline=Pipeline([
- lambda contribs: set((c.raw_name.strip().lower() for c in contribs)),
- ]),
- )
-
- for re in entities:
- if re.title.strip().lower() != release.title.strip().lower():
- continue
- if not matcher.compare(re.contribs, release.contribs):
- continue
- result.append(re)
- return result
-
- def match_release_exact_title(self, release):
+ def _match_title_contrib(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Exact title, but any author. For common titles, this will yield 100s or
- 1000s or results.
+ Match in the presence of defined title and contrib fields.
"""
- if release.title is None:
- return []
- query = {
- "bool": {
- "must": [{
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- },
- }
- }],
- },
- }
- result = []
- resp = self.es.search(body={
- "query": query,
- "size": self.size,
- "track_total_hits": True
- },
- index=self.index)
- if es_compat_hits_total(resp) == 0:
- return result
- if es_compat_hits_total(resp) > self.size:
- self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp)))
- entities = response_to_entity_list(resp,
- entity_type=ReleaseEntity,
- size=self.size,
- api=self.api)
- for re in entities:
- if re.title.strip().lower() != release.title.strip().lower():
- continue
- result.append(re)
- return result
-
- def match_release_fuzzy_title_fuzzy_contrib(self, release):
- """
- Using elasticsearch fuzziness option (which is not that fuzzy).
- """
- if release.title is None or release.contribs is None:
- return []
contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
contrib_queries = [{
"match": {
@@ -343,7 +115,18 @@ class FuzzyReleaseMatcher:
] + contrib_queries,
},
}
+ if release.release_year is not None:
+ query["bool"]["must"].append({
+ "range": {
+ "year": {
+ "gte": release.release_year - self.release_year_padding,
+ "lte": release.release_year + self.release_year_padding,
+ "boost": 0.5,
+ }
+ }
+ })
result = []
+ self.logger.info(query)
resp = self.es.search(index=self.index,
body={
"query": query,
@@ -353,19 +136,17 @@ class FuzzyReleaseMatcher:
if es_compat_hits_total(resp) == 0:
return result
if es_compat_hits_total(resp) > self.size:
- raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp)))
+ self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
entities = response_to_entity_list(resp,
entity_type=ReleaseEntity,
size=self.size,
api=self.api)
return entities
- def match_release_generic(self, release):
+ def _match_title(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Final catch all variant via title.
+ Match in the presence of a title.
"""
- if release.title is None:
- return []
query = {
"bool": {
"must": [
@@ -373,7 +154,7 @@ class FuzzyReleaseMatcher:
"match": {
"title": {
"query": release.title,
- "operator": "OR",
+ "operator": "AND",
"fuzziness": "AUTO",
},
}
@@ -381,6 +162,16 @@ class FuzzyReleaseMatcher:
],
},
}
+ if release.release_year is not None:
+ query["bool"]["must"].append({
+ "range": {
+ "year": {
+ "gte": release.release_year - self.release_year_padding,
+ "lte": release.release_year + self.release_year_padding,
+ "boost": 0.5,
+ }
+ }
+ })
result = []
resp = self.es.search(index=self.index,
body={
@@ -391,19 +182,17 @@ class FuzzyReleaseMatcher:
if es_compat_hits_total(resp) == 0:
return result
if es_compat_hits_total(resp) > self.size:
- self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp)))
+ self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
entities = response_to_entity_list(resp,
entity_type=ReleaseEntity,
size=self.size,
api=self.api)
return entities
- def match_release_generic_fuzzy_contrib(self, release):
+ def _match_contribs(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Only match contribs, if they exist.
+ Match in the presence of contribs (and no title).
"""
- if release.contribs is None:
- return []
contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
contrib_queries = [{
"match": {
@@ -417,6 +206,16 @@ class FuzzyReleaseMatcher:
"must": contrib_queries,
},
}
+ if release.release_year is not None:
+ query["bool"]["must"].append({
+ "range": {
+ "year": {
+ "gte": release.release_year - self.release_year_padding,
+ "lte": release.release_year + self.release_year_padding,
+ "boost": 0.5,
+ }
+ }
+ })
result = []
resp = self.es.search(index=self.index,
body={
@@ -427,37 +226,78 @@ class FuzzyReleaseMatcher:
if es_compat_hits_total(resp) == 0:
return result
if es_compat_hits_total(resp) > self.size:
- self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp)))
+ self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
entities = response_to_entity_list(resp,
entity_type=ReleaseEntity,
size=self.size,
api=self.api)
return entities
- def match_cascade(self, release, *qs, **kwargs):
+ def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Returns the result from the first query that returns a result. All query
- functions need to be defined on this class (for now).
+ Throw tokens at elasticsearch.
"""
- for q in qs:
- self.logger.debug("[cascade] {}".format(q))
- result = q(release, **kwargs)
- if len(result) > 0:
- return result
- return []
+ token_queries = [
+ {
+ "match": {
+ "biblio": { # https://git.io/JMXvJ
+ "query": token,
+ }
+ }
+ } for token in release_tokens(release) if len(token) > self.min_token_length
+ ]
+ query = {
+ "bool": {
+ "must": token_queries,
+ },
+ }
+ if release.release_year is not None:
+ query["bool"]["must"].append({
+ "range": {
+ "year": {
+ "gte": release.release_year - self.release_year_padding,
+ "lte": release.release_year + self.release_year_padding,
+ "boost": 0.5,
+ }
+ }
+ })
+ result = []
+ self.logger.info(query)
+ resp = self.es.search(index=self.index,
+ body={
+ "query": query,
+ "size": self.size,
+ "track_total_hits": True
+ })
+ if es_compat_hits_total(resp) == 0:
+ return result
+ if es_compat_hits_total(resp) > self.size:
+ self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))
+ entities = response_to_entity_list(resp,
+ entity_type=ReleaseEntity,
+ size=self.size,
+ api=self.api)
+ return entities
def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Match returns a list of match candidates given a release entity.
+ Match dispatches methods based on which fields are defined on the
+ document.
"""
if not release:
return []
- return self.match_cascade(
- release, self.match_release_by_id, self.match_release_exact_title_exact_contrib,
- self.match_release_exact_title_partial_contrib,
- self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title,
- self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic,
- self.match_release_generic_fuzzy_contrib)
+ if release.ext_ids and len(release.ext_ids.to_dict()) > 0:
+ result = self._match_id(release)
+ if release.title is not None and release.contribs is not None:
+ result = self._match_title_contrib(release)
+ elif release.title is not None:
+ result = self._match_title(release)
+ elif release.contribs is not None:
+ result = self._match_contribs(release)
+ else:
+ result = self._match_generic(release)
+
+ return result
def public_api(host_uri):
@@ -471,14 +311,97 @@ def public_api(host_uri):
return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+def release_tokens(release: ReleaseEntity) -> List[str]:
+    """
+    Turn a release into a flat list of whitespace-separated tokens.
+
+    Every non-None top-level field of ``release.to_dict()`` (except
+    "ext_ids") is stringified with ``str`` and split on whitespace; string
+    values found under "ext_ids" are split the same way and appended last.
+    Duplicates are kept, so the result is a list, not a set.
+    """
+    tokens = []
+    red = release.to_dict()
+    for k, v in red.items():
+        if v is None or k == "ext_ids":
+            continue
+        tokens.extend(str(v).split())
+    # "ext_ids" may be present but None; `.get("ext_ids", {})` would hand back
+    # that None and then fail on `.items()`, hence the `or {}`.
+    for v in (red.get("ext_ids") or {}).values():
+        if not isinstance(v, str):
+            continue
+        tokens.extend(v.split())
+
+    return tokens
+
+
+def test_release_tokens():
+    """
+    Check release_tokens against a few hand-written release documents.
+    """
+    table = [
+        ({"ext_ids": {}}, []),
+        ({"ext_ids": {}, "title": "Flow my tears"}, ["Flow", "my", "tears"]),
+        (
+            {
+                "ext_ids": {},
+                "subtitle": "An illustrated guide",
+                "release_year": 1981,
+            },
+            ["An", "illustrated", "guide", "1981"],
+        ),
+    ]
+    # Build all entities up front, before any assertion runs.
+    prepared = [(entity_from_dict(doc, ReleaseEntity), expected) for doc, expected in table]
+    for entity, expected in prepared:
+        assert release_tokens(entity) == expected
+
+
+def fetch_release(ident, api=None):
+    """
+    Return the release entity for the given identifier, or None.
+
+    Uses the given api client, or a client for FATCAT_API_URL if none is
+    passed. Refs and abstracts are hidden; container, contribs and files are
+    expanded. On an ApiException a message is written to stderr and None is
+    returned.
+    """
+    if api is None:
+        api = public_api(FATCAT_API_URL)
+    try:
+        re = api.get_release(ident, hide="refs,abstracts", expand="container,contribs,files")
+    except ApiException as exc:
+        if exc.status == 404:
+            # Report the requested identifier; the builtin `id` function used
+            # to be formatted into this message by mistake.
+            print("[err] failed to retrieve release entity: {}".format(ident), file=sys.stderr)
+        else:
+            # NOTE(review): not every ApiException may carry `message`; fall
+            # back to the exception itself so this handler cannot raise.
+            detail = getattr(exc, "message", exc)
+            print("[err] api failed with {}: {}".format(exc.status, detail), file=sys.stderr)
+        return None
+    else:
+        return re
+
+
def retrieve_entity_list(
ids: List[str],
api: DefaultApi = None,
entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
"""
+    Fetch the releases for the given ids with a pool of ten threads.
+
+    Identifiers for which fetch_release returned None are dropped from the
+    result. A client passed as `api` is handed on to every fetch_release
+    call; without one, each call sets up its own default client. Only
+    ReleaseEntity is supported, any other entity_type raises ValueError.
+    """
+    if entity_type != ReleaseEntity:
+        raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type))
+
+    # Previously `api` was accepted but never reached fetch_release, so a
+    # caller-supplied client was silently ignored.
+    # NOTE(review): an explicitly passed client is now shared between the
+    # worker threads - confirm the generated client tolerates that.
+    with Pool(10) as p:
+        result = p.map(lambda ident: fetch_release(ident, api=api), ids)
+    return [v for v in result if v is not None]
+
+
+def retrieve_entity_list_sequential(
+ ids: List[str],
+ api: DefaultApi = None,
+ entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
+) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
+ """
Retrieve a list of entities. Some entities might be missing. Return all
that are accessible.
+
+ TODO: parallelize API access.
"""
if api is None:
api = public_api(FATCAT_API_URL)
diff --git a/notes/es_fuzzy_queries/README.md b/notes/es_fuzzy_queries/README.md
new file mode 100644
index 0000000..f69d5ea
--- /dev/null
+++ b/notes/es_fuzzy_queries/README.md
@@ -0,0 +1 @@
+# ES query examples
diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml
new file mode 100644
index 0000000..71fc992
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/0.yaml
@@ -0,0 +1,16 @@
+about: title and contrib
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - upm5nljirrbsfenoyxsisciltq
+ - wd3oeoi3bffknfbg2ymleqc4ja
diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml
new file mode 100644
index 0000000..df6a954
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/1.yaml
@@ -0,0 +1,24 @@
+about: title contrib, partial name
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - a2u6ougtsjcbvczou6sazsulcm
+ - dy45vilej5diros6zmax46nm4e
+ - exuwhhayird4fdjmmsiqpponlq
+ - gqrj7jikezgcfpjfazhpf4e7c4
+ - mkmqt3453relbpuyktnmsg6hjq
+ - t2g5sl3dgzchtnq7dynxyzje44
+ - t4tvenhrvzamraxrvvxivxmvga
+ - wd3oeoi3bffknfbg2ymleqc4ja
+ - y63a6dhrfnb7bltlxfynydbojy
+
diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml
new file mode 100644
index 0000000..df6a954
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/2.yaml
@@ -0,0 +1,24 @@
+about: title contrib, partial name
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - a2u6ougtsjcbvczou6sazsulcm
+ - dy45vilej5diros6zmax46nm4e
+ - exuwhhayird4fdjmmsiqpponlq
+ - gqrj7jikezgcfpjfazhpf4e7c4
+ - mkmqt3453relbpuyktnmsg6hjq
+ - t2g5sl3dgzchtnq7dynxyzje44
+ - t4tvenhrvzamraxrvvxivxmvga
+ - wd3oeoi3bffknfbg2ymleqc4ja
+ - y63a6dhrfnb7bltlxfynydbojy
+
diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml
new file mode 100644
index 0000000..1ab761b
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/3.yaml
@@ -0,0 +1,19 @@
+about: title only
+input: >
+ {
+ "title": "The future of scholarly communications",
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - '2f57funqizf4lcxjanls45upom'
+ - '3p2hngx6kfa33bdaobipimdzhe'
+ - '75dzcdywlbb3logmrrpkabanfa'
+ - 'ccoocm7uzjgwnlpfk5fbwfudjm'
+ - 'nfydgfziuvhete6p3lrn4u325u'
+ - 'ntpiporu75bendibjku4kjmd5q'
+ - 'op6a5fclonhrxm3zlo6ub2tlw4'
+ - 'opoxzl3zzbccdh5tptm5p2krem'
+ - 'umzryrtocbakberuubjm2pgxum'
+ - 'zb4bjnwqsveyzcwebvvmnsoq7u'
+
diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml
new file mode 100644
index 0000000..9419406
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/4.yaml
@@ -0,0 +1,16 @@
+about: title, year
+input: >
+ {
+ "title": "The future of scholarly communications",
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - '66r4s55dpvht5jghwkhupai2km'
+ - 'ccoocm7uzjgwnlpfk5fbwfudjm'
+ - 'du4awowpsbbcjlo2pe6dvmxewu'
+ - 'nfydgfziuvhete6p3lrn4u325u'
+ - 'ntpiporu75bendibjku4kjmd5q'
+ - 'op6a5fclonhrxm3zlo6ub2tlw4'
+ - 'xsuxmk5dyba6rnkeslipxxdlzi'
diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml
new file mode 100644
index 0000000..1eb435b
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/5.yaml
@@ -0,0 +1,16 @@
+about: contrib, year
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Lissandrini"
+ }
+ ],
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 'xfhjsixnlvbibigrilisqqvfk4'
+ - 'zfhfpo2shrdexpgd2as4fz7wnm'
+ - 'cyct2bqs5feqbowg6ovv53pdfq'
diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml
new file mode 100644
index 0000000..ae52b23
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/6.yaml
@@ -0,0 +1,24 @@
+about: contrib, year
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Goodwin"
+ }
+ ],
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - 2bbtr4cltbgannqc6vqijvvzdq
+ - 34i2hba6tzf3xomobhumfkkvga
+ - 62sz5fhhuvenpfctf6wejl5m2i
+ - chnqmdm4yfd4zk6kawujvsbhwy
+ - chs7be23vfdthk3xre54w534zm
+ - f5lp3nipazhyxoa2xarlomkofm
+ - hikujb5wmvasnoat2myt56l63y
+ - qbom7rwqtzfypa5hltgbx4e2iq
+ - qh44drz3bvg2ndzwzc55xops7y
+ - r4n57quetbf7tddwodjauegmzq
+
diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml
new file mode 100644
index 0000000..2330f0d
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/7.yaml
@@ -0,0 +1,10 @@
+about: just a subtitle
+input: >
+ {
+ "subtitle": "topographies parisiennes",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml
new file mode 100644
index 0000000..b43e53a
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/8.yaml
@@ -0,0 +1,139 @@
+about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu
+input: >
+ {
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Annelies Schulte Nordholt",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Uopen Journals",
+ "pages": "66",
+ "ext_ids": {
+ "doi": "10.18352/bmgn-lchr.128"
+ },
+ "release_year": 2008,
+ "release_date": "2008-02-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "sm7svbj64vc55gj4p23t7c3lrm",
+ "webcaptures": [],
+ "filesets": [],
+ "files": [
+ {
+ "release_ids": [
+ "yvqtz2zvkzcbpj4jxrp7bvydfu"
+ ],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "publisher"
+ },
+ {
+ "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "webarchive"
+ }
+ ],
+ "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23",
+ "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40",
+ "md5": "7dae3ec6c1d65cae6a91554071cc9625",
+ "size": 889420,
+ "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75",
+ "ident": "oew6z4a6gvfqxc5kiy2r62ucfq",
+ "state": "active"
+ }
+ ],
+ "container": {
+ "wikidata_qid": "Q15763709",
+ "issnp": "1873-5045",
+ "issne": "1873-5045",
+ "issnl": "1873-5045",
+ "publisher": "Uopen Journals",
+ "name": "Relief: Revue Électronique de Littérature Francaise",
+ "extra": {
+ "country": "nl",
+ "default_license": "CC-BY",
+ "doaj": {
+ "as_of": "2021-11-20",
+ "default_license": "CC-BY",
+ "seal": false
+ },
+ "kbart": {
+ "clockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2016
+ ]
+ ]
+ },
+ "lockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2019
+ ]
+ ]
+ },
+ "pkp_pln": {
+ "year_spans": [
+ [
+ 2007,
+ 2021
+ ]
+ ]
+ },
+ "portico": {
+ "year_spans": [
+ [
+ 2007,
+ 2017
+ ]
+ ]
+ }
+ },
+ "languages": [
+ "en"
+ ],
+ "publisher_type": "unipress",
+ "road": {
+ "as_of": "2018-01-24"
+ },
+ "sherpa_romeo": {
+ "color": "blue"
+ },
+ "szczepanski": {
+ "as_of": "2018"
+ },
+ "urls": [
+ "https://www.revue-relief.org/",
+ "http://www.revue-relief.org/index.php/relief",
+ "http://www.revue-relief.org/index.php/relief/about"
+ ]
+ },
+ "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e",
+ "ident": "sm7svbj64vc55gj4p23t7c3lrm",
+ "state": "active"
+ },
+ "work_id": "qcpd2i2txfdi5emqb7fxsawk6e",
+ "title": "Georges Perec: topographies parisiennes du flâneur",
+ "state": "active",
+ "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu",
+ "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff",
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ }
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml
new file mode 100644
index 0000000..b43e53a
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/9.yaml
@@ -0,0 +1,139 @@
+about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu
+input: >
+ {
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Annelies Schulte Nordholt",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Uopen Journals",
+ "pages": "66",
+ "ext_ids": {
+ "doi": "10.18352/bmgn-lchr.128"
+ },
+ "release_year": 2008,
+ "release_date": "2008-02-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "sm7svbj64vc55gj4p23t7c3lrm",
+ "webcaptures": [],
+ "filesets": [],
+ "files": [
+ {
+ "release_ids": [
+ "yvqtz2zvkzcbpj4jxrp7bvydfu"
+ ],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "publisher"
+ },
+ {
+ "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "webarchive"
+ }
+ ],
+ "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23",
+ "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40",
+ "md5": "7dae3ec6c1d65cae6a91554071cc9625",
+ "size": 889420,
+ "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75",
+ "ident": "oew6z4a6gvfqxc5kiy2r62ucfq",
+ "state": "active"
+ }
+ ],
+ "container": {
+ "wikidata_qid": "Q15763709",
+ "issnp": "1873-5045",
+ "issne": "1873-5045",
+ "issnl": "1873-5045",
+ "publisher": "Uopen Journals",
+ "name": "Relief: Revue Électronique de Littérature Francaise",
+ "extra": {
+ "country": "nl",
+ "default_license": "CC-BY",
+ "doaj": {
+ "as_of": "2021-11-20",
+ "default_license": "CC-BY",
+ "seal": false
+ },
+ "kbart": {
+ "clockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2016
+ ]
+ ]
+ },
+ "lockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2019
+ ]
+ ]
+ },
+ "pkp_pln": {
+ "year_spans": [
+ [
+ 2007,
+ 2021
+ ]
+ ]
+ },
+ "portico": {
+ "year_spans": [
+ [
+ 2007,
+ 2017
+ ]
+ ]
+ }
+ },
+ "languages": [
+ "en"
+ ],
+ "publisher_type": "unipress",
+ "road": {
+ "as_of": "2018-01-24"
+ },
+ "sherpa_romeo": {
+ "color": "blue"
+ },
+ "szczepanski": {
+ "as_of": "2018"
+ },
+ "urls": [
+ "https://www.revue-relief.org/",
+ "http://www.revue-relief.org/index.php/relief",
+ "http://www.revue-relief.org/index.php/relief/about"
+ ]
+ },
+ "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e",
+ "ident": "sm7svbj64vc55gj4p23t7c3lrm",
+ "state": "active"
+ },
+ "work_id": "qcpd2i2txfdi5emqb7fxsawk6e",
+ "title": "Georges Perec: topographies parisiennes du flâneur",
+ "state": "active",
+ "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu",
+ "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff",
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ }
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/test_matching.py b/tests/test_matching.py
index a7754ee..b9d7fae 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -48,90 +48,16 @@ def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-def test_matcher_match_release(es_client, caplog):
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- matcher = FuzzyReleaseMatcher(es=es_client, size=5)
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match(entity)
- logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
- assert len(result) == count
-
- # Partial data.
- cases = (
- ({
- "title": "digital libraries",
- "ext_ids": {}
- }, 5),
- ({
- "title": "unlikelytitle",
- "ext_ids": {}
- }, 0),
- ({
- "title": "Imminent dystopia",
- "ext_ids": {}
- }, 5),
- ({
- "title": "",
- "contribs": [{
- "raw_name": "Aristoteles"
- }],
- "ext_ids": {}
- }, 5),
- # ({
- # "title": "Letter",
- # "contribs": [{"raw_name": "Claudel"}],
- # "ext_ids": {}
- # }, 1),
- # ({
- # "title": "The Future of Digital Scholarship",
- # "contribs": [{
- # "raw_name": "Costantino Thanos"
- # }],
- # "ext_ids": {}
- # }, 5),
- )
- for i, (doc, count) in enumerate(cases):
- entity = entity_from_dict(doc, ReleaseEntity)
- result = matcher.match(entity)
- with caplog.at_level(logging.INFO):
- logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
- assert len(result) == count, doc
-
-
-def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match_release_by_id(entity)
- assert len(result) == count
-
-
-def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
- for i, c in enumerate(cases):
- entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_exact_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
-
-
-def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+def test_simple_fuzzy_release_matcher(es_client, caplog):
+ """
+ Use a single test function to test the higher level match function. We want
+ the result to be sensible, but should also document broken examples here.
+ """
matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+ Case = collections.namedtuple("Case", "about input release_year_padding expected")
+ cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml")
for i, c in enumerate(cases):
+ matcher.release_year_padding = c.release_year_padding
entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_partial_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+ result = matcher.match(entity)
+ assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)