diff options
-rw-r--r-- | TODO.md | 5 | ||||
-rw-r--r-- | fuzzycat/matching.py | 479 | ||||
-rw-r--r-- | notes/es_fuzzy_queries/README.md | 1 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/0.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/1.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/2.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/3.yaml | 19 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/4.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/5.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/6.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/7.yaml | 10 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/8.yaml | 139 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/9.yaml | 139 | ||||
-rw-r--r-- | tests/test_matching.py | 94 |
14 files changed, 644 insertions, 362 deletions
@@ -1,5 +1,10 @@ # TODO +* [ ] match release with fewer requests (or do them in parallel) +* [ ] de-clobber verify + +---- + * [ ] clustering should be broken up, e.g. into "map" and "sort" * [x] match release should be a class * [x] match release fuzzy should work not just with title diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 2984d9a..cb6acbb 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,7 +1,9 @@ +import collections import logging import os import re import sys +from multiprocessing.dummy import Pool from typing import Any, List, Optional, Type, Union import elasticsearch @@ -22,37 +24,24 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0") class FuzzyReleaseMatcher: """ - FuzzyReleaseMatcher tries to find similar items to a given release in - elasticsearch. Exact matches first, then fuzzy. + This is a helper class to fetch related documents to a given release + document from fatcat search (currently elasticsearch). Elasticsearch should + rank similar documents high itself, so all we try to do here is to tweak + the specific query a bit, depending on the completeness of the input + document, e.g. if the input has contrib and title, then use both, if it + only has a title, then use just that, etc. - In the best case, elasticsearch would automatically rank the most relevant - docs first, even with partial data. We still try to steer the matches by - using a query cascade. This is configurable. The last query should be a - generic. - - The goal here is to get a set of potential matches; verification has to. - happen separately. 
- - TODO: - - Example case not yet working well ("Stuehrenberg" vs "Stührenberg"): - - >>> result = matcher.match(entity_from_dict({"title": "internet archive", - "contribs": [{"raw_name": - "Stührenberg"}], - "ext_ids": {}}, - ReleaseEntity)) - - > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy, - https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1 - (not returning anything via frontend either) - - Make sure we can switch from function to class: - - * [ ] 5 test cases for both + We try to get the result in a single query. + TODO/Tweaks: e.g. if the document has a "release_year", add this as a "should" clause. """ - def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10): + def __init__(self, + es="https://search.fatcat.wiki", + api=None, + index="fatcat_release", + size=10, + min_token_length=3, + release_year_padding=1): if isinstance(es, str): self.es = elasticsearch.Elasticsearch([es]) else: @@ -61,8 +50,10 @@ class FuzzyReleaseMatcher: self.index = index self.size = size self.logger = logging.getLogger("fuzzy") + self.min_token_length = min_token_length + self.release_year_padding = 1 - def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]: + def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ Check for exact matches by identifier. """ @@ -97,229 +88,10 @@ class FuzzyReleaseMatcher: return [r] return [] - def match_release_exact_title_exact_contrib(self, release): - """ - Match exact title and exact contrib names. Case insensitive, order of - contribs does not matter. 
- """ - if release.title is None or release.contribs is None: - return [] - contrib_queries = [{ - "match": { - "contrib_names": { - "query": contrib.raw_name, - "operator": "AND", - } - } - } for contrib in release.contribs] - query = { - "bool": { - "must": [{ - "match": { - "title": { - "query": release.title, - "operator": "AND", - }, - } - }] + contrib_queries, - }, - } - result = [] - - resp = self.es.search(index=self.index, - body={ - "query": query, - "size": self.size, - "track_total_hits": True - }) - hits_total = es_compat_hits_total(resp) - if hits_total == 0: - return result - if hits_total > self.size: - self.logger.warn('more than {} hits: {}'.format(self.size, hits_total)) - - entities = response_to_entity_list(resp, - entity_type=ReleaseEntity, - size=self.size, - api=self.api) - - # Require overlap of contrib. - matcher = ContribListMatcher( - cmp=JaccardIndexThreshold(1.0), - pipeline=Pipeline([ - lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), - ]), - ) - - for re in entities: - if re.title.strip().lower() != release.title.strip().lower(): - continue - if not matcher.compare(re.contribs, release.contribs): - continue - result.append(re) - return result - - def match_release_exact_title_partial_contrib(self, release): - """ - Allow for exact authors, but ok, if some are missing. 
- """ - if release.title is None or release.contribs is None: - return [] - contrib_queries = [{ - "match": { - "contrib_names": { - "query": contrib.raw_name, - "operator": "AND", - } - } - } for contrib in release.contribs] - query = { - "bool": { - "must": [{ - "match": { - "title": { - "query": release.title, - "operator": "AND", - }, - } - }] + contrib_queries, - }, - } - result = [] - resp = self.es.search(index=self.index, - body={ - "query": query, - "size": self.size, - "track_total_hits": True - }) - if es_compat_hits_total(resp) == 0: - return result - if es_compat_hits_total(resp) > self.size: - raise NotImplementedError('result set too large: {}'.format(es)) - entities = response_to_entity_list(resp, - entity_type=ReleaseEntity, - size=self.size, - api=self.api) - - # Require at least half the contribs to be shared. - matcher = ContribListMatcher( - cmp=JaccardIndexThreshold(0.5), - pipeline=Pipeline([ - lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), - ]), - ) - - for re in entities: - if re.title.strip().lower() != release.title.strip().lower(): - continue - if not matcher.compare(re.contribs, release.contribs): - continue - result.append(re) - return result - - def match_release_exact_title_fuzzy_contrib(self, release): - """ - Exact title but ok it authors differ (slightly). 
- """ - if release.title is None or release.contribs is None: - return [] - contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] - contrib_queries = [{ - "match": { - "contrib_names": { - "query": token, - } - } - } for token in contrib_tokens] - query = { - "bool": { - "must": [{ - "match": { - "title": { - "query": release.title, - "operator": "AND", - }, - } - }] + contrib_queries, - }, - } - result = [] - resp = self.es.search(index=self.index, - body={ - "query": query, - "size": self.size, - "track_total_hits": True - }) - if es_compat_hits_total(resp) == 0: - return result - if es_compat_hits_total(resp) > self.size: - raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es)) - entities = response_to_entity_list(resp, - entity_type=ReleaseEntity, - size=self.size, - api=self.api) - - matcher = ContribListMatcher( - cmp=FuzzyStringSimilarity(min_ratio=60), - pipeline=Pipeline([ - lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), - ]), - ) - - for re in entities: - if re.title.strip().lower() != release.title.strip().lower(): - continue - if not matcher.compare(re.contribs, release.contribs): - continue - result.append(re) - return result - - def match_release_exact_title(self, release): + def _match_title_contrib(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ - Exact title, but any author. For common titles, this will yield 100s or - 1000s or results. + Match in the presence of defined title and contrib fields. 
""" - if release.title is None: - return [] - query = { - "bool": { - "must": [{ - "match": { - "title": { - "query": release.title, - "operator": "AND", - }, - } - }], - }, - } - result = [] - resp = self.es.search(body={ - "query": query, - "size": self.size, - "track_total_hits": True - }, - index=self.index) - if es_compat_hits_total(resp) == 0: - return result - if es_compat_hits_total(resp) > self.size: - self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) - entities = response_to_entity_list(resp, - entity_type=ReleaseEntity, - size=self.size, - api=self.api) - for re in entities: - if re.title.strip().lower() != release.title.strip().lower(): - continue - result.append(re) - return result - - def match_release_fuzzy_title_fuzzy_contrib(self, release): - """ - Using elasticsearch fuzziness option (which is not that fuzzy). - """ - if release.title is None or release.contribs is None: - return [] contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] contrib_queries = [{ "match": { @@ -343,7 +115,18 @@ class FuzzyReleaseMatcher: ] + contrib_queries, }, } + if release.release_year is not None: + query["bool"]["must"].append({ + "range": { + "year": { + "gte": release.release_year - self.release_year_padding, + "lte": release.release_year + self.release_year_padding, + "boost": 0.5, + } + } + }) result = [] + self.logger.info(query) resp = self.es.search(index=self.index, body={ "query": query, @@ -353,19 +136,17 @@ class FuzzyReleaseMatcher: if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: - raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp))) + self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp))) entities = response_to_entity_list(resp, entity_type=ReleaseEntity, size=self.size, api=self.api) return entities - def match_release_generic(self, release): + def _match_title(self, release: Optional[ReleaseEntity]) -> 
List[ReleaseEntity]: """ - Final catch all variant via title. + Match in the presence of a title. """ - if release.title is None: - return [] query = { "bool": { "must": [ @@ -373,7 +154,7 @@ class FuzzyReleaseMatcher: "match": { "title": { "query": release.title, - "operator": "OR", + "operator": "AND", "fuzziness": "AUTO", }, } @@ -381,6 +162,16 @@ class FuzzyReleaseMatcher: ], }, } + if release.release_year is not None: + query["bool"]["must"].append({ + "range": { + "year": { + "gte": release.release_year - self.release_year_padding, + "lte": release.release_year + self.release_year_padding, + "boost": 0.5, + } + } + }) result = [] resp = self.es.search(index=self.index, body={ @@ -391,19 +182,17 @@ class FuzzyReleaseMatcher: if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: - self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp))) entities = response_to_entity_list(resp, entity_type=ReleaseEntity, size=self.size, api=self.api) return entities - def match_release_generic_fuzzy_contrib(self, release): + def _match_contribs(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ - Only match contribs, if they exist. + Match in the presence of contribs (and no title). 
""" - if release.contribs is None: - return [] contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] contrib_queries = [{ "match": { @@ -417,6 +206,16 @@ class FuzzyReleaseMatcher: "must": contrib_queries, }, } + if release.release_year is not None: + query["bool"]["must"].append({ + "range": { + "year": { + "gte": release.release_year - self.release_year_padding, + "lte": release.release_year + self.release_year_padding, + "boost": 0.5, + } + } + }) result = [] resp = self.es.search(index=self.index, body={ @@ -427,37 +226,78 @@ class FuzzyReleaseMatcher: if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: - self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp))) entities = response_to_entity_list(resp, entity_type=ReleaseEntity, size=self.size, api=self.api) return entities - def match_cascade(self, release, *qs, **kwargs): + def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ - Returns the result from the first query that returns a result. All query - functions need to be defined on this class (for now). + Throw tokens at elasticsearch. 
""" - for q in qs: - self.logger.debug("[cascade] {}".format(q)) - result = q(release, **kwargs) - if len(result) > 0: - return result - return [] + token_queries = [ + { + "match": { + "biblio": { # https://git.io/JMXvJ + "query": token, + } + } + } for token in release_tokens(release) if len(token) > self.min_token_length + ] + query = { + "bool": { + "must": token_queries, + }, + } + if release.release_year is not None: + query["bool"]["must"].append({ + "range": { + "year": { + "gte": release.release_year - self.release_year_padding, + "lte": release.release_year + self.release_year_padding, + "boost": 0.5, + } + } + }) + result = [] + self.logger.info(query) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ - Match returns a list of match candidates given a release entity. + Match dispatches methods based on which fields are defined on the + document. 
""" if not release: return [] - return self.match_cascade( - release, self.match_release_by_id, self.match_release_exact_title_exact_contrib, - self.match_release_exact_title_partial_contrib, - self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title, - self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic, - self.match_release_generic_fuzzy_contrib) + if release.ext_ids and len(release.ext_ids.to_dict()) > 0: + result = self._match_id(release) + if release.title is not None and release.contribs is not None: + result = self._match_title_contrib(release) + elif release.title is not None: + result = self._match_title(release) + elif release.contribs is not None: + result = self._match_contribs(release) + else: + result = self._match_generic(release) + + return result def public_api(host_uri): @@ -471,14 +311,97 @@ def public_api(host_uri): return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) +def release_tokens(release: ReleaseEntity) -> List[str]: + """ + Turn a release into a set of tokens. 
+ """ + tokens = [] + red = release.to_dict() + for k, v in red.items(): + if v is None or k == "ext_ids": + continue + v = str(v) + for tok in v.split(): + tokens.append(tok) + for _, v in red.get("ext_ids", {}).items(): + if v is None or not isinstance(v, str): + continue + for tok in v.split(): + tokens.append(tok) + + return tokens + + +def test_release_tokens(): + Case = collections.namedtuple("Case", "re tokens") + cases = ( + Case(entity_from_dict({"ext_ids": {}}, ReleaseEntity), []), + Case(entity_from_dict({ + "ext_ids": {}, + "title": "Flow my tears" + }, ReleaseEntity), ["Flow", "my", "tears"]), + Case( + entity_from_dict( + { + "ext_ids": {}, + "subtitle": "An illustrated guide", + "release_year": 1981, + }, ReleaseEntity), ["An", "illustrated", "guide", "1981"]), + ) + for c in cases: + tokens = release_tokens(c.re) + assert tokens == c.tokens + + +def fetch_release(ident, api=None): + """ + Return release entity or None. + """ + if api is None: + api = public_api(FATCAT_API_URL) + try: + re = api.get_release(ident, hide="refs,abstracts", expand="container,contribs,files") + except ApiException as exc: + if exc.status == 404: + print("[err] failed to retrieve release entity: {}".format(id), file=sys.stderr) + else: + print("[err] api failed with {}: {}".format(exc.status, exc.message), file=sys.stderr) + else: + return re + + def retrieve_entity_list( ids: List[str], api: DefaultApi = None, entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity, ) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]: """ + Parallel requests. 
+ """ + if api is None: + api = public_api(FATCAT_API_URL) + + result = [] + if entity_type == ReleaseEntity: + with Pool(10) as p: + result = p.map(fetch_release, ids) + return [v for v in result if v is not None] + else: + raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type)) + + return result + + +def retrieve_entity_list_sequential( + ids: List[str], + api: DefaultApi = None, + entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity, +) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]: + """ Retrieve a list of entities. Some entities might be missing. Return all that are accessible. + + TODO: parallelize API access. """ if api is None: api = public_api(FATCAT_API_URL) diff --git a/notes/es_fuzzy_queries/README.md b/notes/es_fuzzy_queries/README.md new file mode 100644 index 0000000..f69d5ea --- /dev/null +++ b/notes/es_fuzzy_queries/README.md @@ -0,0 +1 @@ +# ES query examples diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml new file mode 100644 index 0000000..71fc992 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/0.yaml @@ -0,0 +1,16 @@ +about: title and contrib +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - upm5nljirrbsfenoyxsisciltq + - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/1.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e 
+ - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/2.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml new file mode 100644 index 0000000..1ab761b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/3.yaml @@ -0,0 +1,19 @@ +about: title only +input: > + { + "title": "The future of scholarly communications", + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '2f57funqizf4lcxjanls45upom' + - '3p2hngx6kfa33bdaobipimdzhe' + - '75dzcdywlbb3logmrrpkabanfa' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'opoxzl3zzbccdh5tptm5p2krem' + - 'umzryrtocbakberuubjm2pgxum' + - 'zb4bjnwqsveyzcwebvvmnsoq7u' + diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml new file mode 100644 index 0000000..9419406 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/4.yaml @@ -0,0 +1,16 @@ +about: title, year +input: > + { + "title": "The future of scholarly 
communications", + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '66r4s55dpvht5jghwkhupai2km' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'du4awowpsbbcjlo2pe6dvmxewu' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml new file mode 100644 index 0000000..1eb435b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/5.yaml @@ -0,0 +1,16 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Lissandrini" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 'xfhjsixnlvbibigrilisqqvfk4' + - 'zfhfpo2shrdexpgd2as4fz7wnm' + - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml new file mode 100644 index 0000000..ae52b23 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/6.yaml @@ -0,0 +1,24 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Goodwin" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - 2bbtr4cltbgannqc6vqijvvzdq + - 34i2hba6tzf3xomobhumfkkvga + - 62sz5fhhuvenpfctf6wejl5m2i + - chnqmdm4yfd4zk6kawujvsbhwy + - chs7be23vfdthk3xre54w534zm + - f5lp3nipazhyxoa2xarlomkofm + - hikujb5wmvasnoat2myt56l63y + - qbom7rwqtzfypa5hltgbx4e2iq + - qh44drz3bvg2ndzwzc55xops7y + - r4n57quetbf7tddwodjauegmzq + diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml new file mode 100644 index 0000000..2330f0d --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/7.yaml @@ -0,0 +1,10 @@ +about: just a subtitle +input: > + { + "subtitle": "topographies parisiennes", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu 
diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/8.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + 
"lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/9.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { 
+ "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" 
+ } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/test_matching.py b/tests/test_matching.py index a7754ee..b9d7fae 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -48,90 +48,16 @@ def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -def test_matcher_match_release(es_client, caplog): - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - matcher = FuzzyReleaseMatcher(es=es_client, size=5) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = matcher.match(entity) - logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) - assert len(result) == count - - # Partial data. - cases = ( - ({ - "title": "digital libraries", - "ext_ids": {} - }, 5), - ({ - "title": "unlikelytitle", - "ext_ids": {} - }, 0), - ({ - "title": "Imminent dystopia", - "ext_ids": {} - }, 5), - ({ - "title": "", - "contribs": [{ - "raw_name": "Aristoteles" - }], - "ext_ids": {} - }, 5), - # ({ - # "title": "Letter", - # "contribs": [{"raw_name": "Claudel"}], - # "ext_ids": {} - # }, 1), - # ({ - # "title": "The Future of Digital Scholarship", - # "contribs": [{ - # "raw_name": "Costantino Thanos" - # }], - # "ext_ids": {} - # }, 5), - ) - for i, (doc, count) in enumerate(cases): - entity = entity_from_dict(doc, ReleaseEntity) - result = matcher.match(entity) - with caplog.at_level(logging.INFO): - logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) - assert len(result) == count, doc - - -def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = 
matcher.match_release_by_id(entity) - assert len(result) == count - - -def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") - for i, c in enumerate(cases): - entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_exact_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) - - -def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): +def test_simple_fuzzy_release_matcher(es_client, caplog): + """ + Use a single test function to test the higher level match function. We want + the result to be sensible, but should also document broken examples here. + """ matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") + Case = collections.namedtuple("Case", "about input release_year_padding expected") + cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml") for i, c in enumerate(cases): + matcher.release_year_padding = c.release_year_padding entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_partial_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) + result = matcher.match(entity) + assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input) |