diff options
| -rw-r--r-- | TODO.md | 5 | ||||
| -rw-r--r-- | fuzzycat/matching.py | 479 | ||||
| -rw-r--r-- | notes/es_fuzzy_queries/README.md | 1 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/0.yaml | 16 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/1.yaml | 24 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/2.yaml | 24 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/3.yaml | 19 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/4.yaml | 16 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/5.yaml | 16 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/6.yaml | 24 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/7.yaml | 10 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/8.yaml | 139 | ||||
| -rw-r--r-- | tests/files/simple_fuzzy_release_matcher/9.yaml | 139 | ||||
| -rw-r--r-- | tests/test_matching.py | 94 | 
14 files changed, 644 insertions, 362 deletions
@@ -1,5 +1,10 @@  # TODO +* [ ] match release with fewer requests (or do them in parallel) +* [ ] de-clobber verify + +---- +  * [ ] clustering should be broken up, e.g. into "map" and "sort"  * [x] match release should be a class  * [x] match release fuzzy should work not just with title diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 2984d9a..cb6acbb 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,7 +1,9 @@ +import collections  import logging  import os  import re  import sys +from multiprocessing.dummy import Pool  from typing import Any, List, Optional, Type, Union  import elasticsearch @@ -22,37 +24,24 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")  class FuzzyReleaseMatcher:      """ -    FuzzyReleaseMatcher tries to find similar items to a given release in -    elasticsearch. Exact matches first, then fuzzy. +    This is a helper class to fetch related documents to a given release +    document from fatcat search (currently elasticsearch). Elasticsearch should +    rank similar documents high itself, so all we try to do here is to tweak +    the specific query a bit, depending on the completeness of the input +    document, e.g. if the input has contrib and title, then use both, if it +    only has a title, then use just that, etc. -    In the best case, elasticsearch would automatically rank the most relevant -    docs first, even with partial data. We still try to steer the matches by -    using a query cascade. This is configurable. The last query should be a -    generic. - -    The goal here is to get a set of potential matches; verification has to. -    happen separately. 
- -    TODO: - -    Example case not yet working well ("Stuehrenberg" vs "Stührenberg"): - -    >>> result = matcher.match(entity_from_dict({"title": "internet archive", -                                                 "contribs": [{"raw_name": -                                                               "Stührenberg"}], -                                                 "ext_ids": {}}, -                                                ReleaseEntity)) - -    > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy, -    https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1 -    (not returning anything via frontend either) - -    Make sure we can switch from function to class: - -    * [ ] 5 test cases for both +    We try to get the result in a single query. +    TODO/Tweaks: e.g. if the document has a "release_year", add this as a "should" clause.      """ -    def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10): +    def __init__(self, +                 es="https://search.fatcat.wiki", +                 api=None, +                 index="fatcat_release", +                 size=10, +                 min_token_length=3, +                 release_year_padding=1):          if isinstance(es, str):              self.es = elasticsearch.Elasticsearch([es])          else: @@ -61,8 +50,10 @@ class FuzzyReleaseMatcher:          self.index = index          self.size = size          self.logger = logging.getLogger("fuzzy") +        self.min_token_length = min_token_length +        self.release_year_padding = 1 -    def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]: +    def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """          Check for exact matches by identifier.          
""" @@ -97,229 +88,10 @@ class FuzzyReleaseMatcher:                  return [r]          return [] -    def match_release_exact_title_exact_contrib(self, release): -        """ -        Match exact title and exact contrib names. Case insensitive, order of -        contribs does not matter. -        """ -        if release.title is None or release.contribs is None: -            return [] -        contrib_queries = [{ -            "match": { -                "contrib_names": { -                    "query": contrib.raw_name, -                    "operator": "AND", -                } -            } -        } for contrib in release.contribs] -        query = { -            "bool": { -                "must": [{ -                    "match": { -                        "title": { -                            "query": release.title, -                            "operator": "AND", -                        }, -                    } -                }] + contrib_queries, -            }, -        } -        result = [] - -        resp = self.es.search(index=self.index, -                              body={ -                                  "query": query, -                                  "size": self.size, -                                  "track_total_hits": True -                              }) -        hits_total = es_compat_hits_total(resp) -        if hits_total == 0: -            return result -        if hits_total > self.size: -            self.logger.warn('more than {} hits: {}'.format(self.size, hits_total)) - -        entities = response_to_entity_list(resp, -                                           entity_type=ReleaseEntity, -                                           size=self.size, -                                           api=self.api) - -        # Require overlap of contrib. 
-        matcher = ContribListMatcher( -            cmp=JaccardIndexThreshold(1.0), -            pipeline=Pipeline([ -                lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), -            ]), -        ) - -        for re in entities: -            if re.title.strip().lower() != release.title.strip().lower(): -                continue -            if not matcher.compare(re.contribs, release.contribs): -                continue -            result.append(re) -        return result - -    def match_release_exact_title_partial_contrib(self, release): -        """ -        Allow for exact authors, but ok, if some are missing. -        """ -        if release.title is None or release.contribs is None: -            return [] -        contrib_queries = [{ -            "match": { -                "contrib_names": { -                    "query": contrib.raw_name, -                    "operator": "AND", -                } -            } -        } for contrib in release.contribs] -        query = { -            "bool": { -                "must": [{ -                    "match": { -                        "title": { -                            "query": release.title, -                            "operator": "AND", -                        }, -                    } -                }] + contrib_queries, -            }, -        } -        result = [] -        resp = self.es.search(index=self.index, -                              body={ -                                  "query": query, -                                  "size": self.size, -                                  "track_total_hits": True -                              }) -        if es_compat_hits_total(resp) == 0: -            return result -        if es_compat_hits_total(resp) > self.size: -            raise NotImplementedError('result set too large: {}'.format(es)) -        entities = response_to_entity_list(resp, -                                           entity_type=ReleaseEntity, - 
                                          size=self.size, -                                           api=self.api) - -        # Require at least half the contribs to be shared. -        matcher = ContribListMatcher( -            cmp=JaccardIndexThreshold(0.5), -            pipeline=Pipeline([ -                lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), -            ]), -        ) - -        for re in entities: -            if re.title.strip().lower() != release.title.strip().lower(): -                continue -            if not matcher.compare(re.contribs, release.contribs): -                continue -            result.append(re) -        return result - -    def match_release_exact_title_fuzzy_contrib(self, release): -        """ -        Exact title but ok it authors differ (slightly). -        """ -        if release.title is None or release.contribs is None: -            return [] -        contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] -        contrib_queries = [{ -            "match": { -                "contrib_names": { -                    "query": token, -                } -            } -        } for token in contrib_tokens] -        query = { -            "bool": { -                "must": [{ -                    "match": { -                        "title": { -                            "query": release.title, -                            "operator": "AND", -                        }, -                    } -                }] + contrib_queries, -            }, -        } -        result = [] -        resp = self.es.search(index=self.index, -                              body={ -                                  "query": query, -                                  "size": self.size, -                                  "track_total_hits": True -                              }) -        if es_compat_hits_total(resp) == 0: -            return result -        if es_compat_hits_total(resp) > 
self.size: -            raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es)) -        entities = response_to_entity_list(resp, -                                           entity_type=ReleaseEntity, -                                           size=self.size, -                                           api=self.api) - -        matcher = ContribListMatcher( -            cmp=FuzzyStringSimilarity(min_ratio=60), -            pipeline=Pipeline([ -                lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), -            ]), -        ) - -        for re in entities: -            if re.title.strip().lower() != release.title.strip().lower(): -                continue -            if not matcher.compare(re.contribs, release.contribs): -                continue -            result.append(re) -        return result - -    def match_release_exact_title(self, release): +    def _match_title_contrib(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """ -        Exact title, but any author. For common titles, this will yield 100s or -        1000s or results. +        Match in the presence of defined title and contrib fields.          
""" -        if release.title is None: -            return [] -        query = { -            "bool": { -                "must": [{ -                    "match": { -                        "title": { -                            "query": release.title, -                            "operator": "AND", -                        }, -                    } -                }], -            }, -        } -        result = [] -        resp = self.es.search(body={ -            "query": query, -            "size": self.size, -            "track_total_hits": True -        }, -                              index=self.index) -        if es_compat_hits_total(resp) == 0: -            return result -        if es_compat_hits_total(resp) > self.size: -            self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) -        entities = response_to_entity_list(resp, -                                           entity_type=ReleaseEntity, -                                           size=self.size, -                                           api=self.api) -        for re in entities: -            if re.title.strip().lower() != release.title.strip().lower(): -                continue -            result.append(re) -        return result - -    def match_release_fuzzy_title_fuzzy_contrib(self, release): -        """ -        Using elasticsearch fuzziness option (which is not that fuzzy). 
-        """ -        if release.title is None or release.contribs is None: -            return []          contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]          contrib_queries = [{              "match": { @@ -343,7 +115,18 @@ class FuzzyReleaseMatcher:                  ] + contrib_queries,              },          } +        if release.release_year is not None: +            query["bool"]["must"].append({ +                "range": { +                    "year": { +                        "gte": release.release_year - self.release_year_padding, +                        "lte": release.release_year + self.release_year_padding, +                        "boost": 0.5, +                    } +                } +            })          result = [] +        self.logger.info(query)          resp = self.es.search(index=self.index,                                body={                                    "query": query, @@ -353,19 +136,17 @@ class FuzzyReleaseMatcher:          if es_compat_hits_total(resp) == 0:              return result          if es_compat_hits_total(resp) > self.size: -            raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp))) +            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))          entities = response_to_entity_list(resp,                                             entity_type=ReleaseEntity,                                             size=self.size,                                             api=self.api)          return entities -    def match_release_generic(self, release): +    def _match_title(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """ -        Final catch all variant via title. +        Match in the presence of a title.          
""" -        if release.title is None: -            return []          query = {              "bool": {                  "must": [ @@ -373,7 +154,7 @@ class FuzzyReleaseMatcher:                          "match": {                              "title": {                                  "query": release.title, -                                "operator": "OR", +                                "operator": "AND",                                  "fuzziness": "AUTO",                              },                          } @@ -381,6 +162,16 @@ class FuzzyReleaseMatcher:                  ],              },          } +        if release.release_year is not None: +            query["bool"]["must"].append({ +                "range": { +                    "year": { +                        "gte": release.release_year - self.release_year_padding, +                        "lte": release.release_year + self.release_year_padding, +                        "boost": 0.5, +                    } +                } +            })          result = []          resp = self.es.search(index=self.index,                                body={ @@ -391,19 +182,17 @@ class FuzzyReleaseMatcher:          if es_compat_hits_total(resp) == 0:              return result          if es_compat_hits_total(resp) > self.size: -            self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) +            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))          entities = response_to_entity_list(resp,                                             entity_type=ReleaseEntity,                                             size=self.size,                                             api=self.api)          return entities -    def match_release_generic_fuzzy_contrib(self, release): +    def _match_contribs(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """ -        Only match contribs, if they exist. 
+        Match in the presence of contribs (and no title).          """ -        if release.contribs is None: -            return []          contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]          contrib_queries = [{              "match": { @@ -417,6 +206,16 @@ class FuzzyReleaseMatcher:                  "must": contrib_queries,              },          } +        if release.release_year is not None: +            query["bool"]["must"].append({ +                "range": { +                    "year": { +                        "gte": release.release_year - self.release_year_padding, +                        "lte": release.release_year + self.release_year_padding, +                        "boost": 0.5, +                    } +                } +            })          result = []          resp = self.es.search(index=self.index,                                body={ @@ -427,37 +226,78 @@ class FuzzyReleaseMatcher:          if es_compat_hits_total(resp) == 0:              return result          if es_compat_hits_total(resp) > self.size: -            self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) +            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp)))          entities = response_to_entity_list(resp,                                             entity_type=ReleaseEntity,                                             size=self.size,                                             api=self.api)          return entities -    def match_cascade(self, release, *qs, **kwargs): +    def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """ -        Returns the result from the first query that returns a result. All query -        functions need to be defined on this class (for now). +        Throw tokens at elasticsearch.          
""" -        for q in qs: -            self.logger.debug("[cascade] {}".format(q)) -            result = q(release, **kwargs) -            if len(result) > 0: -                return result -        return [] +        token_queries = [ +            { +                "match": { +                    "biblio": {  # https://git.io/JMXvJ +                        "query": token, +                    } +                } +            } for token in release_tokens(release) if len(token) > self.min_token_length +        ] +        query = { +            "bool": { +                "must": token_queries, +            }, +        } +        if release.release_year is not None: +            query["bool"]["must"].append({ +                "range": { +                    "year": { +                        "gte": release.release_year - self.release_year_padding, +                        "lte": release.release_year + self.release_year_padding, +                        "boost": 0.5, +                    } +                } +            }) +        result = [] +        self.logger.info(query) +        resp = self.es.search(index=self.index, +                              body={ +                                  "query": query, +                                  "size": self.size, +                                  "track_total_hits": True +                              }) +        if es_compat_hits_total(resp) == 0: +            return result +        if es_compat_hits_total(resp) > self.size: +            self.logger.warning('too many hits: {}'.format(es_compat_hits_total(resp))) +        entities = response_to_entity_list(resp, +                                           entity_type=ReleaseEntity, +                                           size=self.size, +                                           api=self.api) +        return entities      def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:          """ -        Match returns a list of match candidates 
given a release entity. +        Match dispatches methods based on which fields are defined on the +        document.          """          if not release:              return [] -        return self.match_cascade( -            release, self.match_release_by_id, self.match_release_exact_title_exact_contrib, -            self.match_release_exact_title_partial_contrib, -            self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title, -            self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic, -            self.match_release_generic_fuzzy_contrib) +        if release.ext_ids and len(release.ext_ids.to_dict()) > 0: +            result = self._match_id(release) +        if release.title is not None and release.contribs is not None: +            result = self._match_title_contrib(release) +        elif release.title is not None: +            result = self._match_title(release) +        elif release.contribs is not None: +            result = self._match_contribs(release) +        else: +            result = self._match_generic(release) + +        return result  def public_api(host_uri): @@ -471,14 +311,97 @@ def public_api(host_uri):      return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) +def release_tokens(release: ReleaseEntity) -> List[str]: +    """ +    Turn a release into a set of tokens. 
+    """ +    tokens = [] +    red = release.to_dict() +    for k, v in red.items(): +        if v is None or k == "ext_ids": +            continue +        v = str(v) +        for tok in v.split(): +            tokens.append(tok) +    for _, v in red.get("ext_ids", {}).items(): +        if v is None or not isinstance(v, str): +            continue +        for tok in v.split(): +            tokens.append(tok) + +    return tokens + + +def test_release_tokens(): +    Case = collections.namedtuple("Case", "re tokens") +    cases = ( +        Case(entity_from_dict({"ext_ids": {}}, ReleaseEntity), []), +        Case(entity_from_dict({ +            "ext_ids": {}, +            "title": "Flow my tears" +        }, ReleaseEntity), ["Flow", "my", "tears"]), +        Case( +            entity_from_dict( +                { +                    "ext_ids": {}, +                    "subtitle": "An illustrated guide", +                    "release_year": 1981, +                }, ReleaseEntity), ["An", "illustrated", "guide", "1981"]), +    ) +    for c in cases: +        tokens = release_tokens(c.re) +        assert tokens == c.tokens + + +def fetch_release(ident, api=None): +    """ +    Return release entity or None. +    """ +    if api is None: +        api = public_api(FATCAT_API_URL) +    try: +        re = api.get_release(ident, hide="refs,abstracts", expand="container,contribs,files") +    except ApiException as exc: +        if exc.status == 404: +            print("[err] failed to retrieve release entity: {}".format(id), file=sys.stderr) +        else: +            print("[err] api failed with {}: {}".format(exc.status, exc.message), file=sys.stderr) +    else: +        return re + +  def retrieve_entity_list(      ids: List[str],      api: DefaultApi = None,      entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,  ) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:      """ +    Parallel requests. 
+    """ +    if api is None: +        api = public_api(FATCAT_API_URL) + +    result = [] +    if entity_type == ReleaseEntity: +        with Pool(10) as p: +            result = p.map(fetch_release, ids) +        return [v for v in result if v is not None] +    else: +        raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type)) + +    return result + + +def retrieve_entity_list_sequential( +    ids: List[str], +    api: DefaultApi = None, +    entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity, +) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]: +    """      Retrieve a list of entities. Some entities might be missing. Return all      that are accessible. + +    TODO: parallelize API access.      """      if api is None:          api = public_api(FATCAT_API_URL) diff --git a/notes/es_fuzzy_queries/README.md b/notes/es_fuzzy_queries/README.md new file mode 100644 index 0000000..f69d5ea --- /dev/null +++ b/notes/es_fuzzy_queries/README.md @@ -0,0 +1 @@ +# ES query examples diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml new file mode 100644 index 0000000..71fc992 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/0.yaml @@ -0,0 +1,16 @@ +about: title and contrib +input: > +  { +    "contribs": [ +      { +        "raw_name": "Michael Adams" +      } +    ], +    "title": "digital libraries", +    "ext_ids": {} +  } +release_year_padding: 1 +expected: +  - 7rmvqtrb2jdyhcxxodihzzcugy +  - upm5nljirrbsfenoyxsisciltq +  - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/1.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > +  { +    "contribs": [ +      { +        "raw_name": "Adams" +      } +    ], +    "title": 
"digital libraries", +    "ext_ids": {} +  } +release_year_padding: 1 +expected: +  - 7rmvqtrb2jdyhcxxodihzzcugy +  - a2u6ougtsjcbvczou6sazsulcm +  - dy45vilej5diros6zmax46nm4e +  - exuwhhayird4fdjmmsiqpponlq +  - gqrj7jikezgcfpjfazhpf4e7c4 +  - mkmqt3453relbpuyktnmsg6hjq +  - t2g5sl3dgzchtnq7dynxyzje44 +  - t4tvenhrvzamraxrvvxivxmvga +  - wd3oeoi3bffknfbg2ymleqc4ja +  - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/2.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > +  { +    "contribs": [ +      { +        "raw_name": "Adams" +      } +    ], +    "title": "digital libraries", +    "ext_ids": {} +  } +release_year_padding: 1 +expected: +  - 7rmvqtrb2jdyhcxxodihzzcugy +  - a2u6ougtsjcbvczou6sazsulcm +  - dy45vilej5diros6zmax46nm4e +  - exuwhhayird4fdjmmsiqpponlq +  - gqrj7jikezgcfpjfazhpf4e7c4 +  - mkmqt3453relbpuyktnmsg6hjq +  - t2g5sl3dgzchtnq7dynxyzje44 +  - t4tvenhrvzamraxrvvxivxmvga +  - wd3oeoi3bffknfbg2ymleqc4ja +  - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml new file mode 100644 index 0000000..1ab761b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/3.yaml @@ -0,0 +1,19 @@ +about: title only +input: > +  { +    "title": "The future of scholarly communications", +    "ext_ids": {} +  } +release_year_padding: 0 +expected: +  - '2f57funqizf4lcxjanls45upom' +  - '3p2hngx6kfa33bdaobipimdzhe' +  - '75dzcdywlbb3logmrrpkabanfa' +  - 'ccoocm7uzjgwnlpfk5fbwfudjm' +  - 'nfydgfziuvhete6p3lrn4u325u' +  - 'ntpiporu75bendibjku4kjmd5q' +  - 'op6a5fclonhrxm3zlo6ub2tlw4' +  - 'opoxzl3zzbccdh5tptm5p2krem' +  - 'umzryrtocbakberuubjm2pgxum' +  - 'zb4bjnwqsveyzcwebvvmnsoq7u' + diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml 
b/tests/files/simple_fuzzy_release_matcher/4.yaml new file mode 100644 index 0000000..9419406 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/4.yaml @@ -0,0 +1,16 @@ +about: title, year +input: > +  { +    "title": "The future of scholarly communications", +    "release_year": 2014, +    "ext_ids": {} +  } +release_year_padding: 0 +expected: +  - '66r4s55dpvht5jghwkhupai2km' +  - 'ccoocm7uzjgwnlpfk5fbwfudjm' +  - 'du4awowpsbbcjlo2pe6dvmxewu' +  - 'nfydgfziuvhete6p3lrn4u325u' +  - 'ntpiporu75bendibjku4kjmd5q' +  - 'op6a5fclonhrxm3zlo6ub2tlw4' +  - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml new file mode 100644 index 0000000..1eb435b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/5.yaml @@ -0,0 +1,16 @@ +about: contrib, year +input: > +  { +    "contribs": [ +      { +        "raw_name": "Lissandrini" +      } +    ], +    "release_year": 2014, +    "ext_ids": {} +  } +release_year_padding: 1 +expected: +  - 'xfhjsixnlvbibigrilisqqvfk4' +  - 'zfhfpo2shrdexpgd2as4fz7wnm' +  - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml new file mode 100644 index 0000000..ae52b23 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/6.yaml @@ -0,0 +1,24 @@ +about: contrib, year +input: > +  { +    "contribs": [ +      { +        "raw_name": "Goodwin" +      } +    ], +    "release_year": 2014, +    "ext_ids": {} +  } +release_year_padding: 0 +expected: +  - 2bbtr4cltbgannqc6vqijvvzdq +  - 34i2hba6tzf3xomobhumfkkvga +  - 62sz5fhhuvenpfctf6wejl5m2i +  - chnqmdm4yfd4zk6kawujvsbhwy +  - chs7be23vfdthk3xre54w534zm +  - f5lp3nipazhyxoa2xarlomkofm +  - hikujb5wmvasnoat2myt56l63y +  - qbom7rwqtzfypa5hltgbx4e2iq +  - qh44drz3bvg2ndzwzc55xops7y +  - r4n57quetbf7tddwodjauegmzq + diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml 
b/tests/files/simple_fuzzy_release_matcher/7.yaml new file mode 100644 index 0000000..2330f0d --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/7.yaml @@ -0,0 +1,10 @@ +about: just a subtitle +input: > +  { +    "subtitle": "topographies parisiennes", +    "ext_ids": {} +  } +release_year_padding: 1 +expected: +  - yvqtz2zvkzcbpj4jxrp7bvydfu +  - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/8.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > +  { +    "abstracts": [], +    "refs": [], +    "contribs": [ +      { +        "index": 0, +        "raw_name": "Annelies Schulte Nordholt", +        "role": "author", +        "extra": { +          "seq": "first" +        } +      } +    ], +    "publisher": "Uopen Journals", +    "pages": "66", +    "ext_ids": { +      "doi": "10.18352/bmgn-lchr.128" +    }, +    "release_year": 2008, +    "release_date": "2008-02-19", +    "release_stage": "published", +    "release_type": "article-journal", +    "container_id": "sm7svbj64vc55gj4p23t7c3lrm", +    "webcaptures": [], +    "filesets": [], +    "files": [ +      { +        "release_ids": [ +          "yvqtz2zvkzcbpj4jxrp7bvydfu" +        ], +        "mimetype": "application/pdf", +        "urls": [ +          { +            "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", +            "rel": "publisher" +          }, +          { +            "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", +            "rel": "webarchive" +          } +        ], +        "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", +        "sha1": 
"a9ba7c2038e2a77ac1b1144344443a3835d83c40", +        "md5": "7dae3ec6c1d65cae6a91554071cc9625", +        "size": 889420, +        "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", +        "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", +        "state": "active" +      } +    ], +    "container": { +      "wikidata_qid": "Q15763709", +      "issnp": "1873-5045", +      "issne": "1873-5045", +      "issnl": "1873-5045", +      "publisher": "Uopen Journals", +      "name": "Relief: Revue Électronique de Littérature Francaise", +      "extra": { +        "country": "nl", +        "default_license": "CC-BY", +        "doaj": { +          "as_of": "2021-11-20", +          "default_license": "CC-BY", +          "seal": false +        }, +        "kbart": { +          "clockss": { +            "year_spans": [ +              [ +                2007, +                2016 +              ] +            ] +          }, +          "lockss": { +            "year_spans": [ +              [ +                2007, +                2019 +              ] +            ] +          }, +          "pkp_pln": { +            "year_spans": [ +              [ +                2007, +                2021 +              ] +            ] +          }, +          "portico": { +            "year_spans": [ +              [ +                2007, +                2017 +              ] +            ] +          } +        }, +        "languages": [ +          "en" +        ], +        "publisher_type": "unipress", +        "road": { +          "as_of": "2018-01-24" +        }, +        "sherpa_romeo": { +          "color": "blue" +        }, +        "szczepanski": { +          "as_of": "2018" +        }, +        "urls": [ +          "https://www.revue-relief.org/", +          "http://www.revue-relief.org/index.php/relief", +          "http://www.revue-relief.org/index.php/relief/about" +        ] +      }, +      "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", +      "ident": 
"sm7svbj64vc55gj4p23t7c3lrm", +      "state": "active" +    }, +    "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", +    "title": "Georges Perec: topographies parisiennes du flâneur", +    "state": "active", +    "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", +    "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", +    "extra": { +      "crossref": { +        "type": "journal-article" +      } +    } +  } +release_year_padding: 1 +expected: +  - yvqtz2zvkzcbpj4jxrp7bvydfu +  - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/9.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > +  { +    "abstracts": [], +    "refs": [], +    "contribs": [ +      { +        "index": 0, +        "raw_name": "Annelies Schulte Nordholt", +        "role": "author", +        "extra": { +          "seq": "first" +        } +      } +    ], +    "publisher": "Uopen Journals", +    "pages": "66", +    "ext_ids": { +      "doi": "10.18352/bmgn-lchr.128" +    }, +    "release_year": 2008, +    "release_date": "2008-02-19", +    "release_stage": "published", +    "release_type": "article-journal", +    "container_id": "sm7svbj64vc55gj4p23t7c3lrm", +    "webcaptures": [], +    "filesets": [], +    "files": [ +      { +        "release_ids": [ +          "yvqtz2zvkzcbpj4jxrp7bvydfu" +        ], +        "mimetype": "application/pdf", +        "urls": [ +          { +            "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", +            "rel": "publisher" +          }, +          { +            "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", +            "rel": "webarchive" +          } +        ], +        "sha256": 
"96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", +        "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", +        "md5": "7dae3ec6c1d65cae6a91554071cc9625", +        "size": 889420, +        "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", +        "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", +        "state": "active" +      } +    ], +    "container": { +      "wikidata_qid": "Q15763709", +      "issnp": "1873-5045", +      "issne": "1873-5045", +      "issnl": "1873-5045", +      "publisher": "Uopen Journals", +      "name": "Relief: Revue Électronique de Littérature Francaise", +      "extra": { +        "country": "nl", +        "default_license": "CC-BY", +        "doaj": { +          "as_of": "2021-11-20", +          "default_license": "CC-BY", +          "seal": false +        }, +        "kbart": { +          "clockss": { +            "year_spans": [ +              [ +                2007, +                2016 +              ] +            ] +          }, +          "lockss": { +            "year_spans": [ +              [ +                2007, +                2019 +              ] +            ] +          }, +          "pkp_pln": { +            "year_spans": [ +              [ +                2007, +                2021 +              ] +            ] +          }, +          "portico": { +            "year_spans": [ +              [ +                2007, +                2017 +              ] +            ] +          } +        }, +        "languages": [ +          "en" +        ], +        "publisher_type": "unipress", +        "road": { +          "as_of": "2018-01-24" +        }, +        "sherpa_romeo": { +          "color": "blue" +        }, +        "szczepanski": { +          "as_of": "2018" +        }, +        "urls": [ +          "https://www.revue-relief.org/", +          "http://www.revue-relief.org/index.php/relief", +          "http://www.revue-relief.org/index.php/relief/about" +        ] +      }, +      
"revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", +      "ident": "sm7svbj64vc55gj4p23t7c3lrm", +      "state": "active" +    }, +    "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", +    "title": "Georges Perec: topographies parisiennes du flâneur", +    "state": "active", +    "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", +    "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", +    "extra": { +      "crossref": { +        "type": "journal-article" +      } +    } +  } +release_year_padding: 1 +expected: +  - yvqtz2zvkzcbpj4jxrp7bvydfu +  - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/test_matching.py b/tests/test_matching.py index a7754ee..b9d7fae 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -48,90 +48,16 @@ def es_client():      return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -def test_matcher_match_release(es_client, caplog): -    cases = ( -        ("wtv64ahbdzgwnan7rllwr3nurm", 1), -        ("eqcgtpav3na5jh56o5vjsvb4ei", 1), -    ) -    matcher = FuzzyReleaseMatcher(es=es_client, size=5) -    for i, (ident, count) in enumerate(cases): -        entity = anything_to_entity(ident, ReleaseEntity) -        result = matcher.match(entity) -        logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) -        assert len(result) == count - -    # Partial data. 
-    cases = ( -        ({ -            "title": "digital libraries", -            "ext_ids": {} -        }, 5), -        ({ -            "title": "unlikelytitle", -            "ext_ids": {} -        }, 0), -        ({ -            "title": "Imminent dystopia", -            "ext_ids": {} -        }, 5), -        ({ -            "title": "", -            "contribs": [{ -                "raw_name": "Aristoteles" -            }], -            "ext_ids": {} -        }, 5), -        # ({ -        #     "title": "Letter", -        #     "contribs": [{"raw_name": "Claudel"}], -        #     "ext_ids": {} -        # }, 1), -        # ({ -        #     "title": "The Future of Digital Scholarship", -        #     "contribs": [{ -        #         "raw_name": "Costantino Thanos" -        #     }], -        #     "ext_ids": {} -        # }, 5), -    ) -    for i, (doc, count) in enumerate(cases): -        entity = entity_from_dict(doc, ReleaseEntity) -        result = matcher.match(entity) -        with caplog.at_level(logging.INFO): -            logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), -                                                                      [v.title for v in result])) -        assert len(result) == count, doc - - -def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): -    matcher = FuzzyReleaseMatcher(es=es_client) -    cases = ( -        ("wtv64ahbdzgwnan7rllwr3nurm", 1), -        ("eqcgtpav3na5jh56o5vjsvb4ei", 1), -    ) -    for i, (ident, count) in enumerate(cases): -        entity = anything_to_entity(ident, ReleaseEntity) -        result = matcher.match_release_by_id(entity) -        assert len(result) == count - - -def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): -    matcher = FuzzyReleaseMatcher(es=es_client) -    Case = collections.namedtuple("Case", "title date input expected") -    cases = yaml_to_cases( -        Case, 
"tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") -    for i, c in enumerate(cases): -        entity = entity_from_json(c.input, ReleaseEntity) -        result = matcher.match_release_exact_title_exact_contrib(entity) -        assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) - - -def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): +def test_simple_fuzzy_release_matcher(es_client, caplog): +    """ +    Use a single test function to test the higher level match function. We want +    the result to be sensible, but should also document broken examples here. +    """      matcher = FuzzyReleaseMatcher(es=es_client) -    Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") -    cases = yaml_to_cases( -        Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") +    Case = collections.namedtuple("Case", "about input release_year_padding expected") +    cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml")      for i, c in enumerate(cases): +        matcher.release_year_padding = c.release_year_padding          entity = entity_from_json(c.input, ReleaseEntity) -        result = matcher.match_release_exact_title_partial_contrib(entity) -        assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) +        result = matcher.match(entity) +        assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)  | 
