From de9f1155ea57c812171abd5517ab39f4fe135cb3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 17 Dec 2021 10:07:15 +0100 Subject: apply first round of feedback on matching --- fuzzycat/matching.py | 58 ++++++++++++++++++++++++++++---- fuzzycat/utils.py | 7 ++++ tests/files/fuzzy_release_matcher/0.yaml | 1 + tests/files/fuzzy_release_matcher/1.yaml | 1 + tests/files/fuzzy_release_matcher/2.yaml | 1 + tests/files/fuzzy_release_matcher/3.yaml | 1 + tests/files/fuzzy_release_matcher/4.yaml | 1 + tests/files/fuzzy_release_matcher/5.yaml | 1 + tests/files/fuzzy_release_matcher/6.yaml | 1 + tests/files/fuzzy_release_matcher/7.yaml | 1 + tests/files/fuzzy_release_matcher/8.yaml | 1 + tests/files/fuzzy_release_matcher/9.yaml | 1 + tests/test_matching.py | 7 ++-- 13 files changed, 73 insertions(+), 9 deletions(-) diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b01ce64..38899f9 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -45,7 +45,7 @@ class FuzzyReleaseMatcher: ...) this is and will be too slow. Anecdata: An early 2020 test run matching 23M "title strings" took - literally a couple of weeks to complete. + literally weeks to complete. This class is currently tested against the live fatcat search instance. A usage example: @@ -63,7 +63,8 @@ class FuzzyReleaseMatcher: index="fatcat_release", size=10, min_token_length=3, - release_year_padding=1): + release_year_padding=1, + skip_id_matching=False): if isinstance(es, str): self.es = elasticsearch.Elasticsearch([es]) else: @@ -74,6 +75,7 @@ class FuzzyReleaseMatcher: self.logger = logging.getLogger("fuzzy") self.min_token_length = min_token_length self.release_year_padding = 1 + self.skip_id_matching = skip_id_matching def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ @@ -114,7 +116,10 @@ class FuzzyReleaseMatcher: """ Match in the presence of defined title and contrib fields. 
""" - contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + if len(tok) > self.min_token_length + ] contrib_queries = [{ "match": { "contrib_names": { @@ -124,6 +129,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -137,6 +147,9 @@ class FuzzyReleaseMatcher: ] + contrib_queries, }, } + # TODO: could boost on various things, like "overall metadata quality" + # (eg, does indexed record have title+year+release_type+container), or + # on publication stage (assuming things getting cited are 'published') if release.release_year is not None: query["bool"]["must"].append({ "range": { @@ -149,6 +162,7 @@ class FuzzyReleaseMatcher: }) result = [] self.logger.info(query) + # TODO: can we use the container name resp = self.es.search(index=self.index, body={ "query": query, @@ -171,6 +185,11 @@ class FuzzyReleaseMatcher: """ query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -225,6 +244,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": contrib_queries, }, } @@ -270,6 +294,11 @@ class FuzzyReleaseMatcher: ] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": token_queries, }, } @@ -307,10 +336,11 @@ class FuzzyReleaseMatcher: document. 
""" if not release: - return [] - if release.ext_ids and len(release.ext_ids.to_dict()) > 0: + result = [] + elif not self.skip_id_matching and release.ext_ids and any( + release.ext_ids.to_dict().values()): result = self._match_id(release) - if release.title is not None and release.contribs is not None: + elif release.title is not None and release.contribs is not None: result = self._match_title_contrib(release) elif release.title is not None: result = self._match_title(release) @@ -332,6 +362,22 @@ def public_api(host_uri): conf.host = host_uri return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) +def release_contrib_tokens(release: ReleaseEntity) -> List[str]: + """ + Return contribs as a list of tokens. + """ + # TODO: fix this; `tokens` (surname preferred) is built but not returned yet + tokens = [] + for c in release.contribs: + if c.surname is not None: + tokens += c.surname.split() + elif c.raw_name is not None: + tokens += c.raw_name.split() + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + ] + return contrib_tokens + def release_tokens(release: ReleaseEntity) -> List[str]: """ diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 24e103a..dadfa5c 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -30,6 +30,13 @@ def es_compat_hits_total(resp): """ try: return resp["hits"]["total"]["value"] # ES7 + except KeyError: + # with track_total_hits set to False, we observed missing "total" keys, + # es returns: {'_shards': {'failed': 0, 'skipped': 0, 'successful': 6, + # 'total': 6}, 'hits': {'hits': [{'_id': 'yvqtz2zvkzcbpj4jxrp7b...ons': + # [], 'any_abstract': False, 'ark_id': None, ...}, ...}], + # 'max_score': 108.32384}, 'timed_out': False, 'took': 921} + return len(resp["hits"]["hits"]) except TypeError: return resp["hits"]["total"] # ES6 diff --git a/tests/files/fuzzy_release_matcher/0.yaml b/tests/files/fuzzy_release_matcher/0.yaml index 71fc992..3c0b915 100644 --- a/tests/files/fuzzy_release_matcher/0.yaml +++
b/tests/files/fuzzy_release_matcher/0.yaml @@ -10,6 +10,7 @@ input: > "ext_ids": {} } release_year_padding: 1 +skip_id_matching: false expected: - 7rmvqtrb2jdyhcxxodihzzcugy - upm5nljirrbsfenoyxsisciltq diff --git a/tests/files/fuzzy_release_matcher/1.yaml b/tests/files/fuzzy_release_matcher/1.yaml index df6a954..115111b 100644 --- a/tests/files/fuzzy_release_matcher/1.yaml +++ b/tests/files/fuzzy_release_matcher/1.yaml @@ -10,6 +10,7 @@ input: > "ext_ids": {} } release_year_padding: 1 +skip_id_matching: false expected: - 7rmvqtrb2jdyhcxxodihzzcugy - a2u6ougtsjcbvczou6sazsulcm diff --git a/tests/files/fuzzy_release_matcher/2.yaml b/tests/files/fuzzy_release_matcher/2.yaml index df6a954..115111b 100644 --- a/tests/files/fuzzy_release_matcher/2.yaml +++ b/tests/files/fuzzy_release_matcher/2.yaml @@ -10,6 +10,7 @@ input: > "ext_ids": {} } release_year_padding: 1 +skip_id_matching: false expected: - 7rmvqtrb2jdyhcxxodihzzcugy - a2u6ougtsjcbvczou6sazsulcm diff --git a/tests/files/fuzzy_release_matcher/3.yaml b/tests/files/fuzzy_release_matcher/3.yaml index 1ab761b..ed56d5a 100644 --- a/tests/files/fuzzy_release_matcher/3.yaml +++ b/tests/files/fuzzy_release_matcher/3.yaml @@ -5,6 +5,7 @@ input: > "ext_ids": {} } release_year_padding: 0 +skip_id_matching: false expected: - '2f57funqizf4lcxjanls45upom' - '3p2hngx6kfa33bdaobipimdzhe' diff --git a/tests/files/fuzzy_release_matcher/4.yaml b/tests/files/fuzzy_release_matcher/4.yaml index 9419406..899772b 100644 --- a/tests/files/fuzzy_release_matcher/4.yaml +++ b/tests/files/fuzzy_release_matcher/4.yaml @@ -6,6 +6,7 @@ input: > "ext_ids": {} } release_year_padding: 0 +skip_id_matching: false expected: - '66r4s55dpvht5jghwkhupai2km' - 'ccoocm7uzjgwnlpfk5fbwfudjm' diff --git a/tests/files/fuzzy_release_matcher/5.yaml b/tests/files/fuzzy_release_matcher/5.yaml index 1eb435b..d8f208a 100644 --- a/tests/files/fuzzy_release_matcher/5.yaml +++ b/tests/files/fuzzy_release_matcher/5.yaml @@ -10,6 +10,7 @@ input: > "ext_ids": {} } 
release_year_padding: 1 +skip_id_matching: false expected: - 'xfhjsixnlvbibigrilisqqvfk4' - 'zfhfpo2shrdexpgd2as4fz7wnm' diff --git a/tests/files/fuzzy_release_matcher/6.yaml b/tests/files/fuzzy_release_matcher/6.yaml index ae52b23..7841b68 100644 --- a/tests/files/fuzzy_release_matcher/6.yaml +++ b/tests/files/fuzzy_release_matcher/6.yaml @@ -10,6 +10,7 @@ input: > "ext_ids": {} } release_year_padding: 0 +skip_id_matching: false expected: - 2bbtr4cltbgannqc6vqijvvzdq - 34i2hba6tzf3xomobhumfkkvga diff --git a/tests/files/fuzzy_release_matcher/7.yaml b/tests/files/fuzzy_release_matcher/7.yaml index 2330f0d..7affb8f 100644 --- a/tests/files/fuzzy_release_matcher/7.yaml +++ b/tests/files/fuzzy_release_matcher/7.yaml @@ -5,6 +5,7 @@ input: > "ext_ids": {} } release_year_padding: 1 +skip_id_matching: false expected: - yvqtz2zvkzcbpj4jxrp7bvydfu - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/fuzzy_release_matcher/8.yaml b/tests/files/fuzzy_release_matcher/8.yaml index b43e53a..271d1a4 100644 --- a/tests/files/fuzzy_release_matcher/8.yaml +++ b/tests/files/fuzzy_release_matcher/8.yaml @@ -134,6 +134,7 @@ input: > } } release_year_padding: 1 +skip_id_matching: true expected: - yvqtz2zvkzcbpj4jxrp7bvydfu - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/fuzzy_release_matcher/9.yaml b/tests/files/fuzzy_release_matcher/9.yaml index b43e53a..271d1a4 100644 --- a/tests/files/fuzzy_release_matcher/9.yaml +++ b/tests/files/fuzzy_release_matcher/9.yaml @@ -134,6 +134,7 @@ input: > } } release_year_padding: 1 +skip_id_matching: true expected: - yvqtz2zvkzcbpj4jxrp7bvydfu - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/test_matching.py b/tests/test_matching.py index eb54751..a8f8f5b 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -29,8 +29,7 @@ logger.setLevel(logging.DEBUG) FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443") -def yaml_to_cases(klass, - 
files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"): +def yaml_to_cases(klass, files="tests/files/fuzzy_release_matcher/*.yaml"): """ Turn yaml files into a collection of named tuple test cases. The glob is relative to the project root (i.e. where you usually run `pytest` from). @@ -54,10 +53,12 @@ def test_simple_fuzzy_release_matcher(es_client, caplog): the result to be sensible, but should also document broken examples here. """ matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "about input release_year_padding expected") + Case = collections.namedtuple( + "Case", ["about", "input", "skip_id_matching", "release_year_padding", "expected"]) cases = yaml_to_cases(Case, "tests/files/fuzzy_release_matcher/*.yaml") for i, c in enumerate(cases): matcher.release_year_padding = c.release_year_padding + matcher.skip_id_matching = c.skip_id_matching entity = entity_from_json(c.input, ReleaseEntity) result = matcher.match(entity) assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input) -- cgit v1.2.3