From de9f1155ea57c812171abd5517ab39f4fe135cb3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 17 Dec 2021 10:07:15 +0100 Subject: apply first round of feedback on matching --- fuzzycat/matching.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++------ fuzzycat/utils.py | 7 +++++++ 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'fuzzycat') diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b01ce64..38899f9 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -45,7 +45,7 @@ class FuzzyReleaseMatcher: ...) this is and will be too slow. Anecdata: An early 2020 test run matching 23M "title strings" took - literally a couple of weeks to complete. + literally weeks to complete. This class is currently tested against the live fatcat search instance. A usage example: @@ -63,7 +63,8 @@ class FuzzyReleaseMatcher: index="fatcat_release", size=10, min_token_length=3, - release_year_padding=1): + release_year_padding=1, + skip_id_matching=False): if isinstance(es, str): self.es = elasticsearch.Elasticsearch([es]) else: @@ -74,6 +75,7 @@ class FuzzyReleaseMatcher: self.logger = logging.getLogger("fuzzy") self.min_token_length = min_token_length self.release_year_padding = 1 + self.skip_id_matching = skip_id_matching def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ @@ -114,7 +116,10 @@ class FuzzyReleaseMatcher: """ Match in the presence of defined title and contrib fields. """ - contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + if len(tok) > self.min_token_length + ] contrib_queries = [{ "match": { "contrib_names": { @@ -124,6 +129,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -137,6 +147,9 @@ class FuzzyReleaseMatcher: ] + contrib_queries, }, } + # TODO: could boost on various things, like "overall metadata quality" + # (eg, does indexed record have title+year+release_type+container), or + # on publication stage (assuming things getting cited are 'published') if release.release_year is not None: query["bool"]["must"].append({ "range": { @@ -149,6 +162,7 @@ class FuzzyReleaseMatcher: }) result = [] self.logger.info(query) + # TODO: can we use the container name resp = self.es.search(index=self.index, body={ "query": query, @@ -171,6 +185,11 @@ class FuzzyReleaseMatcher: """ query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -225,6 +244,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": contrib_queries, }, } @@ -270,6 +294,11 @@ class FuzzyReleaseMatcher: ] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": token_queries, }, } @@ -307,10 +336,11 @@ class FuzzyReleaseMatcher: document. """ if not release: - return [] - if release.ext_ids and len(release.ext_ids.to_dict()) > 0: + result = [] + elif not self.skip_id_matching and release.ext_ids and any( + release.ext_ids.to_dict().values()): result = self._match_id(release) - if release.title is not None and release.contribs is not None: + elif release.title is not None and release.contribs is not None: result = self._match_title_contrib(release) elif release.title is not None: result = self._match_title(release) @@ -332,6 +362,22 @@ def public_api(host_uri): conf.host = host_uri return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) +def release_contrib_tokens(release : ReleaseEntity) -> List[str]: + """ + Return contribs as a list of tokens. + """ + # TODO! fix this + tokens = [] + for c in release.contribs: + if c.surname is not None: + tokens += c.surname.split() + elif c.raw_name is not None: + tokens += c.surname.split() + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + ] + return contrib_tokens + def release_tokens(release: ReleaseEntity) -> List[str]: """ diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 24e103a..dadfa5c 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -30,6 +30,13 @@ def es_compat_hits_total(resp): """ try: return resp["hits"]["total"]["value"] # ES7 + except KeyError: + # with track_total_hits set to False, we observed missing "total" keys, + # es returns: {'_shards': {'failed': 0, 'skipped': 0, 'successful': 6, + # 'total': 6}, 'hits': {'hits': [{'_id': 'yvqtz2zvkzcbpj4jxrp7b...ons': + # [], 'any_abstract': False, 'ark_id': None, ...}, ...}], + # 'max_score': 108.32384}, 'timed_out': False, 'took': 921} + return len(resp["hits"]["hits"]) except TypeError: return resp["hits"]["total"] # ES6 -- cgit v1.2.3