diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-12-17 10:07:15 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-12-21 20:56:56 +0100 |
commit | de9f1155ea57c812171abd5517ab39f4fe135cb3 (patch) | |
tree | 2b2071642259c46ede5b56d15cbce15187226362 /fuzzycat/matching.py | |
parent | 4720fb51584fae1edc2a79dd94c24b4ddac92acb (diff) | |
download | fuzzycat-master.tar.gz fuzzycat-master.zip |
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r-- | fuzzycat/matching.py | 58 |
1 files changed, 52 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b01ce64..38899f9 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -45,7 +45,7 @@ class FuzzyReleaseMatcher: ...) this is and will be too slow. Anecdata: An early 2020 test run matching 23M "title strings" took - literally a couple of weeks to complete. + literally weeks to complete. This class is currently tested against the live fatcat search instance. A usage example: @@ -63,7 +63,8 @@ class FuzzyReleaseMatcher: index="fatcat_release", size=10, min_token_length=3, - release_year_padding=1): + release_year_padding=1, + skip_id_matching=False): if isinstance(es, str): self.es = elasticsearch.Elasticsearch([es]) else: @@ -74,6 +75,7 @@ class FuzzyReleaseMatcher: self.logger = logging.getLogger("fuzzy") self.min_token_length = min_token_length self.release_year_padding = 1 + self.skip_id_matching = skip_id_matching def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ @@ -114,7 +116,10 @@ class FuzzyReleaseMatcher: """ Match in the presence of defined title and contrib fields. """ - contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + if len(tok) > self.min_token_length + ] contrib_queries = [{ "match": { "contrib_names": { @@ -124,6 +129,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -137,6 +147,9 @@ class FuzzyReleaseMatcher: ] + contrib_queries, }, } + # TODO: could boost on various things, like "overall metadata quality" + # (eg, does indexed record have title+year+release_type+container), or + # on publication stage (assuming things getting cited are 'published') if release.release_year is not None: query["bool"]["must"].append({ "range": { @@ -149,6 +162,7 @@ class FuzzyReleaseMatcher: }) result = [] self.logger.info(query) + # TODO: can we use the container name resp = self.es.search(index=self.index, body={ "query": query, @@ -171,6 +185,11 @@ class FuzzyReleaseMatcher: """ query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": [ { "match": { @@ -225,6 +244,11 @@ class FuzzyReleaseMatcher: } for token in contrib_tokens] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": contrib_queries, }, } @@ -270,6 +294,11 @@ class FuzzyReleaseMatcher: ] query = { "bool": { + "must_not": [{ + "match": { + "release_type": "stub", + }, + }], "must": token_queries, }, } @@ -307,10 +336,11 @@ class FuzzyReleaseMatcher: document. """ if not release: - return [] - if release.ext_ids and len(release.ext_ids.to_dict()) > 0: + result = [] + elif not self.skip_id_matching and release.ext_ids and any( + release.ext_ids.to_dict().values()): result = self._match_id(release) - if release.title is not None and release.contribs is not None: + elif release.title is not None and release.contribs is not None: result = self._match_title_contrib(release) elif release.title is not None: result = self._match_title(release) @@ -332,6 +362,22 @@ def public_api(host_uri): conf.host = host_uri return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) +def release_contrib_tokens(release : ReleaseEntity) -> List[str]: + """ + Return contribs as a list of tokens. + """ + # TODO! fix this + tokens = [] + for c in release.contribs: + if c.surname is not None: + tokens += c.surname.split() + elif c.raw_name is not None: + tokens += c.surname.split() + contrib_tokens = [ + tok for c in release.contribs for tok in c.raw_name.split() + ] + return contrib_tokens + def release_tokens(release: ReleaseEntity) -> List[str]: """ |