From de9f1155ea57c812171abd5517ab39f4fe135cb3 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 17 Dec 2021 10:07:15 +0100
Subject: apply first round of feedback on matching

---
 fuzzycat/matching.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++------
 fuzzycat/utils.py    |  7 +++++++
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'fuzzycat')

diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b01ce64..38899f9 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -45,7 +45,7 @@ class FuzzyReleaseMatcher:
     ...) this is and will be too slow.
 
     Anecdata: An early 2020 test run matching 23M "title strings" took
-    literally a couple of weeks to complete.
+    literally weeks to complete.
 
     This class is currently tested against the live fatcat search instance. A
     usage example:
@@ -63,7 +63,8 @@ class FuzzyReleaseMatcher:
                  index="fatcat_release",
                  size=10,
                  min_token_length=3,
-                 release_year_padding=1):
+                 release_year_padding=1,
+                 skip_id_matching=False):
         if isinstance(es, str):
             self.es = elasticsearch.Elasticsearch([es])
         else:
@@ -74,6 +75,7 @@ class FuzzyReleaseMatcher:
         self.logger = logging.getLogger("fuzzy")
         self.min_token_length = min_token_length
         self.release_year_padding = 1
+        self.skip_id_matching = skip_id_matching
 
     def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
         """
@@ -114,7 +116,10 @@ class FuzzyReleaseMatcher:
         """
         Match in the presence of defined title and contrib fields.
         """
-        contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+        contrib_tokens = [
+            tok for c in release.contribs for tok in c.raw_name.split()
+            if len(tok) > self.min_token_length
+        ]
         contrib_queries = [{
             "match": {
                 "contrib_names": {
@@ -124,6 +129,11 @@ class FuzzyReleaseMatcher:
         } for token in contrib_tokens]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": [
                     {
                         "match": {
@@ -137,6 +147,9 @@ class FuzzyReleaseMatcher:
                 ] + contrib_queries,
             },
         }
+        # TODO: could boost on various things, like "overall metadata quality"
+        # (eg, does indexed record have title+year+release_type+container), or
+        # on publication stage (assuming things getting cited are 'published')
         if release.release_year is not None:
             query["bool"]["must"].append({
                 "range": {
@@ -149,6 +162,7 @@ class FuzzyReleaseMatcher:
             })
         result = []
         self.logger.info(query)
+        # TODO: can we use the container name?
         resp = self.es.search(index=self.index,
                               body={
                                   "query": query,
@@ -171,6 +185,11 @@ class FuzzyReleaseMatcher:
         """
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": [
                     {
                         "match": {
@@ -225,6 +244,11 @@ class FuzzyReleaseMatcher:
         } for token in contrib_tokens]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": contrib_queries,
             },
         }
@@ -270,6 +294,11 @@ class FuzzyReleaseMatcher:
         ]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": token_queries,
             },
         }
@@ -307,10 +336,11 @@ class FuzzyReleaseMatcher:
         document.
         """
         if not release:
-            return []
-        if release.ext_ids and len(release.ext_ids.to_dict()) > 0:
+            result = []
+        elif not self.skip_id_matching and release.ext_ids and any(
+                release.ext_ids.to_dict().values()):
             result = self._match_id(release)
-        if release.title is not None and release.contribs is not None:
+        elif release.title is not None and release.contribs is not None:
             result = self._match_title_contrib(release)
         elif release.title is not None:
             result = self._match_title(release)
@@ -332,6 +362,22 @@ def public_api(host_uri):
     conf.host = host_uri
     return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
 
+def release_contrib_tokens(release: ReleaseEntity) -> List[str]:
+    """
+    Return the whitespace-separated name tokens of all contribs of a release.
+
+    Uses raw_name where set and falls back to surname; a contrib with neither
+    name, or a release without contribs, contributes no tokens.
+    """
+    tokens = []
+    for c in release.contribs or []:
+        # Prefer raw_name, which the matcher queries tokenize as well; None
+        # names are skipped instead of raising AttributeError on split().
+        name = c.raw_name if c.raw_name is not None else c.surname
+        if name is not None:
+            tokens += name.split()
+    return tokens
+
 
 def release_tokens(release: ReleaseEntity) -> List[str]:
     """
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 24e103a..dadfa5c 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -30,6 +30,13 @@ def es_compat_hits_total(resp):
     """
     try:
         return resp["hits"]["total"]["value"]  # ES7
+    except KeyError:
+        # with track_total_hits set to False, we observed missing "total" keys,
+        # es returns: {'_shards': {'failed': 0, 'skipped': 0, 'successful': 6,
+        # 'total': 6}, 'hits': {'hits': [{'_id': 'yvqtz2zvkzcbpj4jxrp7b...ons':
+        # [], 'any_abstract': False, 'ark_id': None, ...}, ...}],
+        # 'max_score': 108.32384}, 'timed_out': False, 'took': 921}
+        return len(resp["hits"]["hits"])
     except TypeError:
         return resp["hits"]["total"]  # ES6
 
-- 
cgit v1.2.3