apply first round of feedback on matching (HEAD, master)

author: Martin Czygan <martin.czygan@gmail.com> 2021-12-17 10:07:15 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-12-21 20:56:56 +0100
commit: de9f1155ea57c812171abd5517ab39f4fe135cb3 (patch)
tree: 2b2071642259c46ede5b56d15cbce15187226362 /fuzzycat/matching.py
parent: 4720fb51584fae1edc2a79dd94c24b4ddac92acb (diff)
download: fuzzycat-master.tar.gz
fuzzycat-master.zip
1 files changed, 52 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b01ce64..38899f9 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -45,7 +45,7 @@ class FuzzyReleaseMatcher:
     ...) this is and will be too slow.
 
     Anecdata: An early 2020 test run matching 23M "title strings" took
-    literally a couple of weeks to complete.
+    literally weeks to complete.
 
     This class is currently tested against the live fatcat search instance. A
     usage example:
@@ -63,7 +63,8 @@ class FuzzyReleaseMatcher:
                  index="fatcat_release",
                  size=10,
                  min_token_length=3,
-                 release_year_padding=1):
+                 release_year_padding=1,
+                 skip_id_matching=False):
         if isinstance(es, str):
             self.es = elasticsearch.Elasticsearch([es])
         else:
@@ -74,6 +75,7 @@ class FuzzyReleaseMatcher:
         self.logger = logging.getLogger("fuzzy")
         self.min_token_length = min_token_length
         self.release_year_padding = 1
+        self.skip_id_matching = skip_id_matching
 
     def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
         """
@@ -114,7 +116,10 @@ class FuzzyReleaseMatcher:
         """
         Match in the presence of defined title and contrib fields.
         """
-        contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+        contrib_tokens = [
+            tok for c in release.contribs for tok in c.raw_name.split()
+            if len(tok) > self.min_token_length
+        ]
         contrib_queries = [{
             "match": {
                 "contrib_names": {
@@ -124,6 +129,11 @@ class FuzzyReleaseMatcher:
         } for token in contrib_tokens]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": [
                     {
                         "match": {
@@ -137,6 +147,9 @@ class FuzzyReleaseMatcher:
                 ] + contrib_queries,
             },
         }
+        # TODO: could boost on various things, like "overall metadata quality"
+        # (eg, does indexed record have title+year+release_type+container), or
+        # on publication stage (assuming things getting cited are 'published')
         if release.release_year is not None:
             query["bool"]["must"].append({
                 "range": {
@@ -149,6 +162,7 @@ class FuzzyReleaseMatcher:
             })
         result = []
         self.logger.info(query)
+        # TODO: can we use the container name
         resp = self.es.search(index=self.index,
                               body={
                                   "query": query,
@@ -171,6 +185,11 @@ class FuzzyReleaseMatcher:
         """
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": [
                     {
                         "match": {
@@ -225,6 +244,11 @@ class FuzzyReleaseMatcher:
         } for token in contrib_tokens]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": contrib_queries,
             },
         }
@@ -270,6 +294,11 @@ class FuzzyReleaseMatcher:
         ]
         query = {
             "bool": {
+                "must_not": [{
+                    "match": {
+                        "release_type": "stub",
+                    },
+                }],
                 "must": token_queries,
             },
         }
@@ -307,10 +336,11 @@ class FuzzyReleaseMatcher:
         document.
         """
         if not release:
-            return []
-        if release.ext_ids and len(release.ext_ids.to_dict()) > 0:
+            result = []
+        elif not self.skip_id_matching and release.ext_ids and any(
+                release.ext_ids.to_dict().values()):
             result = self._match_id(release)
-        if release.title is not None and release.contribs is not None:
+        elif release.title is not None and release.contribs is not None:
             result = self._match_title_contrib(release)
         elif release.title is not None:
             result = self._match_title(release)
@@ -332,6 +362,22 @@ def public_api(host_uri):
     conf.host = host_uri
     return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
 
+def release_contrib_tokens(release: ReleaseEntity) -> List[str]:
+    """
+    Return the whitespace-separated name tokens of all contribs of a release.
+
+    For each contrib the surname is used if set, otherwise the raw_name;
+    contribs with neither (and a release without contribs) yield no tokens.
+    """
+    tokens = []
+    for c in release.contribs or []:
+        # Prefer the surname and only fall back to the raw name, so every
+        # contrib contributes tokens from exactly one of its name fields.
+        name = c.surname if c.surname is not None else c.raw_name
+        if name is not None:
+            tokens += name.split()
+    return tokens
+
 
 def release_tokens(release: ReleaseEntity) -> List[str]:
     """
author	Martin Czygan <martin.czygan@gmail.com>	2021-12-17 10:07:15 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2021-12-21 20:56:56 +0100
commit	de9f1155ea57c812171abd5517ab39f4fe135cb3 (patch)
tree	2b2071642259c46ede5b56d15cbce15187226362 /fuzzycat/matching.py
parent	4720fb51584fae1edc2a79dd94c24b4ddac92acb (diff)
download	fuzzycat-master.tar.gz fuzzycat-master.zip