matching: update docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-12-07 17:46:35 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-12-07 17:46:35 +0100
commit: 73f54c0ab791f2850850256cfa6065028553940f (patch)
tree: 5bad5a7614a8dd7dde924347b835ac66fa0a0ef5
parent: 9c2587982aa5e9458f043ae7fb3e3365eb47e067 (diff)
download: fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.tar.gz
fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.zip
1 files changed, 8 insertions, 9 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index cb6acbb..cf872a6 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -25,15 +25,14 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
 class FuzzyReleaseMatcher:
     """
     This is a helper class to fetch related documents to a given release
-    document from fatcat search (currently elasticsearc)). Elasticsearch should
-    rank similar documents high itself, so all we try to do here is to tweak
-    the specific query a bit, depending on the completeness of the input
-    document, e.g. if the input has contrib and title, then use both, if it
-    only has a title, then use just that, etc.
+    document from fatcat search (currently elasticsearch).
 
-    We try to get the result in a single query.
+    Elasticsearch should rank similar documents highly by itself, so all we
+    try to do here is to tweak the query a bit, i.e. vary it according to the
+    completeness of the input document: if the input has contrib and title,
+    then use both; if it only has a title, then use just that, etc.
 
-    TODO/Tweaks: e.g. if document do have a "release_year", add this as a "should" clause.
+    This class is currently tested against the live fatcat search instance.
     """
     def __init__(self,
                  es="https://search.fatcat.wiki",
@@ -235,7 +234,7 @@ class FuzzyReleaseMatcher:
 
     def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
         """
-        Throw tokens at elasticsearch.
+        Throw tokens at elasticsearch, but ignore tokens shorter than `min_token_length`.
         """
         token_queries = [
             {
@@ -244,7 +243,7 @@ class FuzzyReleaseMatcher:
                         "query": token,
                     }
                 }
-            } for token in release_tokens(release) if len(token) > self.min_token_length
+            } for token in release_tokens(release) if len(token) >= self.min_token_length
         ]
         query = {
             "bool": {
author	Martin Czygan <martin.czygan@gmail.com>	2021-12-07 17:46:35 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2021-12-07 17:46:35 +0100
commit	73f54c0ab791f2850850256cfa6065028553940f (patch)
tree	5bad5a7614a8dd7dde924347b835ac66fa0a0ef5
parent	9c2587982aa5e9458f043ae7fb3e3365eb47e067 (diff)
download	fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.tar.gz fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.zip