about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-12-07 17:46:35 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-12-07 17:46:35 +0100
commit 73f54c0ab791f2850850256cfa6065028553940f (patch)
tree 5bad5a7614a8dd7dde924347b835ac66fa0a0ef5
parent 9c2587982aa5e9458f043ae7fb3e3365eb47e067 (diff)
download fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.tar.gz
fuzzycat-73f54c0ab791f2850850256cfa6065028553940f.zip
matching: update docs
-rw-r--r-- fuzzycat/matching.py 17
1 files changed, 8 insertions, 9 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index cb6acbb..cf872a6 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -25,15 +25,14 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
class FuzzyReleaseMatcher:
"""
This is a helper class to fetch related documents to a given release
- document from fatcat search (currently elasticsearc)). Elasticsearch should
- rank similar documents high itself, so all we try to do here is to tweak
- the specific query a bit, depending on the completeness of the input
- document, e.g. if the input has contrib and title, then use both, if it
- only has a title, then use just that, etc.
+ document from fatcat search (currently elasticsearch).
- We try to get the result in a single query.
+ Elasticsearch should rank similar documents high itself, so all we try to
+ do here is to tweak the query a bit, e.g. vary it according to the
+ completeness of the input document, e.g. if the input has contrib and
+ title, then use both, if it only has a title, then use just that, etc.
- TODO/Tweaks: e.g. if document do have a "release_year", add this as a "should" clause.
+ This class is currently tested against the live fatcat search instance.
"""
def __init__(self,
es="https://search.fatcat.wiki",
@@ -235,7 +234,7 @@ class FuzzyReleaseMatcher:
def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
- Throw tokens at elasticsearch.
+ Throw tokens at elasticsearch, but ignore tokens shorter than `min_token_length`.
"""
token_queries = [
{
@@ -244,7 +243,7 @@ class FuzzyReleaseMatcher:
"query": token,
}
}
- } for token in release_tokens(release) if len(token) > self.min_token_length
+ } for token in release_tokens(release) if len(token) >= self.min_token_length
]
query = {
"bool": {