diff options
-rw-r--r-- | fuzzycat/matching.py | 17 |
1 file changed, 8 insertions, 9 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index cb6acbb..cf872a6 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -25,15 +25,14 @@ FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0") class FuzzyReleaseMatcher: """ This is a helper class to fetch related documents to a given release - document from fatcat search (currently elasticsearc)). Elasticsearch should - rank similar documents high itself, so all we try to do here is to tweak - the specific query a bit, depending on the completeness of the input - document, e.g. if the input has contrib and title, then use both, if it - only has a title, then use just that, etc. + document from fatcat search (currently elasticsearch). - We try to get the result in a single query. + Elasticsearch should rank similar documents high itself, so all we try to + do here is to tweak the query a bit, e.g. vary it according to the + completeness of the input document, e.g. if the input has contrib and + title, then use both, if it only has a title, then use just that, etc. - TODO/Tweaks: e.g. if document do have a "release_year", add this as a "should" clause. + This class is currently tested against the live fatcat search instance. """ def __init__(self, es="https://search.fatcat.wiki", @@ -235,7 +234,7 @@ class FuzzyReleaseMatcher: def _match_generic(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: """ - Throw tokens at elasticsearch. + Throw tokens at elasticsearch, but ignore tokens shorter than `min_token_length`. """ token_queries = [ { @@ -244,7 +243,7 @@ class FuzzyReleaseMatcher: "query": token, } } - } for token in release_tokens(release) if len(token) > self.min_token_length + } for token in release_tokens(release) if len(token) >= self.min_token_length ] query = { "bool": { |