diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-12 19:42:31 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-12 19:42:31 +0200 |
commit | 07c39548f848ded84bbce8455b974a5e298f1ea2 (patch) | |
tree | 03dfd6936667c3cfb3f2d8b96fd54ed6a004fbd7 | |
parent | 81220a314a6bb179db3554ceb36958417535390f (diff) | |
download | fuzzycat-07c39548f848ded84bbce8455b974a5e298f1ea2.tar.gz fuzzycat-07c39548f848ded84bbce8455b974a5e298f1ea2.zip |
address es hits.total change in ES7
* https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
-rw-r--r-- | fuzzycat/matching.py | 9 | ||||
-rw-r--r-- | fuzzycat/utils.py | 14 | ||||
-rw-r--r-- | tests/test_utils.py | 11 |
3 files changed, 28 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index df9617b..9ccb62b 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -12,6 +12,7 @@ from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity
 from fatcat_openapi_client.rest import ApiException
 
 from fuzzycat.entities import entity_from_dict, entity_from_json
+from fuzzycat.utils import es_compat_hits_total
 
 settings = Dynaconf(envvar_prefix="FUZZYCAT")
 FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
@@ -79,7 +80,7 @@ def match_release_fuzzy(
         "size": size,
     }
     resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
+    if es_compat_hits_total(resp) > 0:
         return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
 
     # Get fuzzy.
@@ -97,7 +98,7 @@ def match_release_fuzzy(
         "size": size,
     }
     resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
+    if es_compat_hits_total(resp) > 0:
         return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
 
     # TODO: perform more queries on other fields.
@@ -209,9 +210,9 @@ def anything_to_entity(
 
     if re.match("[0-9]{4}(-)?[0-9]{3,3}[0-9xx]", s):
         # TODO: make index name configurable
-        url = "{}/fatcat_{}/_search?q=issns:{}".format(es_url, entity_name, s)
+        url = "{}/fatcat_{}/_search?track_total_hits=true&q=issns:{}".format(es_url, entity_name, s)
         doc = requests.get(url).json()
-        if doc["hits"]["total"] == 1:
+        if es_compat_hits_total(doc) == 1:
             ident = doc["hits"]["hits"][0]["_source"]["ident"]
             url = "{}/{}/{}".format(api_url, entity_name, ident)
             return entity_from_json(requests.get(url).text, entity_type)
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 0f73456..bd7ceed 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -19,6 +19,18 @@ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
 
 ParsedPages = collections.namedtuple("ParsedPages", "start end count")
 
+def es_compat_hits_total(resp):
+    """
+    Given a search response dict, support ES6 and ES7 style total value. See:
+    https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
+
+    It is the responsibility of the call site to set `track_total_hits` in ES7 to
+    get an exact number.
+    """
+    try:
+        return resp["hits"]["total"]["value"]
+    except TypeError:
+        return resp["hits"]["total"]
 
 def parse_page_string(s):
     """
@@ -177,7 +189,7 @@ def random_idents_from_query(query="*",
     if resp.status_code != 200:
         raise RuntimeError('could not query {} for random item: {}'.format(es, r.url))
     payload = resp.json()
-    if payload["hits"]["total"] < 2:
+    if es_compat_hits_total(payload) < 2:
         raise RuntimeError('to few documents')
     idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
     return random.sample(idents, r)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 29b125b..24be9d1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -3,7 +3,7 @@ import os
 
 from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
                             token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
-                            zstdlines)
+                            zstdlines, es_compat_hits_total)
 
 
 def test_slugify_string():
@@ -98,3 +98,12 @@ def test_zstdlines():
     for zfn, fn in examples:
         with open(fn) as f:
             assert [s.strip() for s in f.readlines()] == list(zstdlines(zfn))
+
+def test_es_compat_hits_total():
+    cases = (
+        ({"hits": {"total": 6}}, 6),
+        ({"hits": {"total": {"value": 7, "relation": "eq"}}}, 7),
+    )
+    for r, expected in cases:
+        assert es_compat_hits_total(r) == expected
+
|