about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-04-12 19:42:31 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-04-12 19:42:31 +0200
commit 07c39548f848ded84bbce8455b974a5e298f1ea2 (patch)
tree 03dfd6936667c3cfb3f2d8b96fd54ed6a004fbd7
parent 81220a314a6bb179db3554ceb36958417535390f (diff)
download fuzzycat-07c39548f848ded84bbce8455b974a5e298f1ea2.tar.gz
fuzzycat-07c39548f848ded84bbce8455b974a5e298f1ea2.zip
address es hits.total change in ES7
* https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
-rw-r--r-- fuzzycat/matching.py 9
-rw-r--r-- fuzzycat/utils.py 14
-rw-r--r-- tests/test_utils.py 11
3 files changed, 28 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index df9617b..9ccb62b 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -12,6 +12,7 @@ from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity
from fatcat_openapi_client.rest import ApiException
from fuzzycat.entities import entity_from_dict, entity_from_json
+from fuzzycat.utils import es_compat_hits_total
settings = Dynaconf(envvar_prefix="FUZZYCAT")
FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
@@ -79,7 +80,7 @@ def match_release_fuzzy(
"size": size,
}
resp = es.search(body=body, index="fatcat_release")
- if resp["hits"]["total"] > 0:
+ if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
# Get fuzzy.
@@ -97,7 +98,7 @@ def match_release_fuzzy(
"size": size,
}
resp = es.search(body=body, index="fatcat_release")
- if resp["hits"]["total"] > 0:
+ if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
# TODO: perform more queries on other fields.
@@ -209,9 +210,9 @@ def anything_to_entity(
if re.match("[0-9]{4}(-)?[0-9]{3,3}[0-9xx]", s):
# TODO: make index name configurable
- url = "{}/fatcat_{}/_search?q=issns:{}".format(es_url, entity_name, s)
+ url = "{}/fatcat_{}/_search?track_total_hits=true&q=issns:{}".format(es_url, entity_name, s)
doc = requests.get(url).json()
- if doc["hits"]["total"] == 1:
+ if es_compat_hits_total(doc) == 1:  # `doc` is the parsed search response fetched above
ident = doc["hits"]["hits"][0]["_source"]["ident"]
url = "{}/{}/{}".format(api_url, entity_name, ident)
return entity_from_json(requests.get(url).text, entity_type)
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 0f73456..bd7ceed 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -19,6 +19,18 @@ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
ParsedPages = collections.namedtuple("ParsedPages", "start end count")
+def es_compat_hits_total(resp):
+ """
+ Given a search response dict, support ES6 and ES7 style total value. See:
+ https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
+
+ It is the responsibility of the call site to set `track_total_hits` in ES7
+ to get an exact number.
+ """
+ try:
+ return resp["hits"]["total"]["value"]  # ES7 style: total is an object carrying "value"
+ except TypeError:
+ return resp["hits"]["total"]  # ES6 style: total is a plain number, so subscripting it raised
def parse_page_string(s):
"""
@@ -177,7 +189,7 @@ def random_idents_from_query(query="*",
if resp.status_code != 200:
raise RuntimeError('could not query {} for random item: {}'.format(es, r.url))
payload = resp.json()
- if payload["hits"]["total"] < 2:
+ if es_compat_hits_total(payload) < 2:
raise RuntimeError('to few documents')
idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
return random.sample(idents, r)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 29b125b..24be9d1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -3,7 +3,7 @@ import os
from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
- zstdlines)
+ zstdlines, es_compat_hits_total)
def test_slugify_string():
@@ -98,3 +98,12 @@ def test_zstdlines():
for zfn, fn in examples:
with open(fn) as f:
assert [s.strip() for s in f.readlines()] == list(zstdlines(zfn))
+
+def test_es_compat_hits_total():
+ cases = (
+ ({"hits": {"total": 6}}, 6),  # ES6 style: total is a plain integer
+ ({"hits": {"total": {"value": 7, "relation": "eq"}}}, 7),  # ES7 style: total is an object
+ )
+ for r, expected in cases:
+ assert es_compat_hits_total(r) == expected
+