diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-02-09 18:03:23 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-02-09 18:06:55 -0800 |
commit | ef7d24cd08b98d58779335acd1d655bd342cd9ec (patch) | |
tree | 8f1e3db6187fd92a359d9c99c54e867f5412c46b /python/fatcat_tools/search/common.py | |
parent | d73ab1f7cc45c122f321f0e717de2067554baabb (diff) | |
download | fatcat-ef7d24cd08b98d58779335acd1d655bd342cd9ec.tar.gz fatcat-ef7d24cd08b98d58779335acd1d655bd342cd9ec.zip |
move container_status ES query code from fatcat_web to fatcat_tools
The main motivation is to never have fatcat_tools import from
fatcat_web, only vica-versa. Some code in fatcat_tools needs container
stats, so starting with that code path (plus some generic helpers).
Diffstat (limited to 'python/fatcat_tools/search/common.py')
-rw-r--r-- | python/fatcat_tools/search/common.py | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/python/fatcat_tools/search/common.py b/python/fatcat_tools/search/common.py new file mode 100644 index 00000000..584757fd --- /dev/null +++ b/python/fatcat_tools/search/common.py @@ -0,0 +1,96 @@ +import sys +from typing import Any, Dict, List, Union + +import elasticsearch +import elasticsearch_dsl.response +from elasticsearch_dsl import Search + + +class FatcatSearchError(Exception): + def __init__(self, status_code: Union[int, str], name: str, description: str = None): + if status_code == "TIMEOUT": + status_code = 504 + elif isinstance(status_code, str): + try: + status_code = int(status_code) + except ValueError: + status_code = 503 + self.status_code = status_code + self.name = name + self.description = description + + +def _hits_total_int(val: Any) -> int: + """ + Compatibility hack between ES 6.x and 7.x. In ES 6x, total is returned as + an int in many places, in ES 7 as a dict (JSON object) with 'value' key + """ + if isinstance(val, int): + return val + else: + return int(val["value"]) + + +def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]: + """ + Takes a response returns all the hits as JSON objects. + + Also handles surrogate strings that elasticsearch returns sometimes, + probably due to mangled data processing in some pipeline. "Crimes against + Unicode"; production workaround + """ + + results = [] + for h in response: + r = h._d_ + # print(h.meta._d_) + results.append(r) + + for h in results: + for key in h: + if type(h[key]) is str: + h[key] = h[key].encode("utf8", "ignore").decode("utf8") + return results + + +def wrap_es_execution(search: Search) -> Any: + """ + Executes a Search object, and converts various ES error types into + something we can pretty print to the user. + """ + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e: + # this is a "user" error + print("elasticsearch 400: " + str(e.info), file=sys.stderr) + description = None + assert isinstance(e.info, dict) + if e.info.get("error", {}).get("root_cause", {}): + description = str(e.info["error"]["root_cause"][0].get("reason")) + raise FatcatSearchError(e.status_code, str(e.error), description) + except elasticsearch.exceptions.ConnectionError as e: + raise FatcatSearchError(e.status_code, "ConnectionError: search engine not available") + except elasticsearch.exceptions.TransportError as e: + # all other errors + print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) + description = None + assert isinstance(e.info, dict) + if e.info and e.info.get("error", {}).get("root_cause", {}): + description = str(e.info["error"]["root_cause"][0].get("reason")) + raise FatcatSearchError(e.status_code, str(e.error), description) + return resp + + +def agg_to_dict(agg: Any) -> Dict[str, Any]: + """ + Takes a simple term aggregation result (with buckets) and returns a simple + dict with keys as terms and counts as values. Includes an extra value + '_other', and by convention aggregations should be written to have "missing" + values as '_unknown'. + """ + result = dict() + for bucket in agg.buckets: + result[bucket.key] = bucket.doc_count + if agg.sum_other_doc_count: + result["_other"] = agg.sum_other_doc_count + return result |