diff options
Diffstat (limited to 'python/fatcat_tools/search/stats.py')
-rw-r--r-- | python/fatcat_tools/search/stats.py | 87 |
1 files changed, 87 insertions, 0 deletions
# python/fatcat_tools/search/stats.py (added as a new file in this diff; blob 5496b94a)
from typing import Any, Dict

import elasticsearch
from elasticsearch_dsl import Search

from fatcat_tools.search.common import _hits_total_int, agg_to_dict, wrap_es_execution


def query_es_container_stats(
    ident: str,
    es_client: elasticsearch.Elasticsearch,
    es_index: str = "fatcat_release",
    merge_shadows: bool = False,
) -> Dict[str, Any]:
    """
    Run a single aggregation-only search for one container and summarize it.

    The search matches documents in `es_index` whose `container_id` term
    equals `ident`, is sliced to zero hits, and is executed through
    `wrap_es_execution` with `request_cache` and `track_total_hits` enabled.

    When `merge_shadows` is true, the "shadows_only" preservation count is
    added into "none" and then reset to zero.

    Returns dict:
        ident
        total: count
        in_web: count
        in_kbart: count
        is_preserved: count
        preservation{}
            "histogram" by preservation status (also carries a "total" key)
        release_type{}
            "histogram" by release type
    """

    flag_fields = ("in_web", "in_kbart", "is_preserved")
    histogram_fields = ("preservation", "release_type")

    query = Search(using=es_client, index=es_index).query(
        "term",
        container_id=ident,
    )

    # One "filters" aggregation with a named bucket per boolean flag.
    query.aggs.bucket(
        "container_stats",
        "filters",
        filters={flag: {"term": {flag: True}} for flag in flag_fields},
    )
    # One "terms" aggregation per histogram; each aggregation is named after
    # the field it buckets on.
    for field_name in histogram_fields:
        query.aggs.bucket(
            field_name,
            "terms",
            field=field_name,
            missing="_unknown",
        )

    # Zero-length slice: only the aggregations and the hit total are used.
    query = query[:0].params(request_cache=True).params(track_total_hits=True)
    resp = wrap_es_execution(query)

    flag_buckets = resp.aggregations.container_stats.buckets

    preservation_counts = agg_to_dict(resp.aggregations.preservation)
    preservation_counts["total"] = _hits_total_int(resp.hits.total)
    # Make sure every expected preservation status is present, defaulting to 0.
    for status in ("bright", "dark", "shadows_only", "none"):
        preservation_counts.setdefault(status, 0)
    if merge_shadows:
        preservation_counts["none"] += preservation_counts["shadows_only"]
        preservation_counts["shadows_only"] = 0

    release_type_counts = agg_to_dict(resp.aggregations.release_type)

    summary: Dict[str, Any] = {
        "ident": ident,
        "total": _hits_total_int(resp.hits.total),
    }
    for flag in flag_fields:
        summary[flag] = flag_buckets[flag]["doc_count"]
    summary["preservation"] = preservation_counts
    summary["release_type"] = release_type_counts
    return summary