aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/search/stats.py
blob: 5496b94a209576fb32bf2697a42b08b5a981906a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from typing import Any, Dict

import elasticsearch
from elasticsearch_dsl import Search

from fatcat_tools.search.common import _hits_total_int, agg_to_dict, wrap_es_execution


def query_es_container_stats(
    ident: str,
    es_client: elasticsearch.Elasticsearch,
    es_index: str = "fatcat_release",
    merge_shadows: bool = False,
) -> Dict[str, Any]:
    """
    Run one aggregation-only search over the documents in `es_index` whose
    `container_id` equals `ident`, and summarize the result.

    If `merge_shadows` is true, the "shadows_only" preservation count is
    added to "none" and "shadows_only" is then reported as zero.

    Returns dict:
        ident: the identifier passed in
        total: count of matching documents
        in_web: count
        in_kbart: count
        is_preserved: count
        preservation{}
            counts keyed by preservation status, plus a "total" key; the
            "bright", "dark", "shadows_only" and "none" keys are always
            present
        release_type{}
            counts keyed by release type
    """

    flag_fields = ("in_web", "in_kbart", "is_preserved")

    search = Search(using=es_client, index=es_index).query(
        "term",
        container_id=ident,
    )

    # A single "filters" aggregation with one named filter per boolean flag.
    search.aggs.bucket(
        "container_stats",
        "filters",
        filters={flag: {"term": {flag: True}} for flag in flag_fields},
    )

    # Two "terms" aggregations; documents lacking the field are bucketed
    # under "_unknown".
    for field_name in ("preservation", "release_type"):
        search.aggs.bucket(
            field_name,
            "terms",
            field=field_name,
            missing="_unknown",
        )

    # Only the aggregations and the total hit count are used below, so ask
    # for zero hits.
    search = search[:0].params(request_cache=True, track_total_hits=True)
    resp = wrap_es_execution(search)

    total = _hits_total_int(resp.hits.total)
    flag_buckets = resp.aggregations.container_stats.buckets

    preservation_counts = agg_to_dict(resp.aggregations.preservation)
    preservation_counts["total"] = total
    # Guarantee the four standard statuses exist even when the aggregation
    # returned no bucket for them.
    for status in ("bright", "dark", "shadows_only", "none"):
        preservation_counts.setdefault(status, 0)
    if merge_shadows:
        preservation_counts["none"] += preservation_counts["shadows_only"]
        preservation_counts["shadows_only"] = 0

    release_type_counts = agg_to_dict(resp.aggregations.release_type)

    stats: Dict[str, Any] = {"ident": ident, "total": total}
    for flag in flag_fields:
        stats[flag] = flag_buckets[flag]["doc_count"]
    stats["preservation"] = preservation_counts
    stats["release_type"] = release_type_counts

    return stats