diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-06 16:03:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-30 18:17:54 -0700 |
commit | 7282174320c7ec9367bddd26fa6f14e424c8480a (patch) | |
tree | 3b0cb780c09143b16225f6fb82c6b248cd9dfb20 | |
parent | a4e21d7651aded342c495e38a76e3d965ab2ff76 (diff) | |
download | fatcat-7282174320c7ec9367bddd26fa6f14e424c8480a.tar.gz fatcat-7282174320c7ec9367bddd26fa6f14e424c8480a.zip |
include new-style preservation+release_type aggs in container stats
-rw-r--r-- | python/fatcat_web/search.py | 39 | ||||
-rw-r--r-- | python/tests/web_search.py | 13 |
2 files changed, 47 insertions, 5 deletions
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 3fd7f9dc..f60860c9 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -133,6 +133,20 @@ def wrap_es_execution(search: Search) -> Any: raise FatcatSearchError(e.status_code, str(e.error), description) return resp +def agg_to_dict(agg) -> dict: + """ + Takes a simple term aggregation result (with buckets) and returns a simple + dict with keys as terms and counts as values. Includes an extra value + '_other', and by convention aggregations should be writen to have "missing" + vaules as '_unknown'. + """ + result = dict() + for bucket in agg.buckets: + result[bucket.key] = bucket.doc_count + if agg.sum_other_doc_count: + result['_other'] = agg.sum_other_doc_count + return result + def do_container_search( query: GenericQuery, deep_page_limit: int = 2000 ) -> SearchHits: @@ -383,19 +397,36 @@ def get_elastic_container_stats(ident, issnl=None): }, }, ) + search.aggs.bucket( + 'preservation', + 'terms', + field='preservation', + missing='_unknown', + ) + search.aggs.bucket( + 'release_type', + 'terms', + field='release_type', + missing='_unknown', + ) + search = search[:0] search = search.params(request_cache=True) resp = wrap_es_execution(search) - buckets = resp.aggregations.container_stats.buckets + container_stats = resp.aggregations.container_stats.buckets + preservation_bucket = agg_to_dict(resp.aggregations.preservation) + release_type_bucket = agg_to_dict(resp.aggregations.release_type) stats = { 'ident': ident, 'issnl': issnl, 'total': resp.hits.total, - 'in_web': buckets['in_web']['doc_count'], - 'in_kbart': buckets['in_kbart']['doc_count'], - 'is_preserved': buckets['is_preserved']['doc_count'], + 'in_web': container_stats['in_web']['doc_count'], + 'in_kbart': container_stats['in_kbart']['doc_count'], + 'is_preserved': container_stats['is_preserved']['doc_count'], + 'preservation': preservation_bucket, + 'release_types': release_type_bucket, } return stats diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 1e9b61b3..c90ad6e5 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -114,7 +114,18 @@ def test_container_stats(app, mocker): 'container_stats': {'buckets': { 'is_preserved': {'doc_count': 461939}, 'in_kbart': {'doc_count': 461939}, - 'in_web': {'doc_count': 2797}}}}, + 'in_web': {'doc_count': 2797}}}, + 'preservation': { + 'doc_count_error_upper_bound': 0, 'sum_other_doc_count': 0, + 'buckets': [{'key': 'bright', 'doc_count': 4143}, {'key': 'none', + 'doc_count': 101}, {'key': 'dark', 'doc_count': 79}, {'key': + 'shadows_only', 'doc_count': 5}]}, + 'release_type': { + 'doc_count_error_upper_bound': 0, 'sum_other_doc_count': 0, + 'buckets': [{'key': 'article-journal', 'doc_count': 4324}, {'key': + 'article', 'doc_count': 2}, {'key': '_unknown', 'doc_count': 1}, + {'key': 'editorial', 'doc_count': 1}]}, + }, 'hits': {'total': 461939, 'hits': [], 'max_score': 0.0}, '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, 'took': 50 |