aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-07-06 16:03:59 -0700
committerBryan Newbold <bnewbold@robocracy.org>2020-07-30 18:17:54 -0700
commit7282174320c7ec9367bddd26fa6f14e424c8480a (patch)
tree3b0cb780c09143b16225f6fb82c6b248cd9dfb20 /python
parenta4e21d7651aded342c495e38a76e3d965ab2ff76 (diff)
downloadfatcat-7282174320c7ec9367bddd26fa6f14e424c8480a.tar.gz
fatcat-7282174320c7ec9367bddd26fa6f14e424c8480a.zip
include new-style preservation+release_type aggs in container stats
Diffstat (limited to 'python')
-rw-r--r--python/fatcat_web/search.py39
-rw-r--r--python/tests/web_search.py13
2 files changed, 47 insertions, 5 deletions
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 3fd7f9dc..f60860c9 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -133,6 +133,20 @@ def wrap_es_execution(search: Search) -> Any:
raise FatcatSearchError(e.status_code, str(e.error), description)
return resp
def agg_to_dict(agg) -> dict:
    """
    Convert a simple terms aggregation result (with buckets) into a plain dict.

    Each entry in ``agg.buckets`` becomes one item mapping the bucket's
    ``key`` (the term) to its ``doc_count``. If ``agg.sum_other_doc_count``
    is non-zero, that count is added under the extra key '_other'; when it
    is zero the '_other' key is omitted entirely.

    By convention, aggregations should be written to have "missing" values
    as '_unknown'. This function does not add that key itself; it only
    passes through whatever bucket keys the aggregation returned.
    """
    result = {bucket.key: bucket.doc_count for bucket in agg.buckets}
    # Added after the buckets so '_other' is the last key, and only when the
    # count is non-zero so callers never see a '_other': 0 entry.
    if agg.sum_other_doc_count:
        result['_other'] = agg.sum_other_doc_count
    return result
+
def do_container_search(
query: GenericQuery, deep_page_limit: int = 2000
) -> SearchHits:
@@ -383,19 +397,36 @@ def get_elastic_container_stats(ident, issnl=None):
},
},
)
+ search.aggs.bucket(
+ 'preservation',
+ 'terms',
+ field='preservation',
+ missing='_unknown',
+ )
+ search.aggs.bucket(
+ 'release_type',
+ 'terms',
+ field='release_type',
+ missing='_unknown',
+ )
+
search = search[:0]
search = search.params(request_cache=True)
resp = wrap_es_execution(search)
- buckets = resp.aggregations.container_stats.buckets
+ container_stats = resp.aggregations.container_stats.buckets
+ preservation_bucket = agg_to_dict(resp.aggregations.preservation)
+ release_type_bucket = agg_to_dict(resp.aggregations.release_type)
stats = {
'ident': ident,
'issnl': issnl,
'total': resp.hits.total,
- 'in_web': buckets['in_web']['doc_count'],
- 'in_kbart': buckets['in_kbart']['doc_count'],
- 'is_preserved': buckets['is_preserved']['doc_count'],
+ 'in_web': container_stats['in_web']['doc_count'],
+ 'in_kbart': container_stats['in_kbart']['doc_count'],
+ 'is_preserved': container_stats['is_preserved']['doc_count'],
+ 'preservation': preservation_bucket,
+ 'release_types': release_type_bucket,
}
return stats
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 1e9b61b3..c90ad6e5 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -114,7 +114,18 @@ def test_container_stats(app, mocker):
'container_stats': {'buckets': {
'is_preserved': {'doc_count': 461939},
'in_kbart': {'doc_count': 461939},
- 'in_web': {'doc_count': 2797}}}},
+ 'in_web': {'doc_count': 2797}}},
+ 'preservation': {
+ 'doc_count_error_upper_bound': 0, 'sum_other_doc_count': 0,
+ 'buckets': [{'key': 'bright', 'doc_count': 4143}, {'key': 'none',
+ 'doc_count': 101}, {'key': 'dark', 'doc_count': 79}, {'key':
+ 'shadows_only', 'doc_count': 5}]},
+ 'release_type': {
+ 'doc_count_error_upper_bound': 0, 'sum_other_doc_count': 0,
+ 'buckets': [{'key': 'article-journal', 'doc_count': 4324}, {'key':
+ 'article', 'doc_count': 2}, {'key': '_unknown', 'doc_count': 1},
+ {'key': 'editorial', 'doc_count': 1}]},
+ },
'hits': {'total': 461939, 'hits': [], 'max_score': 0.0},
'_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0},
'took': 50