diff options
Diffstat (limited to 'python/fatcat_web/search.py')
-rw-r--r-- | python/fatcat_web/search.py | 518 |
1 files changed, 292 insertions, 226 deletions
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 73781016..5fc3f614 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -1,4 +1,3 @@ - """ Helpers for doing elasticsearch queries (used in the web interface; not part of the formal API) @@ -17,7 +16,6 @@ from fatcat_web import app class FatcatSearchError(Exception): - def __init__(self, status_code: int, name: str, description: str = None): if status_code == "N/A": status_code = 503 @@ -25,6 +23,7 @@ class FatcatSearchError(Exception): self.name = name self.description = description + @dataclass class ReleaseQuery: q: Optional[str] = None @@ -35,31 +34,32 @@ class ReleaseQuery: recent: bool = False @classmethod - def from_args(cls, args) -> 'ReleaseQuery': + def from_args(cls, args) -> "ReleaseQuery": - query_str = args.get('q') or '*' + query_str = args.get("q") or "*" - container_id = args.get('container_id') + container_id = args.get("container_id") # TODO: as filter, not in query string if container_id: query_str += ' container_id:"{}"'.format(container_id) # TODO: where are container_issnl queries actually used? - issnl = args.get('container_issnl') + issnl = args.get("container_issnl") if issnl and query_str: query_str += ' container_issnl:"{}"'.format(issnl) - offset = args.get('offset', '0') + offset = args.get("offset", "0") offset = max(0, int(offset)) if offset.isnumeric() else 0 return ReleaseQuery( q=query_str, offset=offset, - fulltext_only=bool(args.get('fulltext_only')), + fulltext_only=bool(args.get("fulltext_only")), container_id=container_id, - recent=bool(args.get('recent')), + recent=bool(args.get("recent")), ) + @dataclass class GenericQuery: q: Optional[str] = None @@ -67,11 +67,11 @@ class GenericQuery: offset: Optional[int] = None @classmethod - def from_args(cls, args) -> 'GenericQuery': - query_str = args.get('q') + def from_args(cls, args) -> "GenericQuery": + query_str = args.get("q") if not query_str: - query_str = '*' - offset = args.get('offset', '0') + query_str = "*" + offset = args.get("offset", "0") offset = max(0, int(offset)) if offset.isnumeric() else 0 return GenericQuery( @@ -79,6 +79,7 @@ class GenericQuery: offset=offset, ) + @dataclass class SearchHits: count_returned: int @@ -89,6 +90,7 @@ class SearchHits: query_time_ms: int results: List[Any] + def _hits_total_int(val: Any) -> int: """ Compatibility hack between ES 6.x and 7.x. In ES 6x, total is returned as @@ -97,7 +99,7 @@ def _hits_total_int(val: Any) -> int: if isinstance(val, int): return val else: - return int(val['value']) + return int(val["value"]) def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]: @@ -121,6 +123,7 @@ def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict] h[key] = h[key].encode("utf8", "ignore").decode("utf8") return results + def wrap_es_execution(search: Search) -> Any: """ Executes a Search object, and converts various ES error types into @@ -146,6 +149,7 @@ def wrap_es_execution(search: Search) -> Any: raise FatcatSearchError(e.status_code, str(e.error), description) return resp + def agg_to_dict(agg) -> dict: """ Takes a simple term aggregation result (with buckets) and returns a simple @@ -157,14 +161,13 @@ def agg_to_dict(agg) -> dict: for bucket in agg.buckets: result[bucket.key] = bucket.doc_count if agg.sum_other_doc_count: - result['_other'] = agg.sum_other_doc_count + result["_other"] = agg.sum_other_doc_count return result -def do_container_search( - query: GenericQuery, deep_page_limit: int = 2000 -) -> SearchHits: - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) +def do_container_search(query: GenericQuery, deep_page_limit: int = 2000) -> SearchHits: + + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_CONTAINER_INDEX"]) search = search.query( "query_string", @@ -199,11 +202,10 @@ def do_container_search( results=results, ) -def do_release_search( - query: ReleaseQuery, deep_page_limit: int = 2000 -) -> SearchHits: - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +def do_release_search(query: ReleaseQuery, deep_page_limit: int = 2000) -> SearchHits: + + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) # availability filters if query.fulltext_only: @@ -240,7 +242,11 @@ def do_release_search( search = search.query( "boosting", - positive=Q("bool", must=basic_biblio, should=[has_fulltext],), + positive=Q( + "bool", + must=basic_biblio, + should=[has_fulltext], + ), negative=poor_metadata, negative_boost=0.5, ) @@ -260,9 +266,13 @@ def do_release_search( for h in results: # Ensure 'contrib_names' is a list, not a single string - if type(h['contrib_names']) is not list: - h['contrib_names'] = [h['contrib_names'], ] - h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] + if type(h["contrib_names"]) is not list: + h["contrib_names"] = [ + h["contrib_names"], + ] + h["contrib_names"] = [ + name.encode("utf8", "ignore").decode("utf8") for name in h["contrib_names"] + ] return SearchHits( count_returned=len(results), @@ -274,6 +284,7 @@ def do_release_search( results=results, ) + def get_elastic_container_random_releases(ident: str, limit=5) -> dict: """ Returns a list of releases from the container. @@ -281,16 +292,16 @@ def get_elastic_container_random_releases(ident: str, limit=5) -> dict: assert limit > 0 and limit <= 100 - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search = search.query( - 'bool', + "bool", must=[ - Q('term', container_id=ident), - Q('range', release_year={ "lte": datetime.datetime.today().year }), - ] + Q("term", container_id=ident), + Q("range", release_year={"lte": datetime.datetime.today().year}), + ], ) - search = search.sort('-in_web', '-release_date') - search = search[:int(limit)] + search = search.sort("-in_web", "-release_date") + search = search[: int(limit)] search = search.params(request_cache=True) # not needed: search = search.params(track_total_hits=True) @@ -299,6 +310,7 @@ def get_elastic_container_random_releases(ident: str, limit=5) -> dict: return results + def get_elastic_entity_stats() -> dict: """ TODO: files, filesets, webcaptures (no schema yet) @@ -312,11 +324,11 @@ def get_elastic_entity_stats() -> dict: stats = {} # release totals - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search.aggs.bucket( - 'release_ref_count', - 'sum', - field='ref_count', + "release_ref_count", + "sum", + field="ref_count", ) search = search[:0] # pylint: disable=unsubscriptable-object @@ -324,15 +336,15 @@ def get_elastic_entity_stats() -> dict: search = search.params(track_total_hits=True) resp = wrap_es_execution(search) - stats['release'] = { + stats["release"] = { "total": _hits_total_int(resp.hits.total), "refs_total": int(resp.aggregations.release_ref_count.value), } # paper counts - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search = search.query( - 'terms', + "terms", release_type=[ "article-journal", "paper-conference", @@ -341,17 +353,21 @@ def get_elastic_entity_stats() -> dict: ], ) search.aggs.bucket( - 'paper_like', - 'filters', + "paper_like", + "filters", filters={ - "in_web": { "term": { "in_web": "true" } }, - "is_oa": { "term": { "is_oa": "true" } }, - "in_kbart": { "term": { "in_kbart": "true" } }, - "in_web_not_kbart": { "bool": { "filter": [ - { "term": { "in_web": "true" } }, - { "term": { "in_kbart": "false" } }, - ]}}, - } + "in_web": {"term": {"in_web": "true"}}, + "is_oa": {"term": {"is_oa": "true"}}, + "in_kbart": {"term": {"in_kbart": "true"}}, + "in_web_not_kbart": { + "bool": { + "filter": [ + {"term": {"in_web": "true"}}, + {"term": {"in_kbart": "false"}}, + ] + } + }, + }, ) search = search[:0] @@ -359,35 +375,36 @@ def get_elastic_entity_stats() -> dict: search = search.params(track_total_hits=True) resp = wrap_es_execution(search) buckets = resp.aggregations.paper_like.buckets - stats['papers'] = { - 'total': _hits_total_int(resp.hits.total), - 'in_web': buckets.in_web.doc_count, - 'is_oa': buckets.is_oa.doc_count, - 'in_kbart': buckets.in_kbart.doc_count, - 'in_web_not_kbart': buckets.in_web_not_kbart.doc_count, + stats["papers"] = { + "total": _hits_total_int(resp.hits.total), + "in_web": buckets.in_web.doc_count, + "is_oa": buckets.is_oa.doc_count, + "in_kbart": buckets.in_kbart.doc_count, + "in_web_not_kbart": buckets.in_web_not_kbart.doc_count, } # container counts - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_CONTAINER_INDEX"]) search.aggs.bucket( - 'release_ref_count', - 'sum', - field='ref_count', + "release_ref_count", + "sum", + field="ref_count", ) search = search[:0] # pylint: disable=unsubscriptable-object search = search.params(request_cache=True) search = search.params(track_total_hits=True) resp = wrap_es_execution(search) - stats['container'] = { + stats["container"] = { "total": _hits_total_int(resp.hits.total), } return stats + def get_elastic_search_coverage(query: ReleaseQuery) -> dict: - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search = search.query( "query_string", query=query.q, @@ -398,10 +415,10 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict: fields=["biblio"], ) search.aggs.bucket( - 'preservation', - 'terms', - field='preservation', - missing='_unknown', + "preservation", + "terms", + field="preservation", + missing="_unknown", ) if query.recent: date_today = datetime.date.today() @@ -416,21 +433,24 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict: resp = wrap_es_execution(search) preservation_bucket = agg_to_dict(resp.aggregations.preservation) - preservation_bucket['total'] = _hits_total_int(resp.hits.total) - for k in ('bright', 'dark', 'shadows_only', 'none'): + preservation_bucket["total"] = _hits_total_int(resp.hits.total) + for k in ("bright", "dark", "shadows_only", "none"): if k not in preservation_bucket: preservation_bucket[k] = 0 - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: - preservation_bucket['none'] += preservation_bucket['shadows_only'] - preservation_bucket['shadows_only'] = 0 + if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]: + preservation_bucket["none"] += preservation_bucket["shadows_only"] + preservation_bucket["shadows_only"] = 0 stats = { - 'total': _hits_total_int(resp.hits.total), - 'preservation': preservation_bucket, + "total": _hits_total_int(resp.hits.total), + "preservation": preservation_bucket, } return stats -def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None, merge_shadows=None): + +def get_elastic_container_stats( + ident, issnl=None, es_client=None, es_index=None, merge_shadows=None +): """ Returns dict: ident @@ -444,41 +464,41 @@ def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None if not es_client: es_client = app.es_client if not es_index: - es_index = app.config['ELASTICSEARCH_RELEASE_INDEX'] + es_index = app.config["ELASTICSEARCH_RELEASE_INDEX"] if merge_shadows is None: - merge_shadows = app.config['FATCAT_MERGE_SHADOW_PRESERVATION'] + merge_shadows = app.config["FATCAT_MERGE_SHADOW_PRESERVATION"] search = Search(using=es_client, index=es_index) search = search.query( - 'term', + "term", container_id=ident, ) search.aggs.bucket( - 'container_stats', - 'filters', + "container_stats", + "filters", filters={ "in_web": { - "term": { "in_web": True }, + "term": {"in_web": True}, }, "in_kbart": { - "term": { "in_kbart": True }, + "term": {"in_kbart": True}, }, "is_preserved": { - "term": { "is_preserved": True }, + "term": {"is_preserved": True}, }, }, ) search.aggs.bucket( - 'preservation', - 'terms', - field='preservation', - missing='_unknown', + "preservation", + "terms", + field="preservation", + missing="_unknown", ) search.aggs.bucket( - 'release_type', - 'terms', - field='release_type', - missing='_unknown', + "release_type", + "terms", + field="release_type", + missing="_unknown", ) search = search[:0] @@ -489,27 +509,28 @@ def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None container_stats = resp.aggregations.container_stats.buckets preservation_bucket = agg_to_dict(resp.aggregations.preservation) - preservation_bucket['total'] = _hits_total_int(resp.hits.total) - for k in ('bright', 'dark', 'shadows_only', 'none'): + preservation_bucket["total"] = _hits_total_int(resp.hits.total) + for k in ("bright", "dark", "shadows_only", "none"): if k not in preservation_bucket: preservation_bucket[k] = 0 if merge_shadows: - preservation_bucket['none'] += preservation_bucket['shadows_only'] - preservation_bucket['shadows_only'] = 0 + preservation_bucket["none"] += preservation_bucket["shadows_only"] + preservation_bucket["shadows_only"] = 0 release_type_bucket = agg_to_dict(resp.aggregations.release_type) stats = { - 'ident': ident, - 'issnl': issnl, - 'total': _hits_total_int(resp.hits.total), - 'in_web': container_stats['in_web']['doc_count'], - 'in_kbart': container_stats['in_kbart']['doc_count'], - 'is_preserved': container_stats['is_preserved']['doc_count'], - 'preservation': preservation_bucket, - 'release_type': release_type_bucket, + "ident": ident, + "issnl": issnl, + "total": _hits_total_int(resp.hits.total), + "in_web": container_stats["in_web"]["doc_count"], + "in_kbart": container_stats["in_kbart"]["doc_count"], + "is_preserved": container_stats["is_preserved"]["doc_count"], + "preservation": preservation_bucket, + "release_type": release_type_bucket, } return stats + def get_elastic_container_histogram_legacy(ident) -> List: """ Fetches a stacked histogram of {year, in_ia}. This is for the older style @@ -522,48 +543,58 @@ def get_elastic_container_histogram_legacy(ident) -> List: (year, in_ia, count) """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search = search.query( - 'bool', + "bool", must=[ - Q("range", release_year={ - "gte": datetime.datetime.today().year - 499, - "lte": datetime.datetime.today().year, - }), + Q( + "range", + release_year={ + "gte": datetime.datetime.today().year - 499, + "lte": datetime.datetime.today().year, + }, + ), ], filter=[ - Q("bool", minimum_should_match=1, should=[ - Q("match", container_id=ident), - ]), + Q( + "bool", + minimum_should_match=1, + should=[ + Q("match", container_id=ident), + ], + ), ], ) search.aggs.bucket( - 'year_in_ia', - 'composite', + "year_in_ia", + "composite", size=1000, sources=[ - {"year": { - "histogram": { - "field": "release_year", - "interval": 1, - }, - }}, - {"in_ia": { - "terms": { - "field": "in_ia", - }, - }}, + { + "year": { + "histogram": { + "field": "release_year", + "interval": 1, + }, + } + }, + { + "in_ia": { + "terms": { + "field": "in_ia", + }, + } + }, ], ) search = search[:0] - search = search.params(request_cache='true') + search = search.params(request_cache="true") search = search.params(track_total_hits=True) resp = wrap_es_execution(search) buckets = resp.aggregations.year_in_ia.buckets - vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count']) - for h in buckets] + vals = [(int(h["key"]["year"]), h["key"]["in_ia"], h["doc_count"]) for h in buckets] vals = sorted(vals) return vals @@ -580,7 +611,7 @@ def get_elastic_preservation_by_year(query) -> List[dict]: {year (int), bright (int), dark (int), shadows_only (int), none (int)} """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) if query.q not in [None, "*"]: search = search.query( "query_string", @@ -607,41 +638,47 @@ def get_elastic_preservation_by_year(query) -> List[dict]: ) search.aggs.bucket( - 'year_preservation', - 'composite', + "year_preservation", + "composite", size=1500, sources=[ - {"year": { - "histogram": { - "field": "release_year", - "interval": 1, - }, - }}, - {"preservation": { - "terms": { - "field": "preservation", - }, - }}, + { + "year": { + "histogram": { + "field": "release_year", + "interval": 1, + }, + } + }, + { + "preservation": { + "terms": { + "field": "preservation", + }, + } + }, ], ) search = search[:0] - search = search.params(request_cache='true') + search = search.params(request_cache="true") search = search.params(track_total_hits=True) resp = wrap_es_execution(search) buckets = resp.aggregations.year_preservation.buckets - year_nums = set([int(h['key']['year']) for h in buckets]) + year_nums = set([int(h["key"]["year"]) for h in buckets]) year_dicts = dict() if year_nums: - for num in range(min(year_nums), max(year_nums)+1): + for num in range(min(year_nums), max(year_nums) + 1): year_dicts[num] = dict(year=num, bright=0, dark=0, shadows_only=0, none=0) for row in buckets: - year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count']) - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: + year_dicts[int(row["key"]["year"])][row["key"]["preservation"]] = int( + row["doc_count"] + ) + if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]: for k in year_dicts.keys(): - year_dicts[k]['none'] += year_dicts[k]['shadows_only'] - year_dicts[k]['shadows_only'] = 0 - return sorted(year_dicts.values(), key=lambda x: x['year']) + year_dicts[k]["none"] += year_dicts[k]["shadows_only"] + year_dicts[k]["shadows_only"] = 0 + return sorted(year_dicts.values(), key=lambda x: x["year"]) def get_elastic_preservation_by_date(query) -> List[dict]: @@ -656,7 +693,7 @@ def get_elastic_preservation_by_date(query) -> List[dict]: {date (str), bright (int), dark (int), shadows_only (int), none (int)} """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) if query.q not in [None, "*"]: search = search.query( "query_string", @@ -678,32 +715,37 @@ def get_elastic_preservation_by_date(query) -> List[dict]: start_date = date_today - datetime.timedelta(days=60) end_date = date_today + datetime.timedelta(days=1) search = search.filter( - "range", release_date=dict( + "range", + release_date=dict( gte=str(start_date), lte=str(end_date), - ) + ), ) search.aggs.bucket( - 'date_preservation', - 'composite', + "date_preservation", + "composite", size=1500, sources=[ - {"date": { - "histogram": { - "field": "release_date", - "interval": 1, - }, - }}, - {"preservation": { - "terms": { - "field": "preservation", - }, - }}, + { + "date": { + "histogram": { + "field": "release_date", + "interval": 1, + }, + } + }, + { + "preservation": { + "terms": { + "field": "preservation", + }, + } + }, ], ) search = search[:0] - search = search.params(request_cache='true') + search = search.params(request_cache="true") search = search.params(track_total_hits=True) resp = wrap_es_execution(search) @@ -711,15 +753,18 @@ def get_elastic_preservation_by_date(query) -> List[dict]: date_dicts = dict() this_date = start_date while this_date <= end_date: - date_dicts[str(this_date)] = dict(date=str(this_date), bright=0, dark=0, shadows_only=0, none=0) + date_dicts[str(this_date)] = dict( + date=str(this_date), bright=0, dark=0, shadows_only=0, none=0 + ) this_date = this_date + datetime.timedelta(days=1) for row in buckets: - date_dicts[row['key']['date'][0:10]][row['key']['preservation']] = int(row['doc_count']) - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: + date_dicts[row["key"]["date"][0:10]][row["key"]["preservation"]] = int(row["doc_count"]) + if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]: for k in date_dicts.keys(): - date_dicts[k]['none'] += date_dicts[k]['shadows_only'] - date_dicts[k]['shadows_only'] = 0 - return sorted(date_dicts.values(), key=lambda x: x['date']) + date_dicts[k]["none"] += date_dicts[k]["shadows_only"] + date_dicts[k]["shadows_only"] = 0 + return sorted(date_dicts.values(), key=lambda x: x["date"]) + def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]: """ @@ -733,52 +778,64 @@ def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict {year (int), bright (int), dark (int), shadows_only (int), none (int)} """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) search = search.query( - 'bool', + "bool", filter=[ - Q("bool", must=[ - Q("match", container_id=container_id), - Q("exists", field="volume"), - ]), + Q( + "bool", + must=[ + Q("match", container_id=container_id), + Q("exists", field="volume"), + ], + ), ], ) search.aggs.bucket( - 'volume_preservation', - 'composite', + "volume_preservation", + "composite", size=1500, sources=[ - {"volume": { - "terms": { - "field": "volume", - }, - }}, - {"preservation": { - "terms": { - "field": "preservation", - }, - }}, + { + "volume": { + "terms": { + "field": "volume", + }, + } + }, + { + "preservation": { + "terms": { + "field": "preservation", + }, + } + }, ], ) search = search[:0] - search = search.params(request_cache='true') + search = search.params(request_cache="true") search = search.params(track_total_hits=True) resp = wrap_es_execution(search) buckets = resp.aggregations.volume_preservation.buckets - volume_nums = set([int(h['key']['volume']) for h in buckets if h['key']['volume'].isdigit()]) + volume_nums = set( + [int(h["key"]["volume"]) for h in buckets if h["key"]["volume"].isdigit()] + ) volume_dicts = dict() if volume_nums: - for num in range(min(volume_nums), max(volume_nums)+1): + for num in range(min(volume_nums), max(volume_nums) + 1): volume_dicts[num] = dict(volume=num, bright=0, dark=0, shadows_only=0, none=0) for row in buckets: - if row['key']['volume'].isdigit(): - volume_dicts[int(row['key']['volume'])][row['key']['preservation']] = int(row['doc_count']) - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: + if row["key"]["volume"].isdigit(): + volume_dicts[int(row["key"]["volume"])][row["key"]["preservation"]] = int( + row["doc_count"] + ) + if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]: for k in volume_dicts.keys(): - volume_dicts[k]['none'] += volume_dicts[k]['shadows_only'] - volume_dicts[k]['shadows_only'] = 0 - return sorted(volume_dicts.values(), key=lambda x: x['volume']) + volume_dicts[k]["none"] += volume_dicts[k]["shadows_only"] + volume_dicts[k]["shadows_only"] = 0 + return sorted(volume_dicts.values(), key=lambda x: x["volume"]) + def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]: """ @@ -789,7 +846,7 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]: {year (int), bright (int), dark (int), shadows_only (int), none (int)} """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"]) if query.q not in [None, "*"]: search = search.query( "query_string", @@ -804,11 +861,14 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]: ) if query.container_id: search = search.query( - 'bool', + "bool", filter=[ - Q("bool", must=[ - Q("match", container_id=query.container_id), - ]), + Q( + "bool", + must=[ + Q("match", container_id=query.container_id), + ], + ), ], ) if query.recent: @@ -817,39 +877,45 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]: end_date = str(date_today + datetime.timedelta(days=1)) search = search.filter("range", release_date=dict(gte=start_date, lte=end_date)) search.aggs.bucket( - 'type_preservation', - 'composite', + "type_preservation", + "composite", size=1500, sources=[ - {"release_type": { - "terms": { - "field": "release_type", - }, - }}, - {"preservation": { - "terms": { - "field": "preservation", - }, - }}, + { + "release_type": { + "terms": { + "field": "release_type", + }, + } + }, + { + "preservation": { + "terms": { + "field": "preservation", + }, + } + }, ], ) search = search[:0] - search = search.params(request_cache='true') + search = search.params(request_cache="true") search = search.params(track_total_hits=True) resp = wrap_es_execution(search) buckets = resp.aggregations.type_preservation.buckets - type_set = set([h['key']['release_type'] for h in buckets]) + type_set = set([h["key"]["release_type"] for h in buckets]) type_dicts = dict() for k in type_set: type_dicts[k] = dict(release_type=k, bright=0, dark=0, shadows_only=0, none=0, total=0) for row in buckets: - type_dicts[row['key']['release_type']][row['key']['preservation']] = int(row['doc_count']) + type_dicts[row["key"]["release_type"]][row["key"]["preservation"]] = int( + row["doc_count"] + ) for k in type_set: - for p in ('bright', 'dark', 'shadows_only', 'none'): - type_dicts[k]['total'] += type_dicts[k][p] - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: + for p in ("bright", "dark", "shadows_only", "none"): + type_dicts[k]["total"] += type_dicts[k][p] + if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]: for k in type_set: - type_dicts[k]['none'] += type_dicts[k]['shadows_only'] - type_dicts[k]['shadows_only'] = 0 - return sorted(type_dicts.values(), key=lambda x: x['total'], reverse=True) + type_dicts[k]["none"] += type_dicts[k]["shadows_only"] + type_dicts[k]["shadows_only"] = 0 + return sorted(type_dicts.values(), key=lambda x: x["total"], reverse=True) |