Diffstat (limited to 'python/fatcat_web/search.py'):
 -rw-r--r--  python/fatcat_web/search.py | 518
 1 file changed, 292 insertions(+), 226 deletions(-)
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 73781016..5fc3f614 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -1,4 +1,3 @@
-
"""
Helpers for doing elasticsearch queries (used in the web interface; not part of
the formal API)
@@ -17,7 +16,6 @@ from fatcat_web import app
class FatcatSearchError(Exception):
-
def __init__(self, status_code: int, name: str, description: str = None):
if status_code == "N/A":
status_code = 503
@@ -25,6 +23,7 @@ class FatcatSearchError(Exception):
self.name = name
self.description = description
+
@dataclass
class ReleaseQuery:
q: Optional[str] = None
@@ -35,31 +34,32 @@ class ReleaseQuery:
recent: bool = False
@classmethod
- def from_args(cls, args) -> 'ReleaseQuery':
+ def from_args(cls, args) -> "ReleaseQuery":
- query_str = args.get('q') or '*'
+ query_str = args.get("q") or "*"
- container_id = args.get('container_id')
+ container_id = args.get("container_id")
# TODO: as filter, not in query string
if container_id:
query_str += ' container_id:"{}"'.format(container_id)
# TODO: where are container_issnl queries actually used?
- issnl = args.get('container_issnl')
+ issnl = args.get("container_issnl")
if issnl and query_str:
query_str += ' container_issnl:"{}"'.format(issnl)
- offset = args.get('offset', '0')
+ offset = args.get("offset", "0")
offset = max(0, int(offset)) if offset.isnumeric() else 0
return ReleaseQuery(
q=query_str,
offset=offset,
- fulltext_only=bool(args.get('fulltext_only')),
+ fulltext_only=bool(args.get("fulltext_only")),
container_id=container_id,
- recent=bool(args.get('recent')),
+ recent=bool(args.get("recent")),
)
+
@dataclass
class GenericQuery:
q: Optional[str] = None
@@ -67,11 +67,11 @@ class GenericQuery:
offset: Optional[int] = None
@classmethod
- def from_args(cls, args) -> 'GenericQuery':
- query_str = args.get('q')
+ def from_args(cls, args) -> "GenericQuery":
+ query_str = args.get("q")
if not query_str:
- query_str = '*'
- offset = args.get('offset', '0')
+ query_str = "*"
+ offset = args.get("offset", "0")
offset = max(0, int(offset)) if offset.isnumeric() else 0
return GenericQuery(
@@ -79,6 +79,7 @@ class GenericQuery:
offset=offset,
)
+
@dataclass
class SearchHits:
count_returned: int
@@ -89,6 +90,7 @@ class SearchHits:
query_time_ms: int
results: List[Any]
+
def _hits_total_int(val: Any) -> int:
"""
Compatibility hack between ES 6.x and 7.x. In ES 6.x, total is returned as
@@ -97,7 +99,7 @@ def _hits_total_int(val: Any) -> int:
if isinstance(val, int):
return val
else:
- return int(val['value'])
+ return int(val["value"])
def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]:
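
[note] The _hits_total_int() shim above exists because the two ES major
versions report hit totals differently; a minimal sketch of the two shapes it
accepts (values illustrative):

    # ES 6.x returns hits.total as a bare integer:
    assert _hits_total_int(1234) == 1234
    # ES 7.x returns an object with "value" and "relation" keys:
    assert _hits_total_int({"value": 1234, "relation": "eq"}) == 1234
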
@@ -121,6 +123,7 @@ def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]
h[key] = h[key].encode("utf8", "ignore").decode("utf8")
return results
+
def wrap_es_execution(search: Search) -> Any:
"""
Executes a Search object, and converts various ES error types into
@@ -146,6 +149,7 @@ def wrap_es_execution(search: Search) -> Any:
raise FatcatSearchError(e.status_code, str(e.error), description)
return resp
+
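
[note] A hedged usage sketch for wrap_es_execution(): every query helper in
this file executes through it, so callers can catch a single exception type
(render_error_page() is a hypothetical handler, not from this file):

    try:
        resp = wrap_es_execution(search)
    except FatcatSearchError as fse:
        # fse.status_code is an int; the "N/A" transport sentinel maps to 503
        render_error_page(fse.status_code, fse.name, fse.description)
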
def agg_to_dict(agg) -> dict:
"""
Takes a simple term aggregation result (with buckets) and returns a simple
@@ -157,14 +161,13 @@ def agg_to_dict(agg) -> dict:
for bucket in agg.buckets:
result[bucket.key] = bucket.doc_count
if agg.sum_other_doc_count:
- result['_other'] = agg.sum_other_doc_count
+ result["_other"] = agg.sum_other_doc_count
return result
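
[note] Illustrative input/output for agg_to_dict() (bucket values made up): a
terms aggregation with buckets [("article-journal", 150), ("book", 20)] and
sum_other_doc_count == 5 flattens to:

    agg_to_dict(resp.aggregations.release_type)
    # -> {"article-journal": 150, "book": 20, "_other": 5}
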
-def do_container_search(
- query: GenericQuery, deep_page_limit: int = 2000
-) -> SearchHits:
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
+def do_container_search(query: GenericQuery, deep_page_limit: int = 2000) -> SearchHits:
+
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_CONTAINER_INDEX"])
search = search.query(
"query_string",
@@ -199,11 +202,10 @@ def do_container_search(
results=results,
)
-def do_release_search(
- query: ReleaseQuery, deep_page_limit: int = 2000
-) -> SearchHits:
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+def do_release_search(query: ReleaseQuery, deep_page_limit: int = 2000) -> SearchHits:
+
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
# availability filters
if query.fulltext_only:
@@ -240,7 +242,11 @@ def do_release_search(
search = search.query(
"boosting",
- positive=Q("bool", must=basic_biblio, should=[has_fulltext],),
+ positive=Q(
+ "bool",
+ must=basic_biblio,
+ should=[has_fulltext],
+ ),
negative=poor_metadata,
negative_boost=0.5,
)
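
[note] The reflowed clause above is elasticsearch-dsl's compound "boosting"
query: documents matching the negative clause stay in the result set but have
their score multiplied by negative_boost rather than being filtered out. A
standalone sketch, with illustrative stand-ins for the clauses built earlier
in this function:

    from elasticsearch_dsl import Q

    q = Q(
        "boosting",
        positive=Q("match", title="coffee"),        # stand-in for basic_biblio/has_fulltext
        negative=Q("term", release_stage="draft"),  # stand-in for poor_metadata
        negative_boost=0.5,
    )
    q.to_dict()
    # {'boosting': {'positive': {...}, 'negative': {...}, 'negative_boost': 0.5}}
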
@@ -260,9 +266,13 @@ def do_release_search(
for h in results:
# Ensure 'contrib_names' is a list, not a single string
- if type(h['contrib_names']) is not list:
- h['contrib_names'] = [h['contrib_names'], ]
- h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+ if type(h["contrib_names"]) is not list:
+ h["contrib_names"] = [
+ h["contrib_names"],
+ ]
+ h["contrib_names"] = [
+ name.encode("utf8", "ignore").decode("utf8") for name in h["contrib_names"]
+ ]
return SearchHits(
count_returned=len(results),
@@ -274,6 +284,7 @@ def do_release_search(
results=results,
)
+
def get_elastic_container_random_releases(ident: str, limit=5) -> dict:
"""
Returns a list of releases from the container.
@@ -281,16 +292,16 @@ def get_elastic_container_random_releases(ident: str, limit=5) -> dict:
assert limit > 0 and limit <= 100
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search = search.query(
- 'bool',
+ "bool",
must=[
- Q('term', container_id=ident),
- Q('range', release_year={ "lte": datetime.datetime.today().year }),
- ]
+ Q("term", container_id=ident),
+ Q("range", release_year={"lte": datetime.datetime.today().year}),
+ ],
)
- search = search.sort('-in_web', '-release_date')
- search = search[:int(limit)]
+ search = search.sort("-in_web", "-release_date")
+ search = search[: int(limit)]
search = search.params(request_cache=True)
# not needed: search = search.params(track_total_hits=True)
@@ -299,6 +310,7 @@ def get_elastic_container_random_releases(ident: str, limit=5) -> dict:
return results
+
def get_elastic_entity_stats() -> dict:
"""
TODO: files, filesets, webcaptures (no schema yet)
@@ -312,11 +324,11 @@ def get_elastic_entity_stats() -> dict:
stats = {}
# release totals
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search.aggs.bucket(
- 'release_ref_count',
- 'sum',
- field='ref_count',
+ "release_ref_count",
+ "sum",
+ field="ref_count",
)
search = search[:0] # pylint: disable=unsubscriptable-object
@@ -324,15 +336,15 @@ def get_elastic_entity_stats() -> dict:
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
- stats['release'] = {
+ stats["release"] = {
"total": _hits_total_int(resp.hits.total),
"refs_total": int(resp.aggregations.release_ref_count.value),
}
# paper counts
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search = search.query(
- 'terms',
+ "terms",
release_type=[
"article-journal",
"paper-conference",
@@ -341,17 +353,21 @@ def get_elastic_entity_stats() -> dict:
],
)
search.aggs.bucket(
- 'paper_like',
- 'filters',
+ "paper_like",
+ "filters",
filters={
- "in_web": { "term": { "in_web": "true" } },
- "is_oa": { "term": { "is_oa": "true" } },
- "in_kbart": { "term": { "in_kbart": "true" } },
- "in_web_not_kbart": { "bool": { "filter": [
- { "term": { "in_web": "true" } },
- { "term": { "in_kbart": "false" } },
- ]}},
- }
+ "in_web": {"term": {"in_web": "true"}},
+ "is_oa": {"term": {"is_oa": "true"}},
+ "in_kbart": {"term": {"in_kbart": "true"}},
+ "in_web_not_kbart": {
+ "bool": {
+ "filter": [
+ {"term": {"in_web": "true"}},
+ {"term": {"in_kbart": "false"}},
+ ]
+ }
+ },
+ },
)
search = search[:0]
@@ -359,35 +375,36 @@ def get_elastic_entity_stats() -> dict:
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
buckets = resp.aggregations.paper_like.buckets
- stats['papers'] = {
- 'total': _hits_total_int(resp.hits.total),
- 'in_web': buckets.in_web.doc_count,
- 'is_oa': buckets.is_oa.doc_count,
- 'in_kbart': buckets.in_kbart.doc_count,
- 'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,
+ stats["papers"] = {
+ "total": _hits_total_int(resp.hits.total),
+ "in_web": buckets.in_web.doc_count,
+ "is_oa": buckets.is_oa.doc_count,
+ "in_kbart": buckets.in_kbart.doc_count,
+ "in_web_not_kbart": buckets.in_web_not_kbart.doc_count,
}
# container counts
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_CONTAINER_INDEX"])
search.aggs.bucket(
- 'release_ref_count',
- 'sum',
- field='ref_count',
+ "release_ref_count",
+ "sum",
+ field="ref_count",
)
search = search[:0] # pylint: disable=unsubscriptable-object
search = search.params(request_cache=True)
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
- stats['container'] = {
+ stats["container"] = {
"total": _hits_total_int(resp.hits.total),
}
return stats
+
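
[note] For reference, get_elastic_entity_stats() returns a dict of roughly
this shape (all counts illustrative):

    {
        "release": {"total": 150000000, "refs_total": 1200000000},
        "papers": {
            "total": 100000000,
            "in_web": 25000000,
            "is_oa": 20000000,
            "in_kbart": 30000000,
            "in_web_not_kbart": 5000000,
        },
        "container": {"total": 150000},
    }
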
def get_elastic_search_coverage(query: ReleaseQuery) -> dict:
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search = search.query(
"query_string",
query=query.q,
@@ -398,10 +415,10 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict:
fields=["biblio"],
)
search.aggs.bucket(
- 'preservation',
- 'terms',
- field='preservation',
- missing='_unknown',
+ "preservation",
+ "terms",
+ field="preservation",
+ missing="_unknown",
)
if query.recent:
date_today = datetime.date.today()
@@ -416,21 +433,24 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict:
resp = wrap_es_execution(search)
preservation_bucket = agg_to_dict(resp.aggregations.preservation)
- preservation_bucket['total'] = _hits_total_int(resp.hits.total)
- for k in ('bright', 'dark', 'shadows_only', 'none'):
+ preservation_bucket["total"] = _hits_total_int(resp.hits.total)
+ for k in ("bright", "dark", "shadows_only", "none"):
if k not in preservation_bucket:
preservation_bucket[k] = 0
- if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
- preservation_bucket['none'] += preservation_bucket['shadows_only']
- preservation_bucket['shadows_only'] = 0
+ if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
+ preservation_bucket["none"] += preservation_bucket["shadows_only"]
+ preservation_bucket["shadows_only"] = 0
stats = {
- 'total': _hits_total_int(resp.hits.total),
- 'preservation': preservation_bucket,
+ "total": _hits_total_int(resp.hits.total),
+ "preservation": preservation_bucket,
}
return stats
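
[note] The coverage helper returns the flattened preservation aggregation: the
four named preservation keys are always zero-filled, "_unknown" comes from the
aggregation's missing bucket (only present when such docs exist), and
shadows_only is folded into none when FATCAT_MERGE_SHADOW_PRESERVATION is set
(counts illustrative):

    {
        "total": 1200,
        "preservation": {
            "total": 1200,
            "bright": 800,
            "dark": 100,
            "shadows_only": 0,
            "none": 290,
            "_unknown": 10,
        },
    }
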
-def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None, merge_shadows=None):
+
+def get_elastic_container_stats(
+ ident, issnl=None, es_client=None, es_index=None, merge_shadows=None
+):
"""
Returns dict:
ident
@@ -444,41 +464,41 @@ def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None
if not es_client:
es_client = app.es_client
if not es_index:
- es_index = app.config['ELASTICSEARCH_RELEASE_INDEX']
+ es_index = app.config["ELASTICSEARCH_RELEASE_INDEX"]
if merge_shadows is None:
- merge_shadows = app.config['FATCAT_MERGE_SHADOW_PRESERVATION']
+ merge_shadows = app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]
search = Search(using=es_client, index=es_index)
search = search.query(
- 'term',
+ "term",
container_id=ident,
)
search.aggs.bucket(
- 'container_stats',
- 'filters',
+ "container_stats",
+ "filters",
filters={
"in_web": {
- "term": { "in_web": True },
+ "term": {"in_web": True},
},
"in_kbart": {
- "term": { "in_kbart": True },
+ "term": {"in_kbart": True},
},
"is_preserved": {
- "term": { "is_preserved": True },
+ "term": {"is_preserved": True},
},
},
)
search.aggs.bucket(
- 'preservation',
- 'terms',
- field='preservation',
- missing='_unknown',
+ "preservation",
+ "terms",
+ field="preservation",
+ missing="_unknown",
)
search.aggs.bucket(
- 'release_type',
- 'terms',
- field='release_type',
- missing='_unknown',
+ "release_type",
+ "terms",
+ field="release_type",
+ missing="_unknown",
)
search = search[:0]
@@ -489,27 +509,28 @@ def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None
container_stats = resp.aggregations.container_stats.buckets
preservation_bucket = agg_to_dict(resp.aggregations.preservation)
- preservation_bucket['total'] = _hits_total_int(resp.hits.total)
- for k in ('bright', 'dark', 'shadows_only', 'none'):
+ preservation_bucket["total"] = _hits_total_int(resp.hits.total)
+ for k in ("bright", "dark", "shadows_only", "none"):
if k not in preservation_bucket:
preservation_bucket[k] = 0
if merge_shadows:
- preservation_bucket['none'] += preservation_bucket['shadows_only']
- preservation_bucket['shadows_only'] = 0
+ preservation_bucket["none"] += preservation_bucket["shadows_only"]
+ preservation_bucket["shadows_only"] = 0
release_type_bucket = agg_to_dict(resp.aggregations.release_type)
stats = {
- 'ident': ident,
- 'issnl': issnl,
- 'total': _hits_total_int(resp.hits.total),
- 'in_web': container_stats['in_web']['doc_count'],
- 'in_kbart': container_stats['in_kbart']['doc_count'],
- 'is_preserved': container_stats['is_preserved']['doc_count'],
- 'preservation': preservation_bucket,
- 'release_type': release_type_bucket,
+ "ident": ident,
+ "issnl": issnl,
+ "total": _hits_total_int(resp.hits.total),
+ "in_web": container_stats["in_web"]["doc_count"],
+ "in_kbart": container_stats["in_kbart"]["doc_count"],
+ "is_preserved": container_stats["is_preserved"]["doc_count"],
+ "preservation": preservation_bucket,
+ "release_type": release_type_bucket,
}
return stats
+
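
[note] Illustrative return shape for get_elastic_container_stats(), matching
the keys assembled above (ident, issnl, and counts all made up):

    {
        "ident": "aaaaaaaaaaaaaaaaaaaaaaaaai",
        "issnl": "1234-5678",
        "total": 5000,
        "in_web": 4000,
        "in_kbart": 4500,
        "is_preserved": 4600,
        "preservation": {"total": 5000, "bright": 4000, "dark": 600, "shadows_only": 0, "none": 400},
        "release_type": {"article-journal": 4800, "_other": 200},
    }
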
def get_elastic_container_histogram_legacy(ident) -> List:
"""
Fetches a stacked histogram of {year, in_ia}. This is for the older style
@@ -522,48 +543,58 @@ def get_elastic_container_histogram_legacy(ident) -> List:
(year, in_ia, count)
"""
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search = search.query(
- 'bool',
+ "bool",
must=[
- Q("range", release_year={
- "gte": datetime.datetime.today().year - 499,
- "lte": datetime.datetime.today().year,
- }),
+ Q(
+ "range",
+ release_year={
+ "gte": datetime.datetime.today().year - 499,
+ "lte": datetime.datetime.today().year,
+ },
+ ),
],
filter=[
- Q("bool", minimum_should_match=1, should=[
- Q("match", container_id=ident),
- ]),
+ Q(
+ "bool",
+ minimum_should_match=1,
+ should=[
+ Q("match", container_id=ident),
+ ],
+ ),
],
)
search.aggs.bucket(
- 'year_in_ia',
- 'composite',
+ "year_in_ia",
+ "composite",
size=1000,
sources=[
- {"year": {
- "histogram": {
- "field": "release_year",
- "interval": 1,
- },
- }},
- {"in_ia": {
- "terms": {
- "field": "in_ia",
- },
- }},
+ {
+ "year": {
+ "histogram": {
+ "field": "release_year",
+ "interval": 1,
+ },
+ }
+ },
+ {
+ "in_ia": {
+ "terms": {
+ "field": "in_ia",
+ },
+ }
+ },
],
)
search = search[:0]
- search = search.params(request_cache='true')
+ search = search.params(request_cache="true")
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
buckets = resp.aggregations.year_in_ia.buckets
- vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count'])
- for h in buckets]
+ vals = [(int(h["key"]["year"]), h["key"]["in_ia"], h["doc_count"]) for h in buckets]
vals = sorted(vals)
return vals
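
[note] The composite aggregation flattens to sorted (year, in_ia, count)
tuples, per the docstring above, e.g. (counts illustrative):

    [(2018, False, 40), (2018, True, 123), (2019, False, 36), (2019, True, 190)]
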
@@ -580,7 +611,7 @@ def get_elastic_preservation_by_year(query) -> List[dict]:
{year (int), bright (int), dark (int), shadows_only (int), none (int)}
"""
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
if query.q not in [None, "*"]:
search = search.query(
"query_string",
@@ -607,41 +638,47 @@ def get_elastic_preservation_by_year(query) -> List[dict]:
)
search.aggs.bucket(
- 'year_preservation',
- 'composite',
+ "year_preservation",
+ "composite",
size=1500,
sources=[
- {"year": {
- "histogram": {
- "field": "release_year",
- "interval": 1,
- },
- }},
- {"preservation": {
- "terms": {
- "field": "preservation",
- },
- }},
+ {
+ "year": {
+ "histogram": {
+ "field": "release_year",
+ "interval": 1,
+ },
+ }
+ },
+ {
+ "preservation": {
+ "terms": {
+ "field": "preservation",
+ },
+ }
+ },
],
)
search = search[:0]
- search = search.params(request_cache='true')
+ search = search.params(request_cache="true")
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
buckets = resp.aggregations.year_preservation.buckets
- year_nums = set([int(h['key']['year']) for h in buckets])
+ year_nums = set([int(h["key"]["year"]) for h in buckets])
year_dicts = dict()
if year_nums:
- for num in range(min(year_nums), max(year_nums)+1):
+ for num in range(min(year_nums), max(year_nums) + 1):
year_dicts[num] = dict(year=num, bright=0, dark=0, shadows_only=0, none=0)
for row in buckets:
- year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count'])
- if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
+ year_dicts[int(row["key"]["year"])][row["key"]["preservation"]] = int(
+ row["doc_count"]
+ )
+ if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
for k in year_dicts.keys():
- year_dicts[k]['none'] += year_dicts[k]['shadows_only']
- year_dicts[k]['shadows_only'] = 0
- return sorted(year_dicts.values(), key=lambda x: x['year'])
+ year_dicts[k]["none"] += year_dicts[k]["shadows_only"]
+ year_dicts[k]["shadows_only"] = 0
+ return sorted(year_dicts.values(), key=lambda x: x["year"])
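
[note] Each row returned by get_elastic_preservation_by_year() has this shape
(counts illustrative); the by-date and by-volume variants below return the
same four preservation counts keyed by date/volume:

    {"year": 2019, "bright": 800, "dark": 100, "shadows_only": 0, "none": 50}
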
def get_elastic_preservation_by_date(query) -> List[dict]:
@@ -656,7 +693,7 @@ def get_elastic_preservation_by_date(query) -> List[dict]:
{date (str), bright (int), dark (int), shadows_only (int), none (int)}
"""
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
if query.q not in [None, "*"]:
search = search.query(
"query_string",
@@ -678,32 +715,37 @@ def get_elastic_preservation_by_date(query) -> List[dict]:
start_date = date_today - datetime.timedelta(days=60)
end_date = date_today + datetime.timedelta(days=1)
search = search.filter(
- "range", release_date=dict(
+ "range",
+ release_date=dict(
gte=str(start_date),
lte=str(end_date),
- )
+ ),
)
search.aggs.bucket(
- 'date_preservation',
- 'composite',
+ "date_preservation",
+ "composite",
size=1500,
sources=[
- {"date": {
- "histogram": {
- "field": "release_date",
- "interval": 1,
- },
- }},
- {"preservation": {
- "terms": {
- "field": "preservation",
- },
- }},
+ {
+ "date": {
+ "histogram": {
+ "field": "release_date",
+ "interval": 1,
+ },
+ }
+ },
+ {
+ "preservation": {
+ "terms": {
+ "field": "preservation",
+ },
+ }
+ },
],
)
search = search[:0]
- search = search.params(request_cache='true')
+ search = search.params(request_cache="true")
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
@@ -711,15 +753,18 @@ def get_elastic_preservation_by_date(query) -> List[dict]:
date_dicts = dict()
this_date = start_date
while this_date <= end_date:
- date_dicts[str(this_date)] = dict(date=str(this_date), bright=0, dark=0, shadows_only=0, none=0)
+ date_dicts[str(this_date)] = dict(
+ date=str(this_date), bright=0, dark=0, shadows_only=0, none=0
+ )
this_date = this_date + datetime.timedelta(days=1)
for row in buckets:
- date_dicts[row['key']['date'][0:10]][row['key']['preservation']] = int(row['doc_count'])
- if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
+ date_dicts[row["key"]["date"][0:10]][row["key"]["preservation"]] = int(row["doc_count"])
+ if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
for k in date_dicts.keys():
- date_dicts[k]['none'] += date_dicts[k]['shadows_only']
- date_dicts[k]['shadows_only'] = 0
- return sorted(date_dicts.values(), key=lambda x: x['date'])
+ date_dicts[k]["none"] += date_dicts[k]["shadows_only"]
+ date_dicts[k]["shadows_only"] = 0
+ return sorted(date_dicts.values(), key=lambda x: x["date"])
+
def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]:
"""
@@ -733,52 +778,64 @@ def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict
{volume (int), bright (int), dark (int), shadows_only (int), none (int)}
"""
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
search = search.query(
- 'bool',
+ "bool",
filter=[
- Q("bool", must=[
- Q("match", container_id=container_id),
- Q("exists", field="volume"),
- ]),
+ Q(
+ "bool",
+ must=[
+ Q("match", container_id=container_id),
+ Q("exists", field="volume"),
+ ],
+ ),
],
)
search.aggs.bucket(
- 'volume_preservation',
- 'composite',
+ "volume_preservation",
+ "composite",
size=1500,
sources=[
- {"volume": {
- "terms": {
- "field": "volume",
- },
- }},
- {"preservation": {
- "terms": {
- "field": "preservation",
- },
- }},
+ {
+ "volume": {
+ "terms": {
+ "field": "volume",
+ },
+ }
+ },
+ {
+ "preservation": {
+ "terms": {
+ "field": "preservation",
+ },
+ }
+ },
],
)
search = search[:0]
- search = search.params(request_cache='true')
+ search = search.params(request_cache="true")
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
buckets = resp.aggregations.volume_preservation.buckets
- volume_nums = set([int(h['key']['volume']) for h in buckets if h['key']['volume'].isdigit()])
+ volume_nums = set(
+ [int(h["key"]["volume"]) for h in buckets if h["key"]["volume"].isdigit()]
+ )
volume_dicts = dict()
if volume_nums:
- for num in range(min(volume_nums), max(volume_nums)+1):
+ for num in range(min(volume_nums), max(volume_nums) + 1):
volume_dicts[num] = dict(volume=num, bright=0, dark=0, shadows_only=0, none=0)
for row in buckets:
- if row['key']['volume'].isdigit():
- volume_dicts[int(row['key']['volume'])][row['key']['preservation']] = int(row['doc_count'])
- if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
+ if row["key"]["volume"].isdigit():
+ volume_dicts[int(row["key"]["volume"])][row["key"]["preservation"]] = int(
+ row["doc_count"]
+ )
+ if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
for k in volume_dicts.keys():
- volume_dicts[k]['none'] += volume_dicts[k]['shadows_only']
- volume_dicts[k]['shadows_only'] = 0
- return sorted(volume_dicts.values(), key=lambda x: x['volume'])
+ volume_dicts[k]["none"] += volume_dicts[k]["shadows_only"]
+ volume_dicts[k]["shadows_only"] = 0
+ return sorted(volume_dicts.values(), key=lambda x: x["volume"])
+
def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:
"""
@@ -789,7 +846,7 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:
{release_type (str), bright (int), dark (int), shadows_only (int), none (int), total (int)}
"""
- search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
if query.q not in [None, "*"]:
search = search.query(
"query_string",
@@ -804,11 +861,14 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:
)
if query.container_id:
search = search.query(
- 'bool',
+ "bool",
filter=[
- Q("bool", must=[
- Q("match", container_id=query.container_id),
- ]),
+ Q(
+ "bool",
+ must=[
+ Q("match", container_id=query.container_id),
+ ],
+ ),
],
)
if query.recent:
@@ -817,39 +877,45 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:
end_date = str(date_today + datetime.timedelta(days=1))
search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))
search.aggs.bucket(
- 'type_preservation',
- 'composite',
+ "type_preservation",
+ "composite",
size=1500,
sources=[
- {"release_type": {
- "terms": {
- "field": "release_type",
- },
- }},
- {"preservation": {
- "terms": {
- "field": "preservation",
- },
- }},
+ {
+ "release_type": {
+ "terms": {
+ "field": "release_type",
+ },
+ }
+ },
+ {
+ "preservation": {
+ "terms": {
+ "field": "preservation",
+ },
+ }
+ },
],
)
search = search[:0]
- search = search.params(request_cache='true')
+ search = search.params(request_cache="true")
search = search.params(track_total_hits=True)
resp = wrap_es_execution(search)
buckets = resp.aggregations.type_preservation.buckets
- type_set = set([h['key']['release_type'] for h in buckets])
+ type_set = set([h["key"]["release_type"] for h in buckets])
type_dicts = dict()
for k in type_set:
type_dicts[k] = dict(release_type=k, bright=0, dark=0, shadows_only=0, none=0, total=0)
for row in buckets:
- type_dicts[row['key']['release_type']][row['key']['preservation']] = int(row['doc_count'])
+ type_dicts[row["key"]["release_type"]][row["key"]["preservation"]] = int(
+ row["doc_count"]
+ )
for k in type_set:
- for p in ('bright', 'dark', 'shadows_only', 'none'):
- type_dicts[k]['total'] += type_dicts[k][p]
- if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
+ for p in ("bright", "dark", "shadows_only", "none"):
+ type_dicts[k]["total"] += type_dicts[k][p]
+ if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
for k in type_set:
- type_dicts[k]['none'] += type_dicts[k]['shadows_only']
- type_dicts[k]['shadows_only'] = 0
- return sorted(type_dicts.values(), key=lambda x: x['total'], reverse=True)
+ type_dicts[k]["none"] += type_dicts[k]["shadows_only"]
+ type_dicts[k]["shadows_only"] = 0
+ return sorted(type_dicts.values(), key=lambda x: x["total"], reverse=True)
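
[note] get_elastic_preservation_by_type() additionally sums the four
preservation counts into a per-row total and sorts descending by it (counts
illustrative):

    [
        {"release_type": "article-journal", "bright": 900, "dark": 80, "shadows_only": 0, "none": 20, "total": 1000},
        {"release_type": "dataset", "bright": 40, "dark": 10, "shadows_only": 0, "none": 5, "total": 55},
    ]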