From e28fa0da97f4edc070f665a9f5fd4f4036196a18 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 27 Jul 2020 20:08:51 -0700 Subject: search: 'recent' mode for coverage search --- python/fatcat_web/graphics.py | 26 ++++++- python/fatcat_web/routes.py | 15 +++-- python/fatcat_web/search.py | 86 +++++++++++++++++++++++- python/fatcat_web/templates/coverage_search.html | 14 ++++ 4 files changed, 134 insertions(+), 7 deletions(-) (limited to 'python/fatcat_web') diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py index 96c3531a..7d6e5702 100644 --- a/python/fatcat_web/graphics.py +++ b/python/fatcat_web/graphics.py @@ -61,6 +61,30 @@ def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph: chart.add('Bright', [y['bright'] for y in years]) return chart +def preservation_by_date_histogram(rows: List[Dict]) -> pygal.Graph: + """ + Note: this returns a raw pygal chart; it does not render it to SVG/PNG + """ + + dates = sorted(rows, key=lambda x: x['date']) + + CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen") + label_count = len(dates) + if len(dates) > 30: + label_count = 10 + chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle, + width=1000, height=500, x_labels_major_count=label_count, + show_minor_x_labels=False, x_label_rotation=20) + #chart.title = "Preservation by Date" + chart.x_title = "Date" + #chart.y_title = "Count" + chart.x_labels = [str(y['date']) for y in dates] + chart.add('None', [y['none'] for y in dates]) + chart.add('Shadow', [y['shadows_only'] for y in dates]) + chart.add('Dark', [y['dark'] for y in dates]) + chart.add('Bright', [y['bright'] for y in dates]) + return chart + def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph: """ Note: this returns a raw pygal chart; it does not render it to SVG/PNG @@ -75,7 +99,7 @@ def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph: chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle, 
width=1000, height=500, x_labels_major_count=label_count, show_minor_x_labels=False, x_label_rotation=20) - #chart.title = "Preservation by Year" + #chart.title = "Preservation by Volume" chart.x_title = "Volume" #chart.y_title = "Count" chart.x_labels = [str(y['volume']) for y in volumes] diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a741112f..20fe0e12 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -746,19 +746,24 @@ def coverage_search(): query = ReleaseQuery.from_args(request.args) coverage_stats = get_elastic_search_coverage(query) + year_histogram_svg = None + date_histogram_svg = None + coverage_type_preservation = None if coverage_stats['total'] > 1: - year_histogram = get_elastic_preservation_by_year(query) - year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri() coverage_type_preservation = get_elastic_preservation_by_type(query) - else: - year_histogram_svg = None - coverage_type_preservation = None + if query.recent: + date_histogram = get_elastic_preservation_by_date(query) + date_histogram_svg = preservation_by_date_histogram(date_histogram).render_data_uri() + else: + year_histogram = get_elastic_preservation_by_year(query) + year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri() return render_template( 'coverage_search.html', query=query, coverage_stats=coverage_stats, coverage_type_preservation=coverage_type_preservation, year_histogram_svg=year_histogram_svg, + date_histogram_svg=date_histogram_svg, ) def get_changelog_stats(): diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 3ba6fdb2..b0d27b2e 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -31,6 +31,7 @@ class ReleaseQuery: offset: Optional[int] = None fulltext_only: bool = False container_id: Optional[str] = None + recent: bool = False @classmethod def from_args(cls, args) -> 'ReleaseQuery': @@ -55,6 +56,7 @@ class 
ReleaseQuery: offset=offset, fulltext_only=bool(args.get('fulltext_only')), container_id=container_id, + recent=bool(args.get('recent')), ) @dataclass @@ -384,6 +386,11 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict: field='preservation', missing='_unknown', ) + if query.recent: + date_today = datetime.date.today() + start_date = str(date_today - datetime.timedelta(days=60)) + end_date = str(date_today + datetime.timedelta(days=1)) + search = search.filter("range", release_date=dict(gte=start_date, lte=end_date)) search = search[:0] @@ -550,7 +557,6 @@ def get_elastic_preservation_by_year(query) -> List[dict]: allow_leading_wildcard=False, lenient=True, fields=[ - "title^2", "biblio", ], ) @@ -598,6 +604,79 @@ def get_elastic_preservation_by_year(query) -> List[dict]: year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count']) return sorted(year_dicts.values(), key=lambda x: x['year']) + +def get_elastic_preservation_by_date(query) -> List[dict]: + """ + Fetches a stacked histogram of {date, preservation}. + + Preservation has 4 potential values; this function filters to the past 60 + days (through tomorrow), or about 250 values. 
+ + Returns a list of dicts, sorted by date, with keys/values like: + + {date (str), bright (int), dark (int), shadows_only (int), none (int)} + """ + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + if query.q not in [None, "*"]: + search = search.query( + "query_string", + query=query.q, + default_operator="AND", + analyze_wildcard=True, + allow_leading_wildcard=False, + lenient=True, + fields=[ + "biblio", + ], + ) + if query.container_id: + search = search.filter( + "term", + container_id=query.container_id, + ) + date_today = datetime.date.today() + start_date = date_today - datetime.timedelta(days=60) + end_date = date_today + datetime.timedelta(days=1) + search = search.filter( + "range", release_date=dict( + gte=str(start_date), + lte=str(end_date), + ) + ) + + search.aggs.bucket( + 'date_preservation', + 'composite', + size=1500, + sources=[ + {"date": { + "histogram": { + "field": "release_date", + "interval": 1, + }, + }}, + {"preservation": { + "terms": { + "field": "preservation", + }, + }}, + ], + ) + search = search[:0] + search = search.params(request_cache='true') + resp = wrap_es_execution(search) + + buckets = resp.aggregations.date_preservation.buckets + date_dicts = dict() + this_date = start_date + while this_date <= end_date: + date_dicts[str(this_date)] = dict(date=str(this_date), bright=0, dark=0, shadows_only=0, none=0) + this_date = this_date + datetime.timedelta(days=1) + for row in buckets: + date_dicts[row['key']['date'][0:10]][row['key']['preservation']] = int(row['doc_count']) + return sorted(date_dicts.values(), key=lambda x: x['date']) + def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]: """ Fetches a stacked histogram of {volume, preservation}. 
@@ -682,6 +761,11 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]: ]), ], ) + if query.recent: + date_today = datetime.date.today() + start_date = str(date_today - datetime.timedelta(days=60)) + end_date = str(date_today + datetime.timedelta(days=1)) + search = search.filter("range", release_date=dict(gte=start_date, lte=end_date)) search.aggs.bucket( 'type_preservation', 'composite', diff --git a/python/fatcat_web/templates/coverage_search.html b/python/fatcat_web/templates/coverage_search.html index c730ef9d..1e0f8327 100644 --- a/python/fatcat_web/templates/coverage_search.html +++ b/python/fatcat_web/templates/coverage_search.html @@ -20,6 +20,10 @@ +
+ + +

Can also search for releases (eg, individual papers) or containers (eg, journals). @@ -55,6 +59,16 @@ {% endif %} +{% if date_histogram_svg != None %} +

+

Perpetual Access Coverage by Date

+ +
+ +
+ +{% endif %} + {% if coverage_type_preservation != None %}

Perpetual Access Coverage by Release Type

-- cgit v1.2.3