From bd3c6566fb9fdd5507782f19672fc62d0c551d05 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 7 Jul 2020 16:18:53 -0700 Subject: preservation coverage updates (first round) - new by-year chart with stacked histograms of all 4 preservation statuses - new-style single progress bar showing overall preservation status - new by-volume query and chart Old endpoints are left as-is, with the intention of having them "deprecated" for some time span until entirely removing them. --- python/fatcat_web/graphics.py | 52 +++++- python/fatcat_web/routes.py | 64 +++++++- python/fatcat_web/search.py | 182 ++++++++++++++++++++- .../templates/container_view_coverage.html | 60 ++++--- 4 files changed, 326 insertions(+), 32 deletions(-) (limited to 'python') diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py index 5493d175..96c3531a 100644 --- a/python/fatcat_web/graphics.py +++ b/python/fatcat_web/graphics.py @@ -1,8 +1,10 @@ +from typing import List, Tuple, Dict + import pygal from pygal.style import CleanStyle -def ia_coverage_histogram(rows): +def ia_coverage_histogram(rows: List[Tuple]) -> pygal.Graph: """ Note: this returns a raw pygal chart; it does not render it to SVG/PNG """ @@ -34,3 +36,51 @@ def ia_coverage_histogram(rows): chart.add('via Fatcat', [y['available'] for y in years]) chart.add('Missing', [y['missing'] for y in years]) return chart + +def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph: + """ + Note: this returns a raw pygal chart; it does not render it to SVG/PNG + """ + + years = sorted(rows, key=lambda x: x['year']) + + CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen") + label_count = len(years) + if len(years) > 30: + label_count = 10 + chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle, + width=1000, height=500, x_labels_major_count=label_count, + show_minor_x_labels=False, x_label_rotation=20) + #chart.title = "Preservation by Year" + chart.x_title = "Year" + 
#chart.y_title = "Count" + chart.x_labels = [str(y['year']) for y in years] + chart.add('None', [y['none'] for y in years]) + chart.add('Shadow', [y['shadows_only'] for y in years]) + chart.add('Dark', [y['dark'] for y in years]) + chart.add('Bright', [y['bright'] for y in years]) + return chart + +def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph: + """ + Note: this returns a raw pygal chart; it does not render it to SVG/PNG + """ + + volumes = sorted(rows, key=lambda x: x['volume']) + + CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen") + label_count = len(volumes) + if len(volumes) >= 30: + label_count = 10 + chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle, + width=1000, height=500, x_labels_major_count=label_count, + show_minor_x_labels=False, x_label_rotation=20) + #chart.title = "Preservation by Year" + chart.x_title = "Volume" + #chart.y_title = "Count" + chart.x_labels = [str(y['volume']) for y in volumes] + chart.add('None', [y['none'] for y in volumes]) + chart.add('Shadow', [y['shadows_only'] for y in volumes]) + chart.add('Dark', [y['dark'] for y in volumes]) + chart.add('Bright', [y['bright'] for y in volumes]) + return chart diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 6f3ec21b..34cf2d50 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -14,7 +14,7 @@ from fatcat_tools.normal import * from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth from fatcat_web.cors import crossdomain -from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram, FatcatSearchError +from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, 
get_elastic_container_stats, get_elastic_container_histogram_legacy, get_elastic_container_preservation_by_year, get_elastic_container_preservation_by_volume, get_elastic_container_preservation_by_type, FatcatSearchError from fatcat_web.entity_helpers import * from fatcat_web.graphics import * from fatcat_web.kafka import * @@ -765,7 +765,7 @@ def container_issnl_stats(issnl): raise ae try: stats = get_elastic_container_stats(container.ident, issnl=container.issnl) - except Exception as ae: + except (ValueError, IOError) as ae: app.log.error(ae) abort(503) return jsonify(stats) @@ -792,7 +792,7 @@ def container_ident_ia_coverage_years_json(ident): except ApiException as ae: abort(ae.status) try: - histogram = get_elastic_container_histogram(container.ident) + histogram = get_elastic_container_histogram_legacy(container.ident) except Exception as ae: app.log.error(ae) abort(503) @@ -807,12 +807,68 @@ def container_ident_ia_coverage_years_svg(ident): except ApiException as ae: abort(ae.status) try: - histogram = get_elastic_container_histogram(container.ident) + histogram = get_elastic_container_histogram_legacy(container.ident) except Exception as ae: app.log.error(ae) abort(503) return ia_coverage_histogram(histogram).render_response() +@app.route('/container//preservation_by_year.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_ident_preservation_by_year_json(ident): + try: + container = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + try: + histogram = get_elastic_container_preservation_by_year(container.ident) + except Exception as ae: + app.log.error(ae) + abort(503) + return jsonify({'container_id': ident, "histogram": histogram}) + +@app.route('/container//preservation_by_year.svg', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_ident_preservation_by_year_svg(ident): + try: + 
container = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + try: + histogram = get_elastic_container_preservation_by_year(container.ident) + except Exception as ae: + app.log.error(ae) + abort(503) + return preservation_by_year_histogram(histogram).render_response() + +@app.route('/container//preservation_by_volume.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_ident_preservation_by_volume_json(ident): + try: + container = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + try: + histogram = get_elastic_container_preservation_by_volume(container.ident) + except Exception as ae: + app.log.error(ae) + abort(503) + return jsonify({'container_id': ident, "histogram": histogram}) + +@app.route('/container//preservation_by_volume.svg', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_ident_preservation_by_volume_svg(ident): + try: + container = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + try: + histogram = get_elastic_container_preservation_by_volume(container.ident) + except Exception as ae: + app.log.error(ae) + abort(503) + return preservation_by_volume_histogram(histogram).render_response() + @app.route('/release/.bib', methods=['GET']) def release_bibtex(ident): try: diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index f60860c9..9703a434 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -259,7 +259,7 @@ def do_release_search( results=results, ) -def get_elastic_container_random_releases(ident, limit=5): +def get_elastic_container_random_releases(ident: str, limit=5) -> dict: """ Returns a list of releases from the container. 
""" @@ -283,7 +283,7 @@ def get_elastic_container_random_releases(ident, limit=5): return results -def get_elastic_entity_stats(): +def get_elastic_entity_stats() -> dict: """ TODO: files, filesets, webcaptures (no schema yet) @@ -417,6 +417,9 @@ def get_elastic_container_stats(ident, issnl=None): container_stats = resp.aggregations.container_stats.buckets preservation_bucket = agg_to_dict(resp.aggregations.preservation) + for k in ('bright', 'dark', 'shadows_only', 'none'): + if not k in preservation_bucket: + preservation_bucket[k] = 0 release_type_bucket = agg_to_dict(resp.aggregations.release_type) stats = { 'ident': ident, @@ -431,9 +434,11 @@ def get_elastic_container_stats(ident, issnl=None): return stats -def get_elastic_container_histogram(ident): +def get_elastic_container_histogram_legacy(ident) -> List: """ - Fetches a stacked histogram + Fetches a stacked histogram of {year, in_ia}. This is for the older style + of coverage graph (SVG or JSON export). This function should be DEPRECATED + to be removed in the near future. Filters to the past 500 years (at most), or about 1000 values. @@ -480,7 +485,174 @@ def get_elastic_container_histogram(ident): resp = wrap_es_execution(search) buckets = resp.aggregations.year_in_ia.buckets - vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count']) + vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count']) for h in buckets] vals = sorted(vals) return vals + + +def get_elastic_container_preservation_by_year(container_id: str) -> List[dict]: + """ + Fetches a stacked histogram of {year, preservation}. + + Preservation has 4 potential values; this function filters to the past 250 + years (at most), or about 1000 values. 
+ + Returns a list of dicts, sorted by year, with keys/values like: + + {year (int), bright (int), dark (int), shadows_only (int), none (int)} + """ + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache='true') + search = search.query( + 'bool', + must=[ + Q("range", release_year={ + "gte": datetime.datetime.today().year - 249, + "lte": datetime.datetime.today().year, + }), + ], + filter=[ + Q("bool", minimum_should_match=1, should=[ + Q("match", container_id=container_id), + ]), + ], + ) + search.aggs.bucket( + 'year_preservation', + 'composite', + size=1500, + sources=[ + {"year": { + "histogram": { + "field": "release_year", + "interval": 1, + }, + }}, + {"preservation": { + "terms": { + "field": "preservation", + }, + }}, + ], + ) + search = search[:0] + + resp = wrap_es_execution(search) + + buckets = resp.aggregations.year_preservation.buckets + year_nums = set([int(h['key']['year']) for h in buckets]) + year_dicts = dict() + for num in range(min(year_nums), max(year_nums)+1): + year_dicts[num] = dict(year=num, bright=0, dark=0, shadows_only=0, none=0) + for row in buckets: + year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count']) + return sorted(year_dicts.values(), key=lambda x: x['year']) + +def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]: + """ + Fetches a stacked histogram of {volume, preservation}. + + Currently only includes volume numbers which are simple integers (all chars + are digits). 
+ + Returns a list of dicts, sorted by volume, with keys/values like: + + {volume (int), bright (int), dark (int), shadows_only (int), none (int)} + """ + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache='true') + search = search.query( + 'bool', + filter=[ + Q("bool", must=[ + Q("match", container_id=container_id), + Q("exists", field="volume"), + ]), + ], + ) + search.aggs.bucket( + 'volume_preservation', + 'composite', + size=1500, + sources=[ + {"volume": { + "terms": { + "field": "volume", + }, + }}, + {"preservation": { + "terms": { + "field": "preservation", + }, + }}, + ], + ) + search = search[:0] + + resp = wrap_es_execution(search) + + buckets = resp.aggregations.volume_preservation.buckets + volume_nums = set([int(h['key']['volume']) for h in buckets if h['key']['volume'].isdigit()]) + volume_dicts = dict() + for num in range(min(volume_nums), max(volume_nums)+1): + volume_dicts[num] = dict(volume=num, bright=0, dark=0, shadows_only=0, none=0) + for row in buckets: + if row['key']['volume'].isdigit(): + volume_dicts[int(row['key']['volume'])][row['key']['preservation']] = int(row['doc_count']) + return sorted(volume_dicts.values(), key=lambda x: x['volume']) + +def get_elastic_container_preservation_by_type(container_id: str) -> List[dict]: + """ + Fetches preservation coverage by release type + + Returns a list of dicts, sorted by total count, with keys/values like: + + {release_type (str), bright (int), dark (int), shadows_only (int), none (int), total (int)} + """ + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache='true') + search = search.query( + 'bool', + filter=[ + Q("bool", must=[ + Q("match", container_id=container_id), + ]), + ], + ) + search.aggs.bucket( + 'type_preservation', + 'composite', + size=1500, + sources=[ + {"release_type": { + "terms": { + "field": "release_type", + }, + "missing": "_unknown", + }},
+ {"preservation": { + "terms": { + "field": "preservation", + }, + }}, + ], + ) + search = search[:0] + + resp = wrap_es_execution(search) + + buckets = resp.aggregations.type_preservation.buckets + type_set = set([h['key']['release_type'] for h in buckets]) + type_dicts = dict() + for k in type_set: + type_dicts[k] = dict(release_type=k, bright=0, dark=0, shadows_only=0, none=0, total=0) + for row in buckets: + type_dicts[row['key']['release_type']][row['key']['preservation']] = int(row['doc_count']) + for k in type_set: + for p in ('bright', 'dark', 'shadows_only', 'none'): + type_dicts[k]['total'] += type_dicts[k][p] + return sorted(type_dicts.values(), key=lambda x: x['total']) diff --git a/python/fatcat_web/templates/container_view_coverage.html b/python/fatcat_web/templates/container_view_coverage.html index ffd1a447..fc643f81 100644 --- a/python/fatcat_web/templates/container_view_coverage.html +++ b/python/fatcat_web/templates/container_view_coverage.html @@ -19,34 +19,50 @@ {% set frac_preserved = container._stats.is_preserved/container._stats.total %} {% set frac_web = container._stats.in_web/container._stats.total %} -
-
-
{{ (frac_web*100)|int }}%
-
-
- {{ "{:,}".format(container._stats.in_web) }} preserved and available (bright) -
-
+ {% set pstats = container._stats.preservation %} + {% set frac_bright = container._stats.preservation.bright/container._stats.total %} + {% set frac_dark = container._stats.preservation.dark/container._stats.total %} + {% set frac_shadows_only = container._stats.preservation.shadows_only/container._stats.total %} + {% set frac_none = container._stats.preservation.none/container._stats.total %} -
-
-
-
{{ (frac_preserved*100)|int }}%
+
+
+
{# {{ (frac_bright*100)|int }}% #}
-
- {{ "{:,}".format(container._stats.is_preserved) }} preserved at all (bright or dark) +
+
{# {{ (frac_dark*100)|int }}% #}
-
- -
-
-
-
{{ (frac_kbart*100)|int }}%
+
+
{# {{ (frac_shadows_only*100)|int }}% #}
-
- {{ "{:,}".format(container._stats.in_kbart ) }} preserved by Keeper (dark) +
+
{# {{ (frac_none*100)|int }}% #}
+ + + + + + + +
+ {{ "{:,}".format(pstats.bright) }} + {{ (frac_bright*100)|round(2,method='ceil') }}% + preserved and publicly available (bright) +
+ {{ "{:,}".format(pstats.dark) }} + {{ (frac_dark*100)|round(2,method='ceil') }}% + preserved but not publicly accessible (dark) +
+ {{ "{:,}".format(pstats.shadows_only) }} + {{ (frac_shadows_only*100)|round(2,method='ceil') }}% + only independently preserved in "shadow" libraries +
+ {{ "{:,}".format(pstats.none) }} + {{ (frac_none*100)|round(2,method='ceil') }}% + no known independent preservation +
{% endif %}
-- cgit v1.2.3