diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-07 16:18:53 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-30 18:21:15 -0700 | 
| commit | bd3c6566fb9fdd5507782f19672fc62d0c551d05 (patch) | |
| tree | ce52e1f52300003ccad9bfc90156370310091b0f /python/fatcat_web | |
| parent | 46004ea6ca55613d6330899dfeb7afff6bfa2229 (diff) | |
| download | fatcat-bd3c6566fb9fdd5507782f19672fc62d0c551d05.tar.gz fatcat-bd3c6566fb9fdd5507782f19672fc62d0c551d05.zip | |
preservation coverage updates (first round)
- new by-year chart with stacked histograms of all 4 preservation
  statuses
- new-style single progress bar showing overall preservation status
- new by-volume query and chart
Old endpoints are left as-is; the intention is to keep them around as
"deprecated" for some time before removing them entirely.
Diffstat (limited to 'python/fatcat_web')
| -rw-r--r-- | python/fatcat_web/graphics.py | 52 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 64 | ||||
| -rw-r--r-- | python/fatcat_web/search.py | 182 | ||||
| -rw-r--r-- | python/fatcat_web/templates/container_view_coverage.html | 60 | 
4 files changed, 326 insertions, 32 deletions
diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py
index 5493d175..96c3531a 100644
--- a/python/fatcat_web/graphics.py
+++ b/python/fatcat_web/graphics.py
@@ -1,8 +1,10 @@
+from typing import List, Tuple, Dict
+
 import pygal
 from pygal.style import CleanStyle
 
 
-def ia_coverage_histogram(rows):
+def ia_coverage_histogram(rows: List[Tuple]) -> pygal.Graph:
     """
     Note: this returns a raw pygal chart; it does not render it to SVG/PNG
     """
@@ -34,3 +36,51 @@ def ia_coverage_histogram(rows):
     chart.add('via Fatcat', [y['available'] for y in years])
     chart.add('Missing', [y['missing'] for y in years])
     return chart
+
+def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph:
+    """
+    Note: this returns a raw pygal chart; it does not render it to SVG/PNG
+    """
+
+    years = sorted(rows, key=lambda x: x['year'])
+
+    CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen")
+    label_count = len(years)
+    if len(years) > 30:
+        label_count = 10
+    chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle,
+        width=1000, height=500, x_labels_major_count=label_count,
+        show_minor_x_labels=False, x_label_rotation=20)
+    #chart.title = "Preservation by Year"
+    chart.x_title = "Year"
+    #chart.y_title = "Count"
+    chart.x_labels = [str(y['year']) for y in years]
+    chart.add('None', [y['none'] for y in years])
+    chart.add('Shadow', [y['shadows_only'] for y in years])
+    chart.add('Dark', [y['dark'] for y in years])
+    chart.add('Bright', [y['bright'] for y in years])
+    return chart
+
+def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:
+    """
+    Note: this returns a raw pygal chart; it does not render it to SVG/PNG
+    """
+
+    volumes = sorted(rows, key=lambda x: x['volume'])
+
+    CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen")
+    label_count = len(volumes)
+    if len(volumes) > 30:
+        label_count = 10
+    chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle,
+        width=1000, height=500, x_labels_major_count=label_count,
+        show_minor_x_labels=False, x_label_rotation=20)
+    #chart.title = "Preservation by Volume"
+    chart.x_title = "Volume"
+    #chart.y_title = "Count"
+    chart.x_labels = [str(y['volume']) for y in volumes]
+    chart.add('None', [y['none'] for y in volumes])
+    chart.add('Shadow', [y['shadows_only'] for y in volumes])
+    chart.add('Dark', [y['dark'] for y in volumes])
+    chart.add('Bright', [y['bright'] for y in volumes])
+    return chart
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 6f3ec21b..34cf2d50 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -14,7 +14,7 @@ from fatcat_tools.normal import *
 from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config
 from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth
 from fatcat_web.cors import crossdomain
-from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram, FatcatSearchError
+from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram_legacy, get_elastic_container_preservation_by_year, get_elastic_container_preservation_by_volume, get_elastic_container_preservation_by_type, FatcatSearchError
 from fatcat_web.entity_helpers import *
 from fatcat_web.graphics import *
 from fatcat_web.kafka import *
@@ -765,7 +765,7 @@ def container_issnl_stats(issnl):
         raise ae
     try:
         stats = get_elastic_container_stats(container.ident, issnl=container.issnl)
-    except Exception as ae:
+    except (ValueError, IOError) as ae:
         app.log.error(ae)
         abort(503)
     return jsonify(stats)
@@ -792,7 +792,7 @@ def container_ident_ia_coverage_years_json(ident):
     except ApiException as ae:
         abort(ae.status)
     try:
-        histogram = get_elastic_container_histogram(container.ident)
+        histogram = get_elastic_container_histogram_legacy(container.ident)
     except Exception as ae:
         app.log.error(ae)
         abort(503)
@@ -807,12 +807,68 @@ def container_ident_ia_coverage_years_svg(ident):
     except ApiException as ae:
         abort(ae.status)
     try:
-        histogram = get_elastic_container_histogram(container.ident)
+        histogram = get_elastic_container_histogram_legacy(container.ident)
     except Exception as ae:
         app.log.error(ae)
         abort(503)
     return ia_coverage_histogram(histogram).render_response()
 
+@app.route('/container/<ident>/preservation_by_year.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_year_json(ident):
+    try:
+        container = api.get_container(ident)
+    except ApiException as ae:
+        abort(ae.status)
+    try:
+        histogram = get_elastic_container_preservation_by_year(container.ident)
+    except Exception as ae:
+        app.log.error(ae)
+        abort(503)
+    return jsonify({'container_id': ident, "histogram": histogram})
+
+@app.route('/container/<ident>/preservation_by_year.svg', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_year_svg(ident):
+    try:
+        container = api.get_container(ident)
+    except ApiException as ae:
+        abort(ae.status)
+    try:
+        histogram = get_elastic_container_preservation_by_year(container.ident)
+    except Exception as ae:
+        app.log.error(ae)
+        abort(503)
+    return preservation_by_year_histogram(histogram).render_response()
+
+@app.route('/container/<ident>/preservation_by_volume.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_volume_json(ident):
+    try:
+        container = api.get_container(ident)
+    except ApiException as ae:
+        abort(ae.status)
+    try:
+        histogram = get_elastic_container_preservation_by_volume(container.ident)
+    except Exception as ae:
+        app.log.error(ae)
+        abort(503)
+    return jsonify({'container_id': ident, "histogram": histogram})
+
+@app.route('/container/<ident>/preservation_by_volume.svg', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_volume_svg(ident):
+    try:
+        container = api.get_container(ident)
+    except ApiException as ae:
+        abort(ae.status)
+    try:
+        histogram = get_elastic_container_preservation_by_volume(container.ident)
+    except Exception as ae:
+        app.log.error(ae)
+        abort(503)
+    return preservation_by_volume_histogram(histogram).render_response()
+
 @app.route('/release/<ident>.bib', methods=['GET'])
 def release_bibtex(ident):
     try:
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index f60860c9..9703a434 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -259,7 +259,7 @@ def do_release_search(
         results=results,
     )
 
-def get_elastic_container_random_releases(ident, limit=5):
+def get_elastic_container_random_releases(ident: str, limit=5) -> dict:
     """
     Returns a list of releases from the container.
     """
@@ -283,7 +283,7 @@ def get_elastic_container_random_releases(ident, limit=5):
 
     return results
 
-def get_elastic_entity_stats():
+def get_elastic_entity_stats() -> dict:
     """
     TODO: files, filesets, webcaptures (no schema yet)
 
@@ -417,6 +417,9 @@ def get_elastic_container_stats(ident, issnl=None):
 
     container_stats = resp.aggregations.container_stats.buckets
     preservation_bucket = agg_to_dict(resp.aggregations.preservation)
+    for k in ('bright', 'dark', 'shadows_only', 'none'):
+        if k not in preservation_bucket:
+            preservation_bucket[k] = 0
     release_type_bucket = agg_to_dict(resp.aggregations.release_type)
     stats = {
         'ident': ident,
@@ -431,9 +434,11 @@ def get_elastic_container_stats(ident, issnl=None):
 
     return stats
 
-def get_elastic_container_histogram(ident):
+def get_elastic_container_histogram_legacy(ident) -> List:
     """
-    Fetches a stacked histogram
+    Fetches a stacked histogram of {year, in_ia}. This is for the older style
+    of coverage graph (SVG or JSON export). This function should be DEPRECATED
+    to be removed in the near future.
 
     Filters to the past 500 years (at most), or about 1000 values.
 
@@ -480,7 +485,174 @@ def get_elastic_container_histogram(ident):
     resp = wrap_es_execution(search)
 
     buckets = resp.aggregations.year_in_ia.buckets
-    vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count'])
+    vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count'])
             for h in buckets]
     vals = sorted(vals)
     return vals
+
+
+def get_elastic_container_preservation_by_year(container_id: str) -> List[dict]:
+    """
+    Fetches a stacked histogram of {year, preservation}.
+
+    Preservation has 4 potential values; this function filters to the past 250
+    years (at most), or about 1000 values.
+
+    Returns a list of dicts, sorted by year, with keys/values like:
+
+        {year (int), bright (int), dark (int), shadows_only (int), none (int)}
+    """
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        must=[
+            Q("range", release_year={
+                "gte": datetime.datetime.today().year - 249,
+                "lte": datetime.datetime.today().year,
+            }),
+        ],
+        filter=[
+            Q("bool", minimum_should_match=1, should=[
+                Q("match", container_id=container_id),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'year_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"year": {
+                "histogram": {
+                    "field": "release_year",
+                    "interval": 1,
+                },
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.year_preservation.buckets
+    year_nums = set([int(h['key']['year']) for h in buckets])
+    year_dicts = dict()
+    for num in (range(min(year_nums), max(year_nums)+1) if year_nums else ()):  # no buckets: empty result, not ValueError
+        year_dicts[num] = dict(year=num, bright=0, dark=0, shadows_only=0, none=0)
+    for row in buckets:
+        year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count'])
+    return sorted(year_dicts.values(), key=lambda x: x['year'])
+
+def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]:
+    """
+    Fetches a stacked histogram of {volume, preservation}.
+
+    Currently only includes volume numbers which are simple integers (all chars
+    are digits).
+
+    Returns a list of dicts, sorted by volume, with keys/values like:
+
+        {volume (int), bright (int), dark (int), shadows_only (int), none (int)}
+    """
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        filter=[
+            Q("bool", must=[
+                Q("match", container_id=container_id),
+                Q("exists", field="volume"),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'volume_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"volume": {
+                "terms": {
+                    "field": "volume",
+                },
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.volume_preservation.buckets
+    volume_nums = set([int(h['key']['volume']) for h in buckets if h['key']['volume'].isdigit()])
+    volume_dicts = dict()
+    for num in (range(min(volume_nums), max(volume_nums)+1) if volume_nums else ()):  # no integer volumes: empty result
+        volume_dicts[num] = dict(volume=num, bright=0, dark=0, shadows_only=0, none=0)
+    for row in buckets:
+        if row['key']['volume'].isdigit():
+            volume_dicts[int(row['key']['volume'])][row['key']['preservation']] = int(row['doc_count'])
+    return sorted(volume_dicts.values(), key=lambda x: x['volume'])
+
+def get_elastic_container_preservation_by_type(container_id: str) -> List[dict]:
+    """
+    Fetches preservation coverage by release type
+
+    Returns a list of dicts, sorted by total count, with keys/values like:
+
+        {release_type (str), bright (int), dark (int), shadows_only (int), none (int), total (int)}
+    """
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        filter=[
+            Q("bool", must=[
+                Q("match", container_id=container_id),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'type_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"release_type": {
+                "terms": {
+                    "field": "release_type",
+                },
+                "missing": "_unknown",
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.type_preservation.buckets
+    type_set = set([h['key']['release_type'] for h in buckets])
+    type_dicts = dict()
+    for k in type_set:
+        type_dicts[k] = dict(release_type=k, bright=0, dark=0, shadows_only=0, none=0, total=0)
+    for row in buckets:
+        type_dicts[row['key']['release_type']][row['key']['preservation']] = int(row['doc_count'])
+    for k in type_set:
+        for p in ('bright', 'dark', 'shadows_only', 'none'):
+            type_dicts[k]['total'] += type_dicts[k][p]
+    return sorted(type_dicts.values(), key=lambda x: x['total'])
diff --git a/python/fatcat_web/templates/container_view_coverage.html b/python/fatcat_web/templates/container_view_coverage.html
index ffd1a447..fc643f81 100644
--- a/python/fatcat_web/templates/container_view_coverage.html
+++ b/python/fatcat_web/templates/container_view_coverage.html
@@ -19,34 +19,50 @@
         {% set frac_preserved = container._stats.is_preserved/container._stats.total %}
         {% set frac_web = container._stats.in_web/container._stats.total %}
 
-        <div class="ui large {{ entity_macros.progress_color(frac_web) }} progress" style="margin-bottom: 0.1em;">
-          <div class="bar" style="width: {{ (frac_web*100)|int }}%;">
-            <div class="progress">{{ (frac_web*100)|int }}%</div>
-          </div>
-          <div class="label">
-            {{ "{:,}".format(container._stats.in_web) }} preserved and available (bright)
-          </div>
-        </div>
+        {% set pstats = container._stats.preservation %}
+        {% set frac_bright = container._stats.preservation.bright/container._stats.total %}
+        {% set frac_dark = container._stats.preservation.dark/container._stats.total %}
+        {% set frac_shadows_only = container._stats.preservation.shadows_only/container._stats.total %}
+        {% set frac_none = container._stats.preservation.none/container._stats.total %}
 
-        <br>
-        <div class="ui large {{ entity_macros.progress_color(frac_preserved) }} progress" style="margin-bottom: 0.1em;">
-          <div class="bar" style="width: {{ (frac_preserved*100)|int }}%;">
-            <div class="progress">{{ (frac_preserved*100)|int }}%</div>
+        <div class="ui large multiple progress" data-percent="0,0,0,0" style="margin-bottom: 0.1em;">
+          <div class="green bar" style="border-radius: 0; min-width: 0; width: {{ (frac_bright*100)|round(method='ceil') }}%;" title="bright">
+            <div class="progress">{# {{ (frac_bright*100)|int }}% #}</div>
           </div>
-          <div class="label">
-            {{ "{:,}".format(container._stats.is_preserved) }} preserved at all (bright or dark)
+          <div class="green bar" style="border-radius: 0; min-width: 0; width: {{ (frac_dark*100)|round(method='ceil') }}%; background-color: darkgreen;">
+            <div class="progress">{# {{ (frac_dark*100)|int }}% #}</div>
           </div>
-        </div>
-
-        <br>
-        <div class="ui large {{ entity_macros.progress_color(frac_kbart) }} progress" style="margin-bottom: 0.1em; margin-top: 1em;">
-          <div class="bar" style="width: {{ (frac_kbart*100)|int }}%;">
-            <div class="progress">{{ (frac_kbart*100)|int }}%</div>
+          <div class="red bar" style="border-radius: 0; min-width: 0; width: {{ (frac_shadows_only*100)|round(method='ceil') }}%; background-color: darkred;">
+            <div class="progress">{# {{ (frac_shadows_only*100)|int }}% #}</div>
           </div>
-          <div class="label">
-            {{ "{:,}".format(container._stats.in_kbart ) }} preserved by Keeper (dark)
+          <div class="red bar" style="border-radius: 0; min-width: 0; width: {{ (frac_none*100)|round(method='ceil') }}%;">
+            <div class="progress">{# {{ (frac_none*100)|int }}% #}</div>
           </div>
         </div>
+        <table class="ui very basic very compact collapsing table" style="font-weight: bold; margin-left: 1em;">
+          <tbody>
+            <tr>
+              <td style="background-color: green;">
+              <td class="right aligned" >{{ "{:,}".format(pstats.bright) }}
+              <td class="right aligned" >{{ (frac_bright*100)|round(2,method='ceil') }}%
+              <td>preserved and publicly available (bright)
+            <tr>
+              <td style="background-color: darkgreen;">
+              <td class="right aligned" >{{ "{:,}".format(pstats.dark) }}
+              <td class="right aligned" >{{ (frac_dark*100)|round(2,method='ceil') }}%
+              <td>preserved but not publicly accessible (dark)
+            <tr>
+              <td style="background-color: darkred;">
+              <td class="right aligned" >{{ "{:,}".format(pstats.shadows_only) }}
+              <td class="right aligned" >{{ (frac_shadows_only*100)|round(2,method='ceil') }}%
+              <td>only independently preserved in "shadow" libraries
+            <tr>
+              <td style="background-color: red;">
+              <td class="right aligned" >{{ "{:,}".format(pstats.none) }}
+              <td class="right aligned" >{{ (frac_none*100)|round(2,method='ceil') }}%
+              <td>no known independent preservation
+          </tbody>
+        </table>
       {% endif %}
     </div>
 
