aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-07-07 16:18:53 -0700
committerBryan Newbold <bnewbold@robocracy.org>2020-07-30 18:21:15 -0700
commitbd3c6566fb9fdd5507782f19672fc62d0c551d05 (patch)
treece52e1f52300003ccad9bfc90156370310091b0f
parent46004ea6ca55613d6330899dfeb7afff6bfa2229 (diff)
downloadfatcat-bd3c6566fb9fdd5507782f19672fc62d0c551d05.tar.gz
fatcat-bd3c6566fb9fdd5507782f19672fc62d0c551d05.zip
preservation coverage updates (first round)
- new by-year chart with stacked histograms of all 4 preservation statuses; - new-style single progress bar showing overall preservation status; - new by-volume query and chart. Old endpoints are left as-is; the intention is to keep them available as "deprecated" for some time before removing them entirely.
-rw-r--r--python/fatcat_web/graphics.py52
-rw-r--r--python/fatcat_web/routes.py64
-rw-r--r--python/fatcat_web/search.py182
-rw-r--r--python/fatcat_web/templates/container_view_coverage.html60
4 files changed, 326 insertions, 32 deletions
diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py
index 5493d175..96c3531a 100644
--- a/python/fatcat_web/graphics.py
+++ b/python/fatcat_web/graphics.py
@@ -1,8 +1,10 @@
+from typing import List, Tuple, Dict
+
import pygal
from pygal.style import CleanStyle
-def ia_coverage_histogram(rows):
+def ia_coverage_histogram(rows: List[Tuple]) -> pygal.Graph:
"""
Note: this returns a raw pygal chart; it does not render it to SVG/PNG
"""
@@ -34,3 +36,51 @@ def ia_coverage_histogram(rows):
chart.add('via Fatcat', [y['available'] for y in years])
chart.add('Missing', [y['missing'] for y in years])
return chart
+
+def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph:
+    """
+    Builds a stacked bar chart of preservation status counts per release year.
+
+    rows: list of dicts, each with the keys 'year', 'none', 'shadows_only',
+    'dark' and 'bright'. Rows are sorted by 'year' here, so the caller's
+    ordering does not matter.
+
+    Note: this returns a raw pygal chart; it does not render it to SVG/PNG
+    """
+
+    years = sorted(rows, key=lambda x: x['year'])
+
+    # Use a per-chart style instance instead of assigning to CleanStyle.colors:
+    # that is a class attribute shared by every chart in the process, so
+    # mutating it would leak this palette into unrelated charts.
+    # Color order matches the series order below: None, Shadow, Dark, Bright.
+    style = CleanStyle(colors=("red", "darkred", "darkolivegreen", "limegreen"))
+
+    # With many bars, only label a handful of them to keep the axis readable
+    label_count = len(years)
+    if len(years) > 30:
+        label_count = 10
+    chart = pygal.StackedBar(dynamic_print_values=True, style=style,
+        width=1000, height=500, x_labels_major_count=label_count,
+        show_minor_x_labels=False, x_label_rotation=20)
+    chart.x_title = "Year"
+    chart.x_labels = [str(y['year']) for y in years]
+    chart.add('None', [y['none'] for y in years])
+    chart.add('Shadow', [y['shadows_only'] for y in years])
+    chart.add('Dark', [y['dark'] for y in years])
+    chart.add('Bright', [y['bright'] for y in years])
+    return chart
+
+def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:
+    """
+    Builds a stacked bar chart of preservation status counts per volume.
+
+    rows: list of dicts, each with the keys 'volume', 'none', 'shadows_only',
+    'dark' and 'bright'. Rows are sorted by 'volume' here, so the caller's
+    ordering does not matter.
+
+    Note: this returns a raw pygal chart; it does not render it to SVG/PNG
+    """
+
+    volumes = sorted(rows, key=lambda x: x['volume'])
+
+    # Use a per-chart style instance instead of assigning to CleanStyle.colors:
+    # that is a class attribute shared by every chart in the process, so
+    # mutating it would leak this palette into unrelated charts.
+    # Color order matches the series order below: None, Shadow, Dark, Bright.
+    style = CleanStyle(colors=("red", "darkred", "darkolivegreen", "limegreen"))
+
+    # With many bars, only label a handful of them to keep the axis readable.
+    # Same threshold (more than 30 bars) as the by-year chart.
+    label_count = len(volumes)
+    if len(volumes) > 30:
+        label_count = 10
+    chart = pygal.StackedBar(dynamic_print_values=True, style=style,
+        width=1000, height=500, x_labels_major_count=label_count,
+        show_minor_x_labels=False, x_label_rotation=20)
+    chart.x_title = "Volume"
+    chart.x_labels = [str(v['volume']) for v in volumes]
+    chart.add('None', [v['none'] for v in volumes])
+    chart.add('Shadow', [v['shadows_only'] for v in volumes])
+    chart.add('Dark', [v['dark'] for v in volumes])
+    chart.add('Bright', [v['bright'] for v in volumes])
+    return chart
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 6f3ec21b..34cf2d50 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -14,7 +14,7 @@ from fatcat_tools.normal import *
from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config
from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth
from fatcat_web.cors import crossdomain
-from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram, FatcatSearchError
+from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram_legacy, get_elastic_container_preservation_by_year, get_elastic_container_preservation_by_volume, get_elastic_container_preservation_by_type, FatcatSearchError
from fatcat_web.entity_helpers import *
from fatcat_web.graphics import *
from fatcat_web.kafka import *
@@ -765,7 +765,7 @@ def container_issnl_stats(issnl):
raise ae
try:
stats = get_elastic_container_stats(container.ident, issnl=container.issnl)
- except Exception as ae:
+ except (ValueError, IOError) as ae:
app.log.error(ae)
abort(503)
return jsonify(stats)
@@ -792,7 +792,7 @@ def container_ident_ia_coverage_years_json(ident):
except ApiException as ae:
abort(ae.status)
try:
- histogram = get_elastic_container_histogram(container.ident)
+ histogram = get_elastic_container_histogram_legacy(container.ident)
except Exception as ae:
app.log.error(ae)
abort(503)
@@ -807,12 +807,68 @@ def container_ident_ia_coverage_years_svg(ident):
except ApiException as ae:
abort(ae.status)
try:
- histogram = get_elastic_container_histogram(container.ident)
+ histogram = get_elastic_container_histogram_legacy(container.ident)
except Exception as ae:
app.log.error(ae)
abort(503)
return ia_coverage_histogram(histogram).render_response()
+@app.route('/container/<ident>/preservation_by_year.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_year_json(ident):
+    """
+    JSON export of the per-year preservation histogram for one container.
+
+    Aborts with the API error's status code if the container lookup fails,
+    and with 503 if the histogram query raises.
+    """
+    try:
+        entity = api.get_container(ident)
+    except ApiException as api_err:
+        abort(api_err.status)
+    try:
+        rows = get_elastic_container_preservation_by_year(entity.ident)
+    except Exception as search_err:
+        app.log.error(search_err)
+        abort(503)
+    payload = {'container_id': ident, "histogram": rows}
+    return jsonify(payload)
+
+@app.route('/container/<ident>/preservation_by_year.svg', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_year_svg(ident):
+    """
+    Chart (via pygal's render_response()) of the per-year preservation
+    histogram for one container.
+
+    Aborts with the API error's status code if the container lookup fails,
+    and with 503 if the histogram query raises.
+    """
+    try:
+        entity = api.get_container(ident)
+    except ApiException as api_err:
+        abort(api_err.status)
+    try:
+        rows = get_elastic_container_preservation_by_year(entity.ident)
+    except Exception as search_err:
+        app.log.error(search_err)
+        abort(503)
+    chart = preservation_by_year_histogram(rows)
+    return chart.render_response()
+
+@app.route('/container/<ident>/preservation_by_volume.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_volume_json(ident):
+    """
+    JSON export of the per-volume preservation histogram for one container.
+
+    Aborts with the API error's status code if the container lookup fails,
+    and with 503 if the histogram query raises.
+    """
+    try:
+        entity = api.get_container(ident)
+    except ApiException as api_err:
+        abort(api_err.status)
+    try:
+        rows = get_elastic_container_preservation_by_volume(entity.ident)
+    except Exception as search_err:
+        app.log.error(search_err)
+        abort(503)
+    payload = {'container_id': ident, "histogram": rows}
+    return jsonify(payload)
+
+@app.route('/container/<ident>/preservation_by_volume.svg', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_ident_preservation_by_volume_svg(ident):
+    """
+    Chart (via pygal's render_response()) of the per-volume preservation
+    histogram for one container.
+
+    Aborts with the API error's status code if the container lookup fails,
+    and with 503 if the histogram query raises.
+    """
+    try:
+        entity = api.get_container(ident)
+    except ApiException as api_err:
+        abort(api_err.status)
+    try:
+        rows = get_elastic_container_preservation_by_volume(entity.ident)
+    except Exception as search_err:
+        app.log.error(search_err)
+        abort(503)
+    chart = preservation_by_volume_histogram(rows)
+    return chart.render_response()
+
@app.route('/release/<ident>.bib', methods=['GET'])
def release_bibtex(ident):
try:
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index f60860c9..9703a434 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -259,7 +259,7 @@ def do_release_search(
results=results,
)
-def get_elastic_container_random_releases(ident, limit=5):
+def get_elastic_container_random_releases(ident: str, limit=5) -> dict:
"""
Returns a list of releases from the container.
"""
@@ -283,7 +283,7 @@ def get_elastic_container_random_releases(ident, limit=5):
return results
-def get_elastic_entity_stats():
+def get_elastic_entity_stats() -> dict:
"""
TODO: files, filesets, webcaptures (no schema yet)
@@ -417,6 +417,9 @@ def get_elastic_container_stats(ident, issnl=None):
container_stats = resp.aggregations.container_stats.buckets
preservation_bucket = agg_to_dict(resp.aggregations.preservation)
+ for k in ('bright', 'dark', 'shadows_only', 'none'):
+ if not k in preservation_bucket:
+ preservation_bucket[k] = 0
release_type_bucket = agg_to_dict(resp.aggregations.release_type)
stats = {
'ident': ident,
@@ -431,9 +434,11 @@ def get_elastic_container_stats(ident, issnl=None):
return stats
-def get_elastic_container_histogram(ident):
+def get_elastic_container_histogram_legacy(ident) -> List:
"""
- Fetches a stacked histogram
+ Fetches a stacked histogram of {year, in_ia}. This is for the older style
+ of coverage graph (SVG or JSON export). This function should be DEPRECATED
+ to be removed in the near future.
Filters to the past 500 years (at most), or about 1000 values.
@@ -480,7 +485,174 @@ def get_elastic_container_histogram(ident):
resp = wrap_es_execution(search)
buckets = resp.aggregations.year_in_ia.buckets
- vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count'])
+ vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count'])
for h in buckets]
vals = sorted(vals)
return vals
+
+
+def get_elastic_container_preservation_by_year(container_id: str) -> List[dict]:
+    """
+    Fetches a stacked histogram of {year, preservation}.
+
+    Preservation has 4 potential values; this function filters to the past 250
+    years (at most), or about 1000 values.
+
+    Returns a list of dicts, sorted by year, with keys/values like:
+
+        {year (int), bright (int), dark (int), shadows_only (int), none (int)}
+
+    Every year between the first and the last year that has any releases gets
+    an entry (zero-filled where needed). Returns an empty list if the query
+    matches no releases at all.
+    """
+
+    this_year = datetime.datetime.today().year
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        must=[
+            Q("range", release_year={
+                "gte": this_year - 249,
+                "lte": this_year,
+            }),
+        ],
+        filter=[
+            Q("bool", minimum_should_match=1, should=[
+                Q("match", container_id=container_id),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'year_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"year": {
+                "histogram": {
+                    "field": "release_year",
+                    "interval": 1,
+                },
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    # only the aggregation is needed, not the hits themselves
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.year_preservation.buckets
+    year_nums = {int(h['key']['year']) for h in buckets}
+    # No buckets (eg, a container without releases in range): min()/max()
+    # would raise ValueError on the empty set
+    if not year_nums:
+        return []
+    # Zero-fill every year in the span so the chart has no gaps
+    year_dicts = dict()
+    for num in range(min(year_nums), max(year_nums)+1):
+        year_dicts[num] = dict(year=num, bright=0, dark=0, shadows_only=0, none=0)
+    for row in buckets:
+        year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count'])
+    return sorted(year_dicts.values(), key=lambda x: x['year'])
+
+def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]:
+    """
+    Fetches a stacked histogram of {volume, preservation}.
+
+    Currently only includes volume numbers which are simple integers (all chars
+    are decimal digits).
+
+    Returns a list of dicts, sorted by volume, with keys/values like:
+
+        {volume (int), bright (int), dark (int), shadows_only (int), none (int)}
+
+    Every volume number between the lowest and the highest one seen gets an
+    entry (zero-filled where needed). Returns an empty list if the container
+    has no releases with an integer volume.
+    """
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        filter=[
+            Q("bool", must=[
+                Q("match", container_id=container_id),
+                Q("exists", field="volume"),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'volume_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"volume": {
+                "terms": {
+                    "field": "volume",
+                },
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    # only the aggregation is needed, not the hits themselves
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.volume_preservation.buckets
+    # (volume number, preservation status, count) for integer volumes only.
+    # str.isdecimal() rather than isdigit(): isdigit() also accepts characters
+    # like superscript digits, which int() refuses to parse.
+    counts = [
+        (int(row['key']['volume']), row['key']['preservation'], int(row['doc_count']))
+        for row in buckets
+        if row['key']['volume'].isdecimal()
+    ]
+    # No integer volumes at all: min()/max() would raise ValueError
+    if not counts:
+        return []
+    volume_nums = [num for (num, _, _) in counts]
+    # Zero-fill every volume in the span so the chart has no gaps
+    volume_dicts = dict()
+    for num in range(min(volume_nums), max(volume_nums)+1):
+        volume_dicts[num] = dict(volume=num, bright=0, dark=0, shadows_only=0, none=0)
+    for (num, preservation, count) in counts:
+        # Accumulate instead of assigning: distinct volume strings (eg, "1"
+        # and "01") map to the same integer and must not overwrite each other
+        volume_dicts[num][preservation] = volume_dicts[num].get(preservation, 0) + count
+    return sorted(volume_dicts.values(), key=lambda x: x['volume'])
+
+def get_elastic_container_preservation_by_type(container_id: str) -> List[dict]:
+    """
+    Fetches preservation coverage by release type
+
+    Returns a list of dicts, sorted by total count (ascending), with
+    keys/values like:
+
+        {release_type (str), bright (int), dark (int), shadows_only (int),
+         none (int), total (int)}
+
+    Releases without any release_type are counted under "_unknown". Returns an
+    empty list if the container has no releases.
+    """
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        filter=[
+            Q("bool", must=[
+                Q("match", container_id=container_id),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'type_preservation',
+        'composite',
+        size=1500,
+        sources=[
+            {"release_type": {
+                "terms": {
+                    "field": "release_type",
+                    # A composite source may only contain its value source
+                    # ("terms" here), so the option for documents lacking the
+                    # field has to live inside it. With missing_bucket those
+                    # documents are reported under a null key.
+                    # NOTE(review): needs Elasticsearch 6.4 or later -- confirm
+                    # against the deployed cluster version.
+                    "missing_bucket": True,
+                },
+            }},
+            {"preservation": {
+                "terms": {
+                    "field": "preservation",
+                },
+            }},
+        ],
+    )
+    # only the aggregation is needed, not the hits themselves
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    # must match the aggregation name registered above
+    buckets = resp.aggregations.type_preservation.buckets
+    type_dicts = dict()
+    for row in buckets:
+        # null key: release has no release_type (see missing_bucket above)
+        release_type = row['key']['release_type'] or "_unknown"
+        if release_type not in type_dicts:
+            type_dicts[release_type] = dict(
+                release_type=release_type, bright=0, dark=0, shadows_only=0, none=0, total=0)
+        type_dicts[release_type][row['key']['preservation']] = int(row['doc_count'])
+    for counts in type_dicts.values():
+        counts['total'] = sum(counts[p] for p in ('bright', 'dark', 'shadows_only', 'none'))
+    return sorted(type_dicts.values(), key=lambda x: x['total'])
diff --git a/python/fatcat_web/templates/container_view_coverage.html b/python/fatcat_web/templates/container_view_coverage.html
index ffd1a447..fc643f81 100644
--- a/python/fatcat_web/templates/container_view_coverage.html
+++ b/python/fatcat_web/templates/container_view_coverage.html
@@ -19,34 +19,50 @@
{% set frac_preserved = container._stats.is_preserved/container._stats.total %}
{% set frac_web = container._stats.in_web/container._stats.total %}
- <div class="ui large {{ entity_macros.progress_color(frac_web) }} progress" style="margin-bottom: 0.1em;">
- <div class="bar" style="width: {{ (frac_web*100)|int }}%;">
- <div class="progress">{{ (frac_web*100)|int }}%</div>
- </div>
- <div class="label">
- {{ "{:,}".format(container._stats.in_web) }} preserved and available (bright)
- </div>
- </div>
+ {% set pstats = container._stats.preservation %}
+ {% set frac_bright = container._stats.preservation.bright/container._stats.total %}
+ {% set frac_dark = container._stats.preservation.dark/container._stats.total %}
+ {% set frac_shadows_only = container._stats.preservation.shadows_only/container._stats.total %}
+ {% set frac_none = container._stats.preservation.none/container._stats.total %}
- <br>
- <div class="ui large {{ entity_macros.progress_color(frac_preserved) }} progress" style="margin-bottom: 0.1em;">
- <div class="bar" style="width: {{ (frac_preserved*100)|int }}%;">
- <div class="progress">{{ (frac_preserved*100)|int }}%</div>
+ <div class="ui large multiple progress" data-percent="0,0,0,0" style="margin-bottom: 0.1em;">
+ <div class="green bar" style="border-radius: 0; min-width: 0; width: {{ (frac_bright*100)|round(method='ceil') }}%;" title="bright">
+ <div class="progress">{# {{ (frac_bright*100)|int }}% #}</div>
</div>
- <div class="label">
- {{ "{:,}".format(container._stats.is_preserved) }} preserved at all (bright or dark)
+ <div class="green bar" style="border-radius: 0; min-width: 0; width: {{ (frac_dark*100)|round(method='ceil') }}%; background-color: darkgreen;">
+ <div class="progress">{# {{ (frac_dark*100)|int }}% #}</div>
</div>
- </div>
-
- <br>
- <div class="ui large {{ entity_macros.progress_color(frac_kbart) }} progress" style="margin-bottom: 0.1em; margin-top: 1em;">
- <div class="bar" style="width: {{ (frac_kbart*100)|int }}%;">
- <div class="progress">{{ (frac_kbart*100)|int }}%</div>
+ <div class="red bar" style="border-radius: 0; min-width: 0; width: {{ (frac_shadows_only*100)|round(method='ceil') }}%; background-color: darkred;">
+ <div class="progress">{# {{ (frac_shadows_only*100)|int }}% #}</div>
</div>
- <div class="label">
- {{ "{:,}".format(container._stats.in_kbart ) }} preserved by Keeper (dark)
+ <div class="red bar" style="border-radius: 0; min-width: 0; width: {{ (frac_none*100)|round(method='ceil') }}%;">
+ <div class="progress">{# {{ (frac_none*100)|int }}% #}</div>
</div>
</div>
+ <table class="ui very basic very compact collapsing table" style="font-weight: bold; margin-left: 1em;">
+ <tbody>
+ <tr>
+ <td style="background-color: green;">
+ <td class="right aligned" >{{ "{:,}".format(pstats.bright) }}
+ <td class="right aligned" >{{ (frac_bright*100)|round(2,method='ceil') }}%
+ <td>preserved and publicly available (bright)
+ <tr>
+ <td style="background-color: darkgreen;">
+ <td class="right aligned" >{{ "{:,}".format(pstats.dark) }}
+ <td class="right aligned" >{{ (frac_dark*100)|round(2,method='ceil') }}%
+ <td>preserved but not publicly accessible (dark)
+ <tr>
+ <td style="background-color: darkred;">
+ <td class="right aligned" >{{ "{:,}".format(pstats.shadows_only) }}
+ <td class="right aligned" >{{ (frac_shadows_only*100)|round(2,method='ceil') }}%
+ <td>only independently preserved in "shadow" libraries
+ <tr>
+ <td style="background-color: red;">
+ <td class="right aligned" >{{ "{:,}".format(pstats.none) }}
+ <td class="right aligned" >{{ (frac_none*100)|round(2,method='ceil') }}%
+ <td>no known independent preservation
+ </tbody>
+ </table>
{% endif %}
</div>