diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-27 20:08:51 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-30 18:28:03 -0700 | 
| commit | e28fa0da97f4edc070f665a9f5fd4f4036196a18 (patch) | |
| tree | 22acacb729c961450d8957000ea9b0b9a1829a8f /python | |
| parent | 8e6ab69b9cb3a88661f6ba13ded0d7afff8948a5 (diff) | |
| download | fatcat-e28fa0da97f4edc070f665a9f5fd4f4036196a18.tar.gz fatcat-e28fa0da97f4edc070f665a9f5fd4f4036196a18.zip | |
search: 'recent' mode for coverage search
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_web/graphics.py | 26 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 15 | ||||
| -rw-r--r-- | python/fatcat_web/search.py | 86 | ||||
| -rw-r--r-- | python/fatcat_web/templates/coverage_search.html | 14 | 
4 files changed, 134 insertions, 7 deletions
| diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py index 96c3531a..7d6e5702 100644 --- a/python/fatcat_web/graphics.py +++ b/python/fatcat_web/graphics.py @@ -61,6 +61,30 @@ def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph:      chart.add('Bright', [y['bright'] for y in years])      return chart +def preservation_by_date_histogram(rows: List[Dict]) -> pygal.Graph: +    """ +    Note: this returns a raw pygal chart; it does not render it to SVG/PNG +    """ + +    dates = sorted(rows, key=lambda x: x['date']) + +    CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen") +    label_count = len(dates) +    if len(dates) > 30: +        label_count = 10 +    chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle, +        width=1000, height=500, x_labels_major_count=label_count, +        show_minor_x_labels=False, x_label_rotation=20) +    #chart.title = "Preservation by Date" +    chart.x_title = "Date" +    #chart.y_title = "Count" +    chart.x_labels = [str(y['date']) for y in dates] +    chart.add('None', [y['none'] for y in dates]) +    chart.add('Shadow', [y['shadows_only'] for y in dates]) +    chart.add('Dark', [y['dark'] for y in dates]) +    chart.add('Bright', [y['bright'] for y in dates]) +    return chart +  def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:      """      Note: this returns a raw pygal chart; it does not render it to SVG/PNG @@ -75,7 +99,7 @@ def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:      chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle,          width=1000, height=500, x_labels_major_count=label_count,          show_minor_x_labels=False, x_label_rotation=20) -    #chart.title = "Preservation by Year" +    #chart.title = "Preservation by Volume"      chart.x_title = "Volume"      #chart.y_title = "Count"      chart.x_labels = [str(y['volume']) for y in volumes] diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a741112f..20fe0e12 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -746,19 +746,24 @@ def coverage_search():      query = ReleaseQuery.from_args(request.args)      coverage_stats = get_elastic_search_coverage(query) +    year_histogram_svg = None +    date_histogram_svg = None +    coverage_type_preservation = None      if coverage_stats['total'] > 1: -        year_histogram = get_elastic_preservation_by_year(query) -        year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri()          coverage_type_preservation = get_elastic_preservation_by_type(query) -    else: -        year_histogram_svg = None -        coverage_type_preservation = None +        if query.recent: +            date_histogram = get_elastic_preservation_by_date(query) +            date_histogram_svg = preservation_by_date_histogram(date_histogram).render_data_uri() +        else: +            year_histogram = get_elastic_preservation_by_year(query) +            year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri()      return render_template(          'coverage_search.html',          query=query,          coverage_stats=coverage_stats,          coverage_type_preservation=coverage_type_preservation,          year_histogram_svg=year_histogram_svg, +        date_histogram_svg=date_histogram_svg,      )  def get_changelog_stats(): diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 3ba6fdb2..b0d27b2e 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -31,6 +31,7 @@ class ReleaseQuery:      offset: Optional[int] = None      fulltext_only: bool = False      container_id: Optional[str] = None +    recent: bool = False      @classmethod      def from_args(cls, args) -> 'ReleaseQuery': @@ -55,6 +56,7 @@ class ReleaseQuery:              offset=offset,              fulltext_only=bool(args.get('fulltext_only')),              container_id=container_id, +            recent=bool(args.get('recent')),          )  @dataclass @@ -384,6 +386,11 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict:          field='preservation',          missing='_unknown',      ) +    if query.recent: +        date_today = datetime.date.today() +        start_date = str(date_today - datetime.timedelta(days=60)) +        end_date = str(date_today + datetime.timedelta(days=1)) +        search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))      search = search[:0] @@ -550,7 +557,6 @@ def get_elastic_preservation_by_year(query) -> List[dict]:              allow_leading_wildcard=False,              lenient=True,              fields=[ -                "title^2",                  "biblio",              ],          ) @@ -598,6 +604,79 @@ def get_elastic_preservation_by_year(query) -> List[dict]:          year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count'])      return sorted(year_dicts.values(), key=lambda x: x['year']) + +def get_elastic_preservation_by_date(query) -> List[dict]: +    """ +    Fetches a stacked histogram of {date, preservation}. + +    Preservation has 4 potential values; this function filters to the past 250 +    years (at most), or about 1000 values. + +    Returns a list of dicts, sorted by date, with keys/values like: + +        {date (str), bright (int), dark (int), shadows_only (int), none (int)} +    """ + +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    if query.q not in [None, "*"]: +        search = search.query( +            "query_string", +            query=query.q, +            default_operator="AND", +            analyze_wildcard=True, +            allow_leading_wildcard=False, +            lenient=True, +            fields=[ +                "biblio", +            ], +        ) +    if query.container_id: +        search = search.filter( +            "term", +            container_id=query.container_id, +        ) +    date_today = datetime.date.today() +    start_date = date_today - datetime.timedelta(days=60) +    end_date = date_today + datetime.timedelta(days=1) +    search = search.filter( +        "range", release_date=dict( +            gte=str(start_date), +            lte=str(end_date), +        ) +    ) + +    search.aggs.bucket( +        'date_preservation', +        'composite', +        size=1500, +        sources=[ +            {"date": { +                "histogram": { +                    "field": "release_date", +                    "interval": 1, +                }, +            }}, +            {"preservation": { +                "terms": { +                    "field": "preservation", +                }, +            }}, +        ], +    ) +    search = search[:0] +    search = search.params(request_cache='true') +    resp = wrap_es_execution(search) + +    buckets = resp.aggregations.date_preservation.buckets +    date_dicts = dict() +    this_date = start_date +    while this_date <= end_date: +        date_dicts[str(this_date)] = dict(date=str(this_date), bright=0, dark=0, shadows_only=0, none=0) +        this_date = this_date + datetime.timedelta(days=1) +    for row in buckets: +        date_dicts[row['key']['date'][0:10]][row['key']['preservation']] = int(row['doc_count']) +    return sorted(date_dicts.values(), key=lambda x: x['date']) +  def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]:      """      Fetches a stacked histogram of {volume, preservation}. @@ -682,6 +761,11 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:                  ]),              ],          ) +    if query.recent: +        date_today = datetime.date.today() +        start_date = str(date_today - datetime.timedelta(days=60)) +        end_date = str(date_today + datetime.timedelta(days=1)) +        search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))      search.aggs.bucket(          'type_preservation',          'composite', diff --git a/python/fatcat_web/templates/coverage_search.html b/python/fatcat_web/templates/coverage_search.html index c730ef9d..1e0f8327 100644 --- a/python/fatcat_web/templates/coverage_search.html +++ b/python/fatcat_web/templates/coverage_search.html @@ -20,6 +20,10 @@            <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="visualize preservation coverage">            <button class="ui primary button">Search</button>          </div> +        <div class="ui checkbox" style="float: right; margin: 1em;"> +          <input type="checkbox" name="recent" id="recent" value="true" {% if query.recent %}checked{% endif %}> +          <label for="recent">Recent Publications Only</label> +        </div>          <br>Can also search for <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b> (eg, individual papers) or <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals).        </div>      </form> @@ -55,6 +59,16 @@  {% endif %} +{% if date_histogram_svg != None %} +  <br><br> +  <h2>Perpetual Access Coverage by Date</h2> + +  <figure style="margin: 0 0 0 0;"> +    <embed type="image/svg+xml" src="{{ date_histogram_svg|safe }}" /> +  </figure> + +{% endif %} +  {% if coverage_type_preservation != None %}    <br><br>    <h2>Perpetual Access Coverage by Release Type</h2> | 
