aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-07-27 20:08:51 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-07-30 18:28:03 -0700
commit e28fa0da97f4edc070f665a9f5fd4f4036196a18 (patch)
tree 22acacb729c961450d8957000ea9b0b9a1829a8f /python
parent 8e6ab69b9cb3a88661f6ba13ded0d7afff8948a5 (diff)
download fatcat-e28fa0da97f4edc070f665a9f5fd4f4036196a18.tar.gz
fatcat-e28fa0da97f4edc070f665a9f5fd4f4036196a18.zip
search: 'recent' mode for coverage search
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_web/graphics.py 26
-rw-r--r-- python/fatcat_web/routes.py 15
-rw-r--r-- python/fatcat_web/search.py 86
-rw-r--r-- python/fatcat_web/templates/coverage_search.html 14
4 files changed, 134 insertions, 7 deletions
diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py
index 96c3531a..7d6e5702 100644
--- a/python/fatcat_web/graphics.py
+++ b/python/fatcat_web/graphics.py
@@ -61,6 +61,30 @@ def preservation_by_year_histogram(rows: List[Dict]) -> pygal.Graph:
chart.add('Bright', [y['bright'] for y in years])
return chart
+def preservation_by_date_histogram(rows: List[Dict]) -> pygal.Graph:
+ """
+ Note: this returns a raw pygal chart; it does not render it to SVG/PNG
+ """
+
+ dates = sorted(rows, key=lambda x: x['date'])
+
+ CleanStyle.colors = ("red", "darkred", "darkolivegreen", "limegreen")
+ label_count = len(dates)
+ if len(dates) > 30:
+ label_count = 10
+ chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle,
+ width=1000, height=500, x_labels_major_count=label_count,
+ show_minor_x_labels=False, x_label_rotation=20)
+ #chart.title = "Preservation by Date"
+ chart.x_title = "Date"
+ #chart.y_title = "Count"
+ chart.x_labels = [str(y['date']) for y in dates]
+ chart.add('None', [y['none'] for y in dates])
+ chart.add('Shadow', [y['shadows_only'] for y in dates])
+ chart.add('Dark', [y['dark'] for y in dates])
+ chart.add('Bright', [y['bright'] for y in dates])
+ return chart
+
def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:
"""
Note: this returns a raw pygal chart; it does not render it to SVG/PNG
@@ -75,7 +99,7 @@ def preservation_by_volume_histogram(rows: List[Dict]) -> pygal.Graph:
chart = pygal.StackedBar(dynamic_print_values=True, style=CleanStyle,
width=1000, height=500, x_labels_major_count=label_count,
show_minor_x_labels=False, x_label_rotation=20)
- #chart.title = "Preservation by Year"
+ #chart.title = "Preservation by Volume"
chart.x_title = "Volume"
#chart.y_title = "Count"
chart.x_labels = [str(y['volume']) for y in volumes]
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index a741112f..20fe0e12 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -746,19 +746,24 @@ def coverage_search():
query = ReleaseQuery.from_args(request.args)
coverage_stats = get_elastic_search_coverage(query)
+ year_histogram_svg = None
+ date_histogram_svg = None
+ coverage_type_preservation = None
if coverage_stats['total'] > 1:
- year_histogram = get_elastic_preservation_by_year(query)
- year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri()
coverage_type_preservation = get_elastic_preservation_by_type(query)
- else:
- year_histogram_svg = None
- coverage_type_preservation = None
+ if query.recent:
+ date_histogram = get_elastic_preservation_by_date(query)
+ date_histogram_svg = preservation_by_date_histogram(date_histogram).render_data_uri()
+ else:
+ year_histogram = get_elastic_preservation_by_year(query)
+ year_histogram_svg = preservation_by_year_histogram(year_histogram).render_data_uri()
return render_template(
'coverage_search.html',
query=query,
coverage_stats=coverage_stats,
coverage_type_preservation=coverage_type_preservation,
year_histogram_svg=year_histogram_svg,
+ date_histogram_svg=date_histogram_svg,
)
def get_changelog_stats():
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 3ba6fdb2..b0d27b2e 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -31,6 +31,7 @@ class ReleaseQuery:
offset: Optional[int] = None
fulltext_only: bool = False
container_id: Optional[str] = None
+ recent: bool = False
@classmethod
def from_args(cls, args) -> 'ReleaseQuery':
@@ -55,6 +56,7 @@ class ReleaseQuery:
offset=offset,
fulltext_only=bool(args.get('fulltext_only')),
container_id=container_id,
+ recent=bool(args.get('recent')),
)
@dataclass
@@ -384,6 +386,11 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict:
field='preservation',
missing='_unknown',
)
+ if query.recent:
+ date_today = datetime.date.today()
+ start_date = str(date_today - datetime.timedelta(days=60))
+ end_date = str(date_today + datetime.timedelta(days=1))
+ search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))
search = search[:0]
@@ -550,7 +557,6 @@ def get_elastic_preservation_by_year(query) -> List[dict]:
allow_leading_wildcard=False,
lenient=True,
fields=[
- "title^2",
"biblio",
],
)
@@ -598,6 +604,79 @@ def get_elastic_preservation_by_year(query) -> List[dict]:
year_dicts[int(row['key']['year'])][row['key']['preservation']] = int(row['doc_count'])
return sorted(year_dicts.values(), key=lambda x: x['year'])
+
+def get_elastic_preservation_by_date(query) -> List[dict]:
+ """
+ Fetches a stacked histogram of {date, preservation}.
+
+ Preservation has 4 potential values; this function filters to release dates
+ from 60 days ago through tomorrow (62 days), or about 250 values.
+
+ Returns a list of dicts, sorted by date, with keys/values like:
+
+ {date (str), bright (int), dark (int), shadows_only (int), none (int)}
+ """
+
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ if query.q not in [None, "*"]:
+ search = search.query(
+ "query_string",
+ query=query.q,
+ default_operator="AND",
+ analyze_wildcard=True,
+ allow_leading_wildcard=False,
+ lenient=True,
+ fields=[
+ "biblio",
+ ],
+ )
+ if query.container_id:
+ search = search.filter(
+ "term",
+ container_id=query.container_id,
+ )
+ date_today = datetime.date.today()
+ start_date = date_today - datetime.timedelta(days=60)
+ end_date = date_today + datetime.timedelta(days=1)
+ search = search.filter(
+ "range", release_date=dict(
+ gte=str(start_date),
+ lte=str(end_date),
+ )
+ )
+
+ search.aggs.bucket(
+ 'date_preservation',
+ 'composite',
+ size=1500,
+ sources=[
+ {"date": {
+ "histogram": {
+ "field": "release_date",
+ "interval": 1,
+ },
+ }},
+ {"preservation": {
+ "terms": {
+ "field": "preservation",
+ },
+ }},
+ ],
+ )
+ search = search[:0]
+ search = search.params(request_cache='true')
+ resp = wrap_es_execution(search)
+
+ buckets = resp.aggregations.date_preservation.buckets
+ date_dicts = dict()
+ this_date = start_date
+ while this_date <= end_date:
+ date_dicts[str(this_date)] = dict(date=str(this_date), bright=0, dark=0, shadows_only=0, none=0)
+ this_date = this_date + datetime.timedelta(days=1)
+ for row in buckets:
+ date_dicts[row['key']['date'][0:10]][row['key']['preservation']] = int(row['doc_count'])
+ return sorted(date_dicts.values(), key=lambda x: x['date'])
+
def get_elastic_container_preservation_by_volume(container_id: str) -> List[dict]:
"""
Fetches a stacked histogram of {volume, preservation}.
@@ -682,6 +761,11 @@ def get_elastic_preservation_by_type(query: ReleaseQuery) -> List[dict]:
]),
],
)
+ if query.recent:
+ date_today = datetime.date.today()
+ start_date = str(date_today - datetime.timedelta(days=60))
+ end_date = str(date_today + datetime.timedelta(days=1))
+ search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))
search.aggs.bucket(
'type_preservation',
'composite',
diff --git a/python/fatcat_web/templates/coverage_search.html b/python/fatcat_web/templates/coverage_search.html
index c730ef9d..1e0f8327 100644
--- a/python/fatcat_web/templates/coverage_search.html
+++ b/python/fatcat_web/templates/coverage_search.html
@@ -20,6 +20,10 @@
<input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="visualize preservation coverage">
<button class="ui primary button">Search</button>
</div>
+ <div class="ui checkbox" style="float: right; margin: 1em;">
+ <input type="checkbox" name="recent" id="recent" value="true" {% if query.recent %}checked{% endif %}>
+ <label for="recent">Recent Publications Only</label>
+ </div>
<br>Can also search for <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b> (eg, individual papers) or <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals).
</div>
</form>
@@ -55,6 +59,16 @@
{% endif %}
+{% if date_histogram_svg != None %}
+ <br><br>
+ <h2>Perpetual Access Coverage by Date</h2>
+
+ <figure style="margin: 0 0 0 0;">
+ <embed type="image/svg+xml" src="{{ date_histogram_svg|safe }}" />
+ </figure>
+
+{% endif %}
+
{% if coverage_type_preservation != None %}
<br><br>
<h2>Perpetual Access Coverage by Release Type</h2>