about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-02-22 11:32:23 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-02-22 11:32:23 -0800
commit 74c5f30ab878a914d3edb51040f4d78054684947 (patch)
tree 4bd40ad84148d7572c1451ea353d06c4c5fe90cc /python
parent 7ac8611d5b36007710926ba4508828642a80c13c (diff)
download fatcat-74c5f30ab878a914d3edb51040f4d78054684947.tar.gz
fatcat-74c5f30ab878a914d3edb51040f4d78054684947.zip
add general and container-specific stats
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_web/routes.py 50
-rw-r--r-- python/fatcat_web/search.py 134
-rw-r--r-- python/fatcat_web/templates/stats.html 48
3 files changed, 229 insertions, 3 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 115c1981..11f73e4f 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -7,7 +7,8 @@ from flask_login import login_required
from fatcat_web import app, api, auth_api, priv_api
from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth
from fatcat_client.rest import ApiException
-from fatcat_web.search import do_release_search, do_container_search
+from fatcat_web.search import *
+from fatcat_web.cors import crossdomain
from fatcat_tools.transforms import *
@@ -106,6 +107,16 @@ def creator_history(ident):
entity=entity,
history=history)
+@app.route('/container/issnl/<issnl>/stats.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_issnl_stats(issnl):
+ try:
+ stats = get_elastic_container_stats(issnl)
+ except Exception as ae:
+ print(ae)
+ abort(503)
+ return jsonify(stats)
+
@app.route('/creator/<ident>/edit', methods=['GET'])
def creator_edit_view(ident):
try:
@@ -356,6 +367,10 @@ def release_search():
query = request.args.get('q')
fulltext_only = bool(request.args.get('fulltext_only'))
+ issnl = request.args.get('container_issnl')
+ if issnl and query:
+ query += ' container_issnl:"{}"'.format(issnl)
+
if 'q' in request.args.keys():
# always do files for HTML
found = do_release_search(query, fulltext_only=fulltext_only)
@@ -375,6 +390,36 @@ def container_search():
else:
return render_template('container_search.html', query=query)
+def get_changelog_stats():
+ stats = {}
+ latest_changelog = api.get_changelog(limit=1)[0]
+ stats['changelog'] = {"latest": {
+ "index": latest_changelog.index,
+ "timestamp": latest_changelog.timestamp.isoformat(),
+ }}
+ return stats
+
+@app.route('/stats.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def stats_json():
+ try:
+ stats = get_elastic_entity_stats()
+ stats.update(get_changelog_stats())
+ except Exception as ae:
+ print(ae)
+ abort(503)
+ return jsonify(stats)
+
+@app.route('/stats', methods=['GET'])
+def stats_page():
+ try:
+ stats = get_elastic_entity_stats()
+ stats.update(get_changelog_stats())
+ except Exception as ae:
+ print(ae)
+ abort(503)
+ return render_template('stats.html', stats=stats)
+
### Auth ####################################################################
@@ -490,6 +535,7 @@ def fatcat_photo():
'fatcat.jpg',
mimetype='image/jpeg')
-@app.route('/health', methods=['GET'])
+@app.route('/health', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
def health():
return jsonify({'ok': True})
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index d18416d6..f10ce406 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -87,7 +87,7 @@ def do_release_search(q, limit=30, fulltext_only=True):
def do_container_search(q, limit=30):
# Convert raw ISSN-L to ISSN-L query
- if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-':
+ if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
q = 'issnl:"{}"'.format(q)
search_request = {
@@ -106,3 +106,135 @@ def do_container_search(q, limit=30):
resp["query"] = { "q": q }
return resp
+def get_elastic_entity_stats():
+ """
+ TODO: files, filesets, webcaptures (no schema yet)
+
+ Returns dict:
+ changelog: {latest: {index, timestamp}}
+ release: {total, refs_total}
+ papers: {total, in_web, is_oa, in_kbart, in_web_not_kbart}
+ """
+
+ stats = {}
+
+ # 2. releases
+ # x=> total count
+ # x=> total citation records
+ # x=> total (paper, chapter, proceeding)
+ # x=> with fulltext on web
+ # x=> open access
+ # x=> not in KBART, in IA
+ #
+ # Can probably do the above with two queries:
+ # - all releases, aggregate count and sum(ref_count)
+ # - in-scope works, aggregate count by (fulltext, OA, kbart/ia)
+
+ # 2a. release totals
+ query = {
+ "size": 0,
+ "aggs": {
+ "release_ref_count": { "sum": { "field": "ref_count" } }
+ }
+ }
+ resp = requests.get(
+ "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+ json=query,
+ params=dict(request_cache="true"))
+ # TODO: abort()
+ resp.raise_for_status()
+ resp = resp.json()
+ stats['release'] = {
+ "total": resp['hits']['total'],
+ "refs_total": int(resp['aggregations']['release_ref_count']['value']),
+ }
+
+ # 2b. paper counts
+ query = {
+ "size": 0,
+ "query": {
+ "terms": { "release_type": [
+ # "chapter", "thesis",
+ "article-journal", "paper-conference",
+ ] } },
+ "aggs": { "paper_like": { "filters": { "filters": {
+ "in_web": { "term": { "in_web": "true" } },
+ "is_oa": { "term": { "is_oa": "true" } },
+ "in_kbart": { "term": { "in_kbart": "true" } },
+ "in_web_not_kbart": { "bool": { "filter": [
+ { "term": { "in_web": "true" } },
+ { "term": { "in_kbart": "false" } }
+ ]}}
+ }}}}
+ }
+ resp = requests.get(
+ "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+ json=query,
+ params=dict(request_cache="true"))
+ # TODO: abort()
+ resp.raise_for_status()
+ resp = resp.json()
+ buckets = resp['aggregations']['paper_like']['buckets']
+ stats['papers'] = {
+ 'total': resp['hits']['total'],
+ 'in_web': buckets['in_web']['doc_count'],
+ 'is_oa': buckets['is_oa']['doc_count'],
+ 'in_kbart': buckets['in_kbart']['doc_count'],
+ 'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
+ }
+
+ # 3. containers
+ # => total count
+ query = {
+ "size": 0,
+ }
+ resp = requests.get(
+ "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+ json=query,
+ params=dict(request_cache="true"))
+ # TODO: abort()
+ resp.raise_for_status()
+ resp = resp.json()
+ stats['container'] = {
+ "total": resp['hits']['total'],
+ }
+
+ return stats
+
+def get_elastic_container_stats(issnl):
+ """
+ TODO: container_id, not issnl
+
+ Returns dict:
+ total
+ in_web
+ is_preserved
+ """
+
+ query = {
+ "size": 0,
+ "query": {
+ "term": { "container_issnl": issnl }
+ },
+ "aggs": { "container_stats": { "filters": { "filters": {
+ "in_web": { "term": { "in_web": "true" } },
+ "is_preserved": { "term": { "is_preserved": "true" } },
+ }}}}
+ }
+ resp = requests.get(
+ "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+ json=query,
+ params=dict(request_cache="true"))
+ # TODO: abort()
+ print(resp.json())
+ resp.raise_for_status()
+ resp = resp.json()
+ buckets = resp['aggregations']['container_stats']['buckets']
+ stats = {
+ 'issnl': issnl,
+ 'total': resp['hits']['total'],
+ 'in_web': buckets['in_web']['doc_count'],
+ 'is_preserved': buckets['is_preserved']['doc_count'],
+ }
+
+ return stats
diff --git a/python/fatcat_web/templates/stats.html b/python/fatcat_web/templates/stats.html
new file mode 100644
index 00000000..92205b3d
--- /dev/null
+++ b/python/fatcat_web/templates/stats.html
@@ -0,0 +1,48 @@
+{% extends "base.html" %}
+{% block body %}
+
+<h1>Stats</h1>
+
+You can also fetch these numbers <a href="./stats.json">as JSON</a>.
+
+<h3>Changelog</h3>
+
+<p>Latest changelog index is {{ stats.changelog.latest.index }} ({{ stats.changelog.latest.timestamp}}).
+
+<h3>Entities</h3>
+
+<table class="ui structured table">
+ <tbody>
+ <tr><td rowspan="5" class="active top aligned"><b>"Papers"</b></td>
+ <td>Total</td>
+ <td class="right aligned">{{ stats.papers.total }}</td>
+ <tr>
+ <td>Fulltext on web</td>
+ <td class="right aligned">{{ stats.papers.in_web }}</td>
+ <tr>
+ <td>"Gold" Open Access</td>
+ <td class="right aligned">{{ stats.papers.is_oa }}</td>
+ <tr>
+ <td>In a Keepers/KBART archive</td>
+ <td class="right aligned">{{ stats.papers.in_kbart }}</td>
+ <tr>
+ <td>On web, not in Keepers</td>
+ <td class="right aligned">{{ stats.papers.in_web_not_kbart }}</td>
+
+ <tr><td rowspan="2" class="active top aligned"><b>Releases</b></td>
+ <td>Total</td>
+ <td class="right aligned">{{ stats.release.total }}</td>
+ <tr>
+ <td>References (raw, unlinked)</td>
+ <td class="right aligned">{{ stats.release.refs_total }}</td>
+
+ <tr><td rowspan="1" class="active top aligned"><b>Containers</b></td>
+ <td>Total</td>
+ <td class="right aligned">{{ stats.container.total }}</td>
+ </tbody>
+</table>
+
+<br>
+<i>"Papers" are journal articles and conference proceedings, a subset of Releases</i>
+
+{% endblock %}