From 74c5f30ab878a914d3edb51040f4d78054684947 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 22 Feb 2019 11:32:23 -0800
Subject: add general and container-specific stats

---
 python/fatcat_web/routes.py            |  50 +++++++++++-
 python/fatcat_web/search.py            | 134 ++++++++++++++++++++++++++++++++-
 python/fatcat_web/templates/stats.html |  48 ++++++++++++
 3 files changed, 229 insertions(+), 3 deletions(-)
 create mode 100644 python/fatcat_web/templates/stats.html

diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 115c1981..11f73e4f 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -7,7 +7,8 @@ from flask_login import login_required
 from fatcat_web import app, api, auth_api, priv_api
 from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth
 from fatcat_client.rest import ApiException
-from fatcat_web.search import do_release_search, do_container_search
+from fatcat_web.search import *
+from fatcat_web.cors import crossdomain
 from fatcat_tools.transforms import *
 
 
@@ -106,6 +107,16 @@ def creator_history(ident):
         entity=entity,
         history=history)
 
+@app.route('/container/issnl/<issnl>/stats.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def container_issnl_stats(issnl):
+    try:
+        stats = get_elastic_container_stats(issnl)
+    except Exception as ae:
+        print(ae)
+        abort(503)
+    return jsonify(stats)
+
 @app.route('/creator/<ident>/edit', methods=['GET'])
 def creator_edit_view(ident):
     try:
@@ -356,6 +367,10 @@ def release_search():
     query = request.args.get('q')
     fulltext_only = bool(request.args.get('fulltext_only'))
 
+    issnl = request.args.get('container_issnl')
+    if issnl and query:
+        query += ' container_issnl:"{}"'.format(issnl)
+
     if 'q' in request.args.keys():
         # always do files for HTML
         found = do_release_search(query, fulltext_only=fulltext_only)
@@ -375,6 +390,36 @@ def container_search():
     else:
         return render_template('container_search.html', query=query)
 
+def get_changelog_stats():
+    stats = {}
+    latest_changelog = api.get_changelog(limit=1)[0]
+    stats['changelog'] = {"latest": {
+        "index": latest_changelog.index,
+        "timestamp": latest_changelog.timestamp.isoformat(),
+    }}
+    return stats
+
+@app.route('/stats.json', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
+def stats_json():
+    try:
+        stats = get_elastic_entity_stats()
+        stats.update(get_changelog_stats())
+    except Exception as ae:
+        print(ae)
+        abort(503)
+    return jsonify(stats)
+
+@app.route('/stats', methods=['GET'])
+def stats_page():
+    try:
+        stats = get_elastic_entity_stats()
+        stats.update(get_changelog_stats())
+    except Exception as ae:
+        print(ae)
+        abort(503)
+    return render_template('stats.html', stats=stats)
+
 
 ### Auth ####################################################################
 
@@ -490,6 +535,7 @@ def fatcat_photo():
         'fatcat.jpg',
         mimetype='image/jpeg')
 
-@app.route('/health', methods=['GET'])
+@app.route('/health', methods=['GET', 'OPTIONS'])
+@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])
 def health():
     return jsonify({'ok': True})
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index d18416d6..f10ce406 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -87,7 +87,7 @@ def do_release_search(q, limit=30, fulltext_only=True):
 def do_container_search(q, limit=30):
 
     # Convert raw ISSN-L to ISSN-L query
-    if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-':
+    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
         q = 'issnl:"{}"'.format(q)
 
     search_request = {
@@ -106,3 +106,135 @@ def do_container_search(q, limit=30):
     resp["query"] = { "q": q }
     return resp
 
+def get_elastic_entity_stats():
+    """
+    TODO: files, filesets, webcaptures (no schema yet)
+
+    Returns dict:
+        changelog: {latest: {index, datetime}}
+        release: {total, refs_total}
+        papers: {total, in_web, in_oa, in_kbart, in_web_not_kbart}
+    """
+
+    stats = {}
+
+    # 2. releases
+    #  x=> total count
+    #  x=> total citation records
+    #  x=> total (paper, chapter, proceeding)
+    #  x=> with fulltext on web
+    #  x=> open access
+    #  x=> not in KBART, in IA
+    #
+    # Can probably do the above with two queries:
+    #  - all releases, aggregate count and sum(ref_count)
+    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia)
+
+    # 2a. release totals
+    query = {
+        "size": 0,
+        "aggs": {
+            "release_ref_count": { "sum": { "field": "ref_count" } }
+        }
+    }
+    resp = requests.get(
+        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    resp.raise_for_status()
+    resp = resp.json()
+    stats['release'] = {
+        "total": resp['hits']['total'],
+        "refs_total": int(resp['aggregations']['release_ref_count']['value']),
+    }
+
+    # 2b. paper counts
+    query = {
+        "size": 0,
+        "query": {
+            "terms": { "release_type": [
+                # "chapter", "thesis",
+                "article-journal", "paper-conference",
+            ] } },
+        "aggs": { "paper_like": { "filters": { "filters": {
+            "in_web": { "term": { "in_web": "true" } },
+            "is_oa": { "term": { "is_oa": "true" } },
+            "in_kbart": { "term": { "in_kbart": "true" } },
+            "in_web_not_kbart": { "bool": { "filter": [
+                { "term": { "in_web": "true" } },
+                { "term": { "in_kbart": "false" } }
+            ]}}
+        }}}}
+    }
+    resp = requests.get(
+        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    resp.raise_for_status()
+    resp = resp.json()
+    buckets = resp['aggregations']['paper_like']['buckets']
+    stats['papers'] = {
+        'total': resp['hits']['total'],
+        'in_web': buckets['in_web']['doc_count'],
+        'is_oa': buckets['is_oa']['doc_count'],
+        'in_kbart': buckets['in_kbart']['doc_count'],
+        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
+    }
+
+    # 3. containers
+    #   => total count
+    query = {
+        "size": 0,
+    }
+    resp = requests.get(
+        "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    resp.raise_for_status()
+    resp = resp.json()
+    stats['container'] = {
+        "total": resp['hits']['total'],
+    }
+
+    return stats
+
+def get_elastic_container_stats(issnl):
+    """
+    TODO: container_id, not issnl
+
+    Returns dict:
+        total
+        in_web
+        preserved
+    """
+
+    query = {
+        "size": 0,
+        "query": {
+            "term": { "container_issnl": issnl }
+        },
+        "aggs": { "container_stats": { "filters": { "filters": {
+            "in_web": { "term": { "in_web": "true" } },
+            "is_preserved": { "term": { "is_preserved": "true" } },
+        }}}}
+    }
+    resp = requests.get(
+        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    print(resp.json())
+    resp.raise_for_status()
+    resp = resp.json()
+    buckets = resp['aggregations']['container_stats']['buckets']
+    stats = {
+        'issnl': issnl,
+        'total': resp['hits']['total'],
+        'in_web': buckets['in_web']['doc_count'],
+        'is_preserved': buckets['is_preserved']['doc_count'],
+    }
+
+    return stats
diff --git a/python/fatcat_web/templates/stats.html b/python/fatcat_web/templates/stats.html
new file mode 100644
index 00000000..92205b3d
--- /dev/null
+++ b/python/fatcat_web/templates/stats.html
@@ -0,0 +1,48 @@
+{% extends "base.html" %}
+{% block body %}
+
+

Stats

+ +You can also fetch these numbers as JSON. + +

Changelog

+ +

Latest changelog index is {{ stats.changelog.latest.index }} ({{ stats.changelog.latest.timestamp}}). + +

Entities

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
"Papers"Total{{ stats.papers.total }}
Fulltext on web{{ stats.papers.in_web }}
"Gold" Open Access{{ stats.papers.is_oa }}
In a Keepers/KBART archive{{ stats.papers.in_kbart }}
On web, not in Keepers{{ stats.papers.in_web_not_kbart }}
ReleasesTotal{{ stats.release.total }}
References (raw, unlinked){{ stats.release.refs_total }}
ContainersTotal{{ stats.container.total }}
+ +
+"Papers" are journal articles and conference proceedings, a subset of Releases + +{% endblock %} -- cgit v1.2.3