diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-22 11:32:23 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-22 11:32:23 -0800 |
commit | 74c5f30ab878a914d3edb51040f4d78054684947 (patch) | |
tree | 4bd40ad84148d7572c1451ea353d06c4c5fe90cc | |
parent | 7ac8611d5b36007710926ba4508828642a80c13c (diff) | |
download | fatcat-74c5f30ab878a914d3edb51040f4d78054684947.tar.gz fatcat-74c5f30ab878a914d3edb51040f4d78054684947.zip |
add general and container-specific stats
-rw-r--r-- | python/fatcat_web/routes.py | 50 | ||||
-rw-r--r-- | python/fatcat_web/search.py | 134 | ||||
-rw-r--r-- | python/fatcat_web/templates/stats.html | 48 |
3 files changed, 229 insertions, 3 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 115c1981..11f73e4f 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -7,7 +7,8 @@ from flask_login import login_required from fatcat_web import app, api, auth_api, priv_api from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth from fatcat_client.rest import ApiException -from fatcat_web.search import do_release_search, do_container_search +from fatcat_web.search import * +from fatcat_web.cors import crossdomain from fatcat_tools.transforms import * @@ -106,6 +107,16 @@ def creator_history(ident): entity=entity, history=history) +@app.route('/container/issnl/<issnl>/stats.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_issnl_stats(issnl): + try: + stats = get_elastic_container_stats(issnl) + except Exception as ae: + print(ae) + abort(503) + return jsonify(stats) + @app.route('/creator/<ident>/edit', methods=['GET']) def creator_edit_view(ident): try: @@ -356,6 +367,10 @@ def release_search(): query = request.args.get('q') fulltext_only = bool(request.args.get('fulltext_only')) + issnl = request.args.get('container_issnl') + if issnl and query: + query += ' container_issnl:"{}"'.format(issnl) + if 'q' in request.args.keys(): # always do files for HTML found = do_release_search(query, fulltext_only=fulltext_only) @@ -375,6 +390,36 @@ def container_search(): else: return render_template('container_search.html', query=query) +def get_changelog_stats(): + stats = {} + latest_changelog = api.get_changelog(limit=1)[0] + stats['changelog'] = {"latest": { + "index": latest_changelog.index, + "timestamp": latest_changelog.timestamp.isoformat(), + }} + return stats + +@app.route('/stats.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def stats_json(): + try: + stats = get_elastic_entity_stats() + stats.update(get_changelog_stats()) + except Exception as ae: + print(ae) + abort(503) + return jsonify(stats) + +@app.route('/stats', methods=['GET']) +def stats_page(): + try: + stats = get_elastic_entity_stats() + stats.update(get_changelog_stats()) + except Exception as ae: + print(ae) + abort(503) + return render_template('stats.html', stats=stats) + ### Auth #################################################################### @@ -490,6 +535,7 @@ def fatcat_photo(): 'fatcat.jpg', mimetype='image/jpeg') -@app.route('/health', methods=['GET']) +@app.route('/health', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) def health(): return jsonify({'ok': True}) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index d18416d6..f10ce406 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -87,7 +87,7 @@ def do_release_search(q, limit=30, fulltext_only=True): def do_container_search(q, limit=30): # Convert raw ISSN-L to ISSN-L query - if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-': + if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-': q = 'issnl:"{}"'.format(q) search_request = { @@ -106,3 +106,135 @@ def do_container_search(q, limit=30): resp["query"] = { "q": q } return resp +def get_elastic_entity_stats(): + """ + TODO: files, filesets, webcaptures (no schema yet) + + Returns dict: + changelog: {latest: {index, datetime}} + release: {total, refs_total} + papers: {total, in_web, in_oa, in_kbart, in_web_not_kbart} + """ + + stats = {} + + # 2. releases + # x=> total count + # x=> total citation records + # x=> total (paper, chapter, proceeding) + # x=> with fulltext on web + # x=> open access + # x=> not in KBART, in IA + # + # Can probably do the above with two queries: + # - all releases, aggregate count and sum(ref_count) + # - in-scope works, aggregate count by (fulltext, OA, kbart/ia) + + # 2a. release totals + query = { + "size": 0, + "aggs": { + "release_ref_count": { "sum": { "field": "ref_count" } } + } + } + resp = requests.get( + "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + resp = resp.json() + stats['release'] = { + "total": resp['hits']['total'], + "refs_total": int(resp['aggregations']['release_ref_count']['value']), + } + + # 2b. paper counts + query = { + "size": 0, + "query": { + "terms": { "release_type": [ + # "chapter", "thesis", + "article-journal", "paper-conference", + ] } }, + "aggs": { "paper_like": { "filters": { "filters": { + "in_web": { "term": { "in_web": "true" } }, + "is_oa": { "term": { "is_oa": "true" } }, + "in_kbart": { "term": { "in_kbart": "true" } }, + "in_web_not_kbart": { "bool": { "filter": [ + { "term": { "in_web": "true" } }, + { "term": { "in_kbart": "false" } } + ]}} + }}}} + } + resp = requests.get( + "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + resp = resp.json() + buckets = resp['aggregations']['paper_like']['buckets'] + stats['papers'] = { + 'total': resp['hits']['total'], + 'in_web': buckets['in_web']['doc_count'], + 'is_oa': buckets['is_oa']['doc_count'], + 'in_kbart': buckets['in_kbart']['doc_count'], + 'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'], + } + + # 3. containers + # => total count + query = { + "size": 0, + } + resp = requests.get( + "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + resp = resp.json() + stats['container'] = { + "total": resp['hits']['total'], + } + + return stats + +def get_elastic_container_stats(issnl): + """ + TODO: container_id, not issnl + + Returns dict: + total + in_web + preserved + """ + + query = { + "size": 0, + "query": { + "term": { "container_issnl": issnl } + }, + "aggs": { "container_stats": { "filters": { "filters": { + "in_web": { "term": { "in_web": "true" } }, + "is_preserved": { "term": { "is_preserved": "true" } }, + }}}} + } + resp = requests.get( + "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + print(resp.json()) + resp.raise_for_status() + resp = resp.json() + buckets = resp['aggregations']['container_stats']['buckets'] + stats = { + 'issnl': issnl, + 'total': resp['hits']['total'], + 'in_web': buckets['in_web']['doc_count'], + 'is_preserved': buckets['is_preserved']['doc_count'], + } + + return stats diff --git a/python/fatcat_web/templates/stats.html b/python/fatcat_web/templates/stats.html new file mode 100644 index 00000000..92205b3d --- /dev/null +++ b/python/fatcat_web/templates/stats.html @@ -0,0 +1,48 @@ +{% extends "base.html" %} +{% block body %} + +<h1>Stats</h1> + +You can also fetch these numbers <a href="./stats.json">as JSON</a>. + +<h3>Changelog</h3> + +<p>Latest changelog index is {{ stats.changelog.latest.index }} ({{ stats.changelog.latest.timestamp}}). + +<h3>Entities</h3> + +<table class="ui structured table"> + <tbody> + <tr><td rowspan="5" class="active top aligned"><b>"Papers"</b></td> + <td>Total</td> + <td class="right aligned">{{ stats.papers.total }}</td> + <tr> + <td>Fulltext on web</td> + <td class="right aligned">{{ stats.papers.in_web }}</td> + <tr> + <td>"Gold" Open Access</td> + <td class="right aligned">{{ stats.papers.is_oa }}</td> + <tr> + <td>In a Keepers/KBART archive</td> + <td class="right aligned">{{ stats.papers.in_kbart }}</td> + <tr> + <td>On web, not in Keepers</td> + <td class="right aligned">{{ stats.papers.in_web_not_kbart }}</td> + + <tr><td rowspan="2" class="active top aligned"><b>Releases</b></td> + <td>Total</td> + <td class="right aligned">{{ stats.release.total }}</td> + <tr> + <td>References (raw, unlinked)</td> + <td class="right aligned">{{ stats.release.refs_total }}</td> + + <tr><td rowspan="1" class="active top aligned"><b>Containers</b></td> + <td>Total</td> + <td class="right aligned">{{ stats.container.total }}</td> + </tbody> +</table> + +<br> +<i>"Papers" are journal articles and conference proceedings, a subset of Releases</i> + +{% endblock %} |