diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-22 11:32:23 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-22 11:32:23 -0800 | 
| commit | 74c5f30ab878a914d3edb51040f4d78054684947 (patch) | |
| tree | 4bd40ad84148d7572c1451ea353d06c4c5fe90cc /python/fatcat_web | |
| parent | 7ac8611d5b36007710926ba4508828642a80c13c (diff) | |
| download | fatcat-74c5f30ab878a914d3edb51040f4d78054684947.tar.gz fatcat-74c5f30ab878a914d3edb51040f4d78054684947.zip | |
add general and container-specific stats
Diffstat (limited to 'python/fatcat_web')
| -rw-r--r-- | python/fatcat_web/routes.py | 50 | ||||
| -rw-r--r-- | python/fatcat_web/search.py | 134 | ||||
| -rw-r--r-- | python/fatcat_web/templates/stats.html | 48 | 
3 files changed, 229 insertions, 3 deletions
| diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 115c1981..11f73e4f 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -7,7 +7,8 @@ from flask_login import login_required  from fatcat_web import app, api, auth_api, priv_api  from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth  from fatcat_client.rest import ApiException -from fatcat_web.search import do_release_search, do_container_search +from fatcat_web.search import * +from fatcat_web.cors import crossdomain  from fatcat_tools.transforms import * @@ -106,6 +107,16 @@ def creator_history(ident):          entity=entity,          history=history) +@app.route('/container/issnl/<issnl>/stats.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def container_issnl_stats(issnl): +    try: +        stats = get_elastic_container_stats(issnl) +    except Exception as ae: +        print(ae) +        abort(503) +    return jsonify(stats) +  @app.route('/creator/<ident>/edit', methods=['GET'])  def creator_edit_view(ident):      try: @@ -356,6 +367,10 @@ def release_search():      query = request.args.get('q')      fulltext_only = bool(request.args.get('fulltext_only')) +    issnl = request.args.get('container_issnl') +    if issnl and query: +        query += ' container_issnl:"{}"'.format(issnl) +      if 'q' in request.args.keys():          # always do files for HTML          found = do_release_search(query, fulltext_only=fulltext_only) @@ -375,6 +390,36 @@ def container_search():      else:          return render_template('container_search.html', query=query) +def get_changelog_stats(): +    stats = {} +    latest_changelog = api.get_changelog(limit=1)[0] +    stats['changelog'] = {"latest": { +        "index": latest_changelog.index, +        "timestamp": latest_changelog.timestamp.isoformat(), +    }} +    return stats + +@app.route('/stats.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def stats_json(): +    try: +        stats = get_elastic_entity_stats() +        stats.update(get_changelog_stats()) +    except Exception as ae: +        print(ae) +        abort(503) +    return jsonify(stats) + +@app.route('/stats', methods=['GET']) +def stats_page(): +    try: +        stats = get_elastic_entity_stats() +        stats.update(get_changelog_stats()) +    except Exception as ae: +        print(ae) +        abort(503) +    return render_template('stats.html', stats=stats) +  ### Auth #################################################################### @@ -490,6 +535,7 @@ def fatcat_photo():                                 'fatcat.jpg',                                 mimetype='image/jpeg') -@app.route('/health', methods=['GET']) +@app.route('/health', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type'])  def health():      return jsonify({'ok': True}) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index d18416d6..f10ce406 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -87,7 +87,7 @@ def do_release_search(q, limit=30, fulltext_only=True):  def do_container_search(q, limit=30):      # Convert raw ISSN-L to ISSN-L query -    if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-': +    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':          q = 'issnl:"{}"'.format(q)      search_request = { @@ -106,3 +106,135 @@ def do_container_search(q, limit=30):      resp["query"] = { "q": q }      return resp +def get_elastic_entity_stats(): +    """ +    TODO: files, filesets, webcaptures (no schema yet) + +    Returns dict: +        changelog: {latest: {index, datetime}} +        release: {total, refs_total} +        papers: {total, in_web, in_oa, in_kbart, in_web_not_kbart} +    """ + +    stats = {} + +    # 2. releases +    #  x=> total count +    #  x=> total citation records +    #  x=> total (paper, chapter, proceeding) +    #  x=> with fulltext on web +    #  x=> open access +    #  x=> not in KBART, in IA +    #  +    # Can probably do the above with two queries: +    #  - all releases, aggregate count and sum(ref_count) +    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia) + +    # 2a. release totals +    query = { +        "size": 0, +        "aggs": { +            "release_ref_count": { "sum": { "field": "ref_count" } } +        } +    } +    resp = requests.get( +        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), +        json=query, +        params=dict(request_cache="true")) +    # TODO: abort() +    resp.raise_for_status() +    resp = resp.json() +    stats['release'] = { +        "total": resp['hits']['total'], +        "refs_total": int(resp['aggregations']['release_ref_count']['value']), +    } + +    # 2b. paper counts +    query = { +        "size": 0, +        "query": { +            "terms": { "release_type": [ +                # "chapter", "thesis", +                "article-journal", "paper-conference", +            ] } }, +        "aggs": { "paper_like": { "filters": { "filters": { +                "in_web": { "term": { "in_web": "true" } }, +                "is_oa": { "term": { "is_oa": "true" } }, +                "in_kbart": { "term": { "in_kbart": "true" } }, +                "in_web_not_kbart": { "bool": { "filter": [ +                        { "term": { "in_web": "true" } }, +                        { "term": { "in_kbart": "false" } } +                ]}} +        }}}} +    } +    resp = requests.get( +        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), +        json=query, +        params=dict(request_cache="true")) +    # TODO: abort() +    resp.raise_for_status() +    resp = resp.json() +    buckets = resp['aggregations']['paper_like']['buckets'] +    stats['papers'] = { +        'total': resp['hits']['total'], +        'in_web': buckets['in_web']['doc_count'], +        'is_oa': buckets['is_oa']['doc_count'], +        'in_kbart': buckets['in_kbart']['doc_count'], +        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'], +    } + +    # 3. containers +    #   => total count +    query = { +        "size": 0, +    } +    resp = requests.get( +        "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']), +        json=query, +        params=dict(request_cache="true")) +    # TODO: abort() +    resp.raise_for_status() +    resp = resp.json() +    stats['container'] = { +        "total": resp['hits']['total'], +    } + +    return stats + +def get_elastic_container_stats(issnl): +    """ +    TODO: container_id, not issnl + +    Returns dict: +        total +        in_web +        preserved +    """ + +    query = { +        "size": 0, +        "query": { +            "term": { "container_issnl": issnl } +        }, +        "aggs": { "container_stats": { "filters": { "filters": { +                "in_web": { "term": { "in_web": "true" } }, +                "is_preserved": { "term": { "is_preserved": "true" } }, +        }}}} +    } +    resp = requests.get( +        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), +        json=query, +        params=dict(request_cache="true")) +    # TODO: abort() +    print(resp.json()) +    resp.raise_for_status() +    resp = resp.json() +    buckets = resp['aggregations']['container_stats']['buckets'] +    stats = { +        'issnl': issnl, +        'total': resp['hits']['total'], +        'in_web': buckets['in_web']['doc_count'], +        'is_preserved': buckets['is_preserved']['doc_count'], +    } + +    return stats diff --git a/python/fatcat_web/templates/stats.html b/python/fatcat_web/templates/stats.html new file mode 100644 index 00000000..92205b3d --- /dev/null +++ b/python/fatcat_web/templates/stats.html @@ -0,0 +1,48 @@ +{% extends "base.html" %} +{% block body %} + +<h1>Stats</h1> + +You can also fetch these numbers <a href="./stats.json">as JSON</a>. + +<h3>Changelog</h3> + +<p>Latest changelog index is {{ stats.changelog.latest.index }} ({{ stats.changelog.latest.timestamp}}). + +<h3>Entities</h3> + +<table class="ui structured table"> +  <tbody> +    <tr><td rowspan="5" class="active top aligned"><b>"Papers"</b></td> +        <td>Total</td> +        <td class="right aligned">{{ stats.papers.total }}</td> +    <tr> +        <td>Fulltext on web</td> +        <td class="right aligned">{{ stats.papers.in_web }}</td> +    <tr> +        <td>"Gold" Open Access</td> +        <td class="right aligned">{{ stats.papers.is_oa }}</td> +    <tr> +        <td>In a Keepers/KBART archive</td> +        <td class="right aligned">{{ stats.papers.in_kbart }}</td> +    <tr> +        <td>On web, not in Keepers</td> +        <td class="right aligned">{{ stats.papers.in_web_not_kbart }}</td> + +    <tr><td rowspan="2" class="active top aligned"><b>Releases</b></td> +        <td>Total</td> +        <td class="right aligned">{{ stats.release.total }}</td> +    <tr> +        <td>References (raw, unlinked)</td> +        <td class="right aligned">{{ stats.release.refs_total }}</td> + +    <tr><td rowspan="1" class="active top aligned"><b>Containers</b></td> +        <td>Total</td> +        <td class="right aligned">{{ stats.container.total }}</td> +  </tbody> +</table> + +<br> +<i>"Papers" are journal articles and conference proceedings, a subset of Releases</i> + +{% endblock %} | 
