diff options
Diffstat (limited to 'extra/elasticsearch/stats.py')
-rw-r--r-- | extra/elasticsearch/stats.py | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/extra/elasticsearch/stats.py b/extra/elasticsearch/stats.py new file mode 100644 index 00000000..b6d1f8a5 --- /dev/null +++ b/extra/elasticsearch/stats.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 + +import sys +import requests +import datetime +from fatcat_tools import public_api + +#api_host_url = "http://localhost:9411/v0" +api_host_url = "https://api.fatcat.wiki/v0" + +# NOTE: must not have trailing slash +elastic_host_url = "https://search.fatcat.wiki" + +api = public_api(api_host_url) + +def get_changelog_stats(): + + stats = {} + + # 1. latest changelog + latest_changelog = api.get_changelog(limit=1)[0] + stats['changelog'] = {"latest": { + "index": latest_changelog.index, + "timestamp": latest_changelog.timestamp.isoformat(), + }} + return stats + +def get_elastic_entity_stats(): + """ + TODO: files, filesets, webcaptures (no schema yet) + + Returns dict: + changelog: {latest: {index, datetime}} + release: {total, refs_total} + papers: {total, in_web, in_oa, in_kbart, in_web_not_kbart} + """ + + stats = {} + + # 2. releases + # x=> total count + # x=> total citation records + # x=> total (paper, chapter, proceeding) + # x=> with fulltext on web + # x=> open access + # x=> not in KBART, in IA + # + # Can probably do the above with two queries: + # - all releases, aggregate count and sum(ref_count) + # - in-scope works, aggregate count by (fulltext, OA, kbart/ia) + + # 2a. release totals + query = { + "size": 0, + "aggs": { + "release_ref_count": { "sum": { "field": "ref_count" } } + } + } + resp = requests.get( + "{}/fatcat_release/_search".format(elastic_host_url), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + resp = resp.json() + stats['release'] = { + "total": resp['hits']['total'], + "refs_total": resp['aggregations']['release_ref_count'], + } + + # 2b. paper counts + query = { + "size": 0, + "query": { + "terms": { "release_type": [ + # "chapter", "thesis", + "article-journal", "paper-conference", + ] } }, + "aggs": { "paper_like": { "filters": { "filters": { + "in_web": { "term": { "in_web": "true" } }, + "is_oa": { "term": { "is_oa": "true" } }, + "in_kbart": { "term": { "in_kbart": "true" } }, + "in_web_not_kbart": { "bool": { "filter": [ + { "term": { "in_web": "true" } }, + { "term": { "in_kbart": "false" } } + ]}} + }}}} + } + resp = requests.get( + "{}/fatcat_release/_search".format(elastic_host_url), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + print(resp.json()) + resp = resp.json() + buckets = resp['aggregations']['paper_like']['buckets'] + stats['papers'] = { + 'total': resp['hits']['total'], + 'in_web': buckets['in_web']['doc_count'], + 'is_oa': buckets['is_oa']['doc_count'], + 'in_kbart': buckets['in_kbart']['doc_count'], + 'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'], + } + + # 3. containers + # => total count + query = { + "size": 0, + } + resp = requests.get( + "{}/fatcat_container/_search".format(elastic_host_url), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + resp.raise_for_status() + resp = resp.json() + stats['container'] = { + "total": resp['hits']['total'], + } + + return stats + +def print_stats(stats): + latest_changelog = stats['changelog']['latest'] + print("Latest changelog: {} ({})".format( + latest_changelog['index'], + latest_changelog['timestamp'])) + print(stats) + +stats = {} +stats.update(get_changelog_stats()) +stats.update(get_elastic_entity_stats()) +print_stats(stats) + +def get_elastic_container_stats(issnl): + """ + TODO: container_id, not issnl + + Returns dict: + total + in_web + preserved + """ + + query = { + "size": 0, + "query": { + "term": { "container_issnl": issnl } + }, + "aggs": { "container_stats": { "filters": { "filters": { + "in_web": { "term": { "in_web": "true" } }, + "is_preserved": { "term": { "is_preserved": "true" } }, + }}}} + } + resp = requests.get( + "{}/fatcat_release/_search".format(elastic_host_url), + json=query, + params=dict(request_cache="true")) + # TODO: abort() + print(resp.json()) + resp.raise_for_status() + resp = resp.json() + buckets = resp['aggregations']['container_stats']['buckets'] + stats = { + 'issnl': issnl, + 'total': resp['hits']['total'], + 'in_web': buckets['in_web']['doc_count'], + 'is_preserved': buckets['is_preserved']['doc_count'], + } + + return stats + +print(get_elastic_container_stats("0140-6736")) |