about | summary | refs | log | tree | commit | diff | stats
path: root/extra/elasticsearch/stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'extra/elasticsearch/stats.py')
-rw-r--r-- extra/elasticsearch/stats.py 174
1 files changed, 174 insertions, 0 deletions
diff --git a/extra/elasticsearch/stats.py b/extra/elasticsearch/stats.py
new file mode 100644
index 00000000..b6d1f8a5
--- /dev/null
+++ b/extra/elasticsearch/stats.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+
+import sys
+import requests
+import datetime
+from fatcat_tools import public_api
+
+# Base URL handed to public_api() below; the commented-out line is an
+# alternate endpoint on localhost.
+#api_host_url = "http://localhost:9411/v0"
+api_host_url = "https://api.fatcat.wiki/v0"
+
+# Base URL for the search queries; "/<index>/_search" is appended to it.
+# NOTE: must not have trailing slash
+elastic_host_url = "https://search.fatcat.wiki"
+
+# Module-level API client, used by get_changelog_stats()
+api = public_api(api_host_url)
+
+def get_changelog_stats():
+    """
+    Looks up the latest changelog entry through the module-level `api` client.
+
+    Returns dict:
+        changelog: {latest: {index, timestamp}}
+
+    `timestamp` is the entry's timestamp rendered with isoformat().
+    """
+
+    # 1. latest changelog: request a single entry and take the first result
+    newest = api.get_changelog(limit=1)[0]
+    latest = dict(
+        index=newest.index,
+        timestamp=newest.timestamp.isoformat(),
+    )
+    return {"changelog": {"latest": latest}}
+
+def _elastic_search(index, query):
+    """
+    Sends `query` as the JSON body of a GET to `<elastic_host_url>/<index>/_search`
+    (with request_cache=true) and returns the decoded JSON response.
+
+    Raises requests.HTTPError (from raise_for_status) on an error status.
+    """
+    resp = requests.get(
+        "{}/{}/_search".format(elastic_host_url, index),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    resp.raise_for_status()
+    return resp.json()
+
+def get_elastic_entity_stats():
+    """
+    TODO: files, filesets, webcaptures (no schema yet)
+
+    Runs three size-0 (counts/aggregations only) searches: all releases,
+    paper-like releases, and all containers.
+
+    Returns dict:
+        release: {total, refs_total}
+        papers: {total, in_web, is_oa, in_kbart, in_web_not_kbart}
+        container: {total}
+
+    Raises requests.HTTPError if any of the searches returns an error status.
+    """
+
+    stats = {}
+
+    # 2. releases
+    #  => total count
+    #  => total citation records
+    #  => total (paper, chapter, proceeding)
+    #  => with fulltext on web
+    #  => open access
+    #  => not in KBART, in IA
+    #
+    # Done with two queries:
+    # - all releases, aggregate count and sum(ref_count)
+    # - in-scope works, aggregate count by (fulltext, OA, kbart/ia)
+
+    # 2a. release totals
+    query = {
+        "size": 0,
+        "aggs": {
+            "release_ref_count": { "sum": { "field": "ref_count" } },
+        },
+    }
+    resp = _elastic_search("fatcat_release", query)
+    # NOTE(review): assumes hits.total is returned as a plain number -- confirm
+    # against the elasticsearch version actually deployed
+    stats['release'] = {
+        "total": resp['hits']['total'],
+        # a "sum" aggregation comes back as {"value": <number>}; unwrap it so
+        # callers get the number itself rather than the wrapper object
+        "refs_total": int(resp['aggregations']['release_ref_count']['value']),
+    }
+
+    # 2b. paper counts
+    query = {
+        "size": 0,
+        "query": {
+            "terms": { "release_type": [
+                # "chapter", "thesis",
+                "article-journal", "paper-conference",
+            ] },
+        },
+        # named filters => one bucket (with doc_count) per name below
+        "aggs": { "paper_like": { "filters": { "filters": {
+            "in_web": { "term": { "in_web": "true" } },
+            "is_oa": { "term": { "is_oa": "true" } },
+            "in_kbart": { "term": { "in_kbart": "true" } },
+            "in_web_not_kbart": { "bool": { "filter": [
+                { "term": { "in_web": "true" } },
+                { "term": { "in_kbart": "false" } },
+            ]}},
+        }}}},
+    }
+    resp = _elastic_search("fatcat_release", query)
+    buckets = resp['aggregations']['paper_like']['buckets']
+    stats['papers'] = {
+        'total': resp['hits']['total'],
+        'in_web': buckets['in_web']['doc_count'],
+        'is_oa': buckets['is_oa']['doc_count'],
+        'in_kbart': buckets['in_kbart']['doc_count'],
+        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
+    }
+
+    # 3. containers
+    #  => total count
+    resp = _elastic_search("fatcat_container", { "size": 0 })
+    stats['container'] = {
+        "total": resp['hits']['total'],
+    }
+
+    return stats
+
+def print_stats(stats):
+    """
+    Prints a one-line summary of the latest changelog entry (index and
+    timestamp), followed by the complete `stats` dict.
+
+    Expects `stats['changelog']['latest']` to hold 'index' and 'timestamp'.
+    """
+    latest = stats['changelog']['latest']
+    summary = "Latest changelog: {} ({})".format(
+        latest['index'], latest['timestamp'])
+    print(summary)
+    print(stats)
+
+# Gather changelog stats first, then the search-index stats, into one dict
+# and print the result. Runs at module level.
+stats = {**get_changelog_stats(), **get_elastic_entity_stats()}
+print_stats(stats)
+
+def get_elastic_container_stats(issnl):
+    """
+    TODO: container_id, not issnl
+
+    Counts documents in the fatcat_release index whose container_issnl
+    matches `issnl`, along with how many of those are flagged in_web and
+    is_preserved.
+
+    Returns dict:
+        issnl
+        total
+        in_web
+        is_preserved
+
+    Raises requests.HTTPError if the search returns an error status.
+    """
+
+    query = {
+        "size": 0,
+        "query": {
+            "term": { "container_issnl": issnl },
+        },
+        # named filters => one bucket (with doc_count) per name below
+        "aggs": { "container_stats": { "filters": { "filters": {
+            "in_web": { "term": { "in_web": "true" } },
+            "is_preserved": { "term": { "is_preserved": "true" } },
+        }}}},
+    }
+    resp = requests.get(
+        "{}/fatcat_release/_search".format(elastic_host_url),
+        json=query,
+        params=dict(request_cache="true"))
+    # TODO: abort()
+    # Check the HTTP status before touching the body: decoding a non-JSON
+    # error page first would hide the real HTTP error behind a decode error.
+    resp.raise_for_status()
+    body = resp.json()
+    buckets = body['aggregations']['container_stats']['buckets']
+    # NOTE(review): assumes hits.total is returned as a plain number -- confirm
+    # against the elasticsearch version actually deployed
+    return {
+        'issnl': issnl,
+        'total': body['hits']['total'],
+        'in_web': buckets['in_web']['doc_count'],
+        'is_preserved': buckets['is_preserved']['doc_count'],
+    }
+
+# Ad-hoc example run at module level: print the stats dict for one
+# hard-coded ISSN-L.
+print(get_elastic_container_stats("0140-6736"))