#!/usr/bin/env python3
"""
Print summary statistics about a fatcat catalog.

Combines the latest changelog entry (fetched through the fatcat API client)
with aggregate counts fetched from the elasticsearch search indices, and
prints the results to stdout.
"""

import datetime
import sys

import requests

from fatcat_tools import public_api

#api_host_url = "http://localhost:9411/v0"
api_host_url = "https://api.fatcat.wiki/v0"
# NOTE: must not have trailing slash
elastic_host_url = "https://search.fatcat.wiki"

api = public_api(api_host_url)


def _elastic_search(index, query):
    """
    Run a search query against one elasticsearch index.

    Sends `query` as the JSON body of a GET request to
    `{elastic_host_url}/{index}/_search`, with `request_cache=true` passed as
    a URL parameter.

    Returns the decoded JSON response body. Raises requests.HTTPError (via
    raise_for_status()) on a 4xx/5xx response.
    """
    resp = requests.get(
        "{}/{}/_search".format(elastic_host_url, index),
        json=query,
        params=dict(request_cache="true"))
    # TODO: abort()
    # Check the status before decoding, so that an error response with a
    # non-JSON body surfaces as an HTTP error and not as a decode error.
    resp.raise_for_status()
    return resp.json()


def get_changelog_stats():
    """
    Fetch the most recent changelog entry from the fatcat API.

    Returns dict:
        changelog: {latest: {index, timestamp}}

    where timestamp is an ISO 8601 formatted string.
    """
    stats = {}

    # 1. latest changelog
    latest_changelog = api.get_changelog(limit=1)[0]
    stats['changelog'] = {"latest": {
        "index": latest_changelog.index,
        "timestamp": latest_changelog.timestamp.isoformat(),
    }}
    return stats


def get_elastic_entity_stats():
    """
    Fetch aggregate release, paper, and container counts from elasticsearch.

    TODO: files, filesets, webcaptures (no schema yet)

    Returns dict:
        release: {total, refs_total}
        papers: {total, in_web, is_oa, in_kbart, in_web_not_kbart}
        container: {total}
    """
    stats = {}

    # 2. releases
    #  x=> total count
    #  x=> total citation records
    #  x=> total (paper, chapter, proceeding)
    #  x=> with fulltext on web
    #  x=> open access
    #  x=> not in KBART, in IA
    #
    # Can probably do the above with two queries:
    #  - all releases, aggregate count and sum(ref_count)
    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia)

    # 2a. release totals
    query = {
        "size": 0,
        "aggs": {
            "release_ref_count": {"sum": {"field": "ref_count"}},
        },
    }
    resp = _elastic_search("fatcat_release", query)
    stats['release'] = {
        "total": resp['hits']['total'],
        # A "sum" aggregation is returned as an object ({"value": N});
        # unwrap it so refs_total is a plain number like "total".
        "refs_total": int(resp['aggregations']['release_ref_count']['value']),
    }

    # 2b. paper counts
    query = {
        "size": 0,
        "query": {
            "terms": {"release_type": [
                # NOTE(review): presumably "chapter" and "thesis" are both
                # disabled here; confirm which entries were commented out.
                # "chapter", "thesis",
                "article-journal", "paper-conference",
            ]},
        },
        "aggs": {"paper_like": {"filters": {"filters": {
            "in_web": {"term": {"in_web": "true"}},
            "is_oa": {"term": {"is_oa": "true"}},
            "in_kbart": {"term": {"in_kbart": "true"}},
            "in_web_not_kbart": {"bool": {"filter": [
                {"term": {"in_web": "true"}},
                {"term": {"in_kbart": "false"}},
            ]}},
        }}}},
    }
    resp = _elastic_search("fatcat_release", query)
    # Debug output: dump the raw search response.
    print(resp)
    buckets = resp['aggregations']['paper_like']['buckets']
    stats['papers'] = {
        'total': resp['hits']['total'],
        'in_web': buckets['in_web']['doc_count'],
        'is_oa': buckets['is_oa']['doc_count'],
        'in_kbart': buckets['in_kbart']['doc_count'],
        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
    }

    # 3. containers
    #  => total count
    query = {
        "size": 0,
    }
    resp = _elastic_search("fatcat_container", query)
    stats['container'] = {
        "total": resp['hits']['total'],
    }

    return stats


def print_stats(stats):
    """
    Print a one-line summary of the latest changelog entry, then the full
    stats dict.

    Expects `stats` to contain the 'changelog' key as returned by
    get_changelog_stats().
    """
    latest_changelog = stats['changelog']['latest']
    print("Latest changelog: {} ({})".format(
        latest_changelog['index'],
        latest_changelog['timestamp']))
    print(stats)


def get_elastic_container_stats(issnl):
    """
    Fetch release counts for a single container, looked up by ISSN-L.

    TODO: container_id, not issnl

    Returns dict:
        issnl
        total
        in_web
        is_preserved
    """
    query = {
        "size": 0,
        "query": {
            "term": {"container_issnl": issnl},
        },
        "aggs": {"container_stats": {"filters": {"filters": {
            "in_web": {"term": {"in_web": "true"}},
            "is_preserved": {"term": {"is_preserved": "true"}},
        }}}},
    }
    resp = _elastic_search("fatcat_release", query)
    # Debug output: dump the raw search response (only reached once the HTTP
    # status has been checked).
    print(resp)
    buckets = resp['aggregations']['container_stats']['buckets']
    stats = {
        'issnl': issnl,
        'total': resp['hits']['total'],
        'in_web': buckets['in_web']['doc_count'],
        'is_preserved': buckets['is_preserved']['doc_count'],
    }
    return stats


# Script body. NOTE: these statements run at module level (there is no
# __main__ guard), so importing this file also performs the requests below.
stats = {}
stats.update(get_changelog_stats())
stats.update(get_elastic_entity_stats())
print_stats(stats)

print(get_elastic_container_stats("0140-6736"))