""" Helpers for doing elasticsearch queries (used in the web interface; not part of the formal API) TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded """ import datetime import requests from flask import abort, flash from fatcat_web import app def do_search(index, request, limit=30, offset=0, deep_page_limit=2000): # Sanity checks if limit > 100: limit = 100 if offset < 0: offset = 0 if offset > deep_page_limit: # Avoid deep paging problem. offset = deep_page_limit request["size"] = int(limit) request["from"] = int(offset) # print(request) resp = requests.get("%s/%s/_search" % (app.config['ELASTICSEARCH_BACKEND'], index), json=request) if resp.status_code == 400: print("elasticsearch 400: " + str(resp.content)) flash("Search query failed to parse; you might need to use quotes.
{}
".format(resp.content))
abort(resp.status_code)
elif resp.status_code != 200:
print("elasticsearch non-200 status code: " + str(resp.status_code))
print(resp.content)
abort(resp.status_code)
content = resp.json()
results = [h['_source'] for h in content['hits']['hits']]
for h in results:
# Handle surrogate strings that elasticsearch returns sometimes,
# probably due to mangled data processing in some pipeline.
# "Crimes against Unicode"; production workaround
for key in h:
if type(h[key]) is str:
h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
return {"count_returned": len(results),
"count_found": content['hits']['total'],
"results": results,
"offset": offset,
"deep_page_limit": deep_page_limit}
def do_release_search(q, limit=30, fulltext_only=True, offset=0):
    """
    Search the release index with a query-string query over the "biblio" field.

    q: user query string. A single token starting with "10." and containing
        a "/" is treated as a raw DOI and rewritten to a `doi:"..."` query.
    limit: page size; capped at 100 and forwarded to do_search().
    fulltext_only: when true, " in_web:true" is appended to the query.
    offset: paging offset, forwarded to do_search().

    Returns the dict from do_search(), with two extra keys: "query" (a dict
    holding the final query string under "q") and "limit". In every result,
    'contrib_names' is normalized to a list of strings.
    """

    # Sanity check
    if limit > 100:
        limit = 100

    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    if fulltext_only:
        q += " in_web:true"

    search_request = {
        "query": {
            "query_string": {
                "query": q,
                "default_operator": "AND",
                "analyze_wildcard": True,
                "lenient": True,
                "fields": ["biblio"],
            },
        },
    }

    # Pass limit through, so the page size actually used matches the "limit"
    # reported back below (same as do_container_search does).
    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request,
        limit=limit, offset=offset)

    for h in resp['results']:
        names = h.get('contrib_names')
        if names is None:
            # missing or null field: present an empty list instead of crashing
            names = []
        elif not isinstance(names, list):
            # Ensure 'contrib_names' is a list, not a single string
            names = [names]
        # Drop anything that can not be encoded as UTF-8 (eg, lone surrogates)
        h['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8') for name in names]

    resp["query"] = { "q": q }
    resp["limit"] = limit
    return resp
def do_container_search(q, limit=30, offset=0):
    """
    Search the container index with a query-string query over the "biblio"
    field.

    A query that is a single 9-character token with four leading digits
    followed by a dash is treated as a raw ISSN-L and rewritten to an
    `issnl:"..."` query.

    Returns the dict from do_search(), plus "query" (a dict holding the final
    query string under "q") and "limit" (as passed in).
    """

    # Convert raw ISSN-L to ISSN-L query
    is_single_token = len(q.split()) == 1
    if is_single_token and len(q) == 9 and q[:4].isdigit() and q[4] == '-':
        q = 'issnl:"{}"'.format(q)

    query_string = {
        "query": q,
        "default_operator": "AND",
        "analyze_wildcard": True,
        "lenient": True,
        "fields": ["biblio"],
    }
    search_request = {"query": {"query_string": query_string}}

    index_name = app.config['ELASTICSEARCH_CONTAINER_INDEX']
    resp = do_search(index_name, search_request, limit=limit, offset=offset)
    resp["query"] = {"q": q}
    resp["limit"] = limit
    return resp
def get_elastic_entity_stats():
    """
    Collects catalog-wide counts using three aggregate elasticsearch queries.

    TODO: files, filesets, webcaptures (no schema yet)

    Returns dict:
        release: {total, refs_total}
        papers: {total, in_web, is_oa, in_kbart, in_web_not_kbart}
        container: {total}

    Any non-success HTTP response is raised via raise_for_status().
    """

    backend = app.config['ELASTICSEARCH_BACKEND']
    cache_params = {"request_cache": "true"}
    stats = {}

    # 2. releases
    #  - total count
    #  - total citation records
    #  - total (paper, chapter, proceeding)
    #  - " with fulltext on web
    #  - " open access
    #  - " not in KBART, in IA
    #
    # Can do the above with two queries:
    #  - all releases, aggregate count and sum(ref_count)
    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia)

    # 2a. release totals
    release_query = {
        "size": 0,
        "aggs": {
            "release_ref_count": {"sum": {"field": "ref_count"}},
        },
    }
    release_resp = requests.get(
        "{}/fatcat_release/_search".format(backend),
        json=release_query,
        params=cache_params)
    # TODO: abort()
    release_resp.raise_for_status()
    release_body = release_resp.json()
    stats['release'] = {
        "total": release_body['hits']['total'],
        "refs_total": int(release_body['aggregations']['release_ref_count']['value']),
    }

    # 2b. paper counts
    paper_filters = {
        "in_web": {"term": {"in_web": "true"}},
        "is_oa": {"term": {"is_oa": "true"}},
        "in_kbart": {"term": {"in_kbart": "true"}},
        "in_web_not_kbart": {"bool": {"filter": [
            {"term": {"in_web": "true"}},
            {"term": {"in_kbart": "false"}},
        ]}},
    }
    papers_query = {
        "size": 0,
        "query": {
            # "chapter" and "thesis" are currently left out of this list
            "terms": {"release_type": ["article-journal", "paper-conference"]},
        },
        "aggs": {"paper_like": {"filters": {"filters": paper_filters}}},
    }
    papers_resp = requests.get(
        "{}/fatcat_release/_search".format(backend),
        json=papers_query,
        params=cache_params)
    # TODO: abort()
    papers_resp.raise_for_status()
    papers_body = papers_resp.json()
    buckets = papers_body['aggregations']['paper_like']['buckets']
    papers = {'total': papers_body['hits']['total']}
    for name in ('in_web', 'is_oa', 'in_kbart', 'in_web_not_kbart'):
        papers[name] = buckets[name]['doc_count']
    stats['papers'] = papers

    # 3. containers
    #   => total count
    container_resp = requests.get(
        "{}/fatcat_container/_search".format(backend),
        json={"size": 0},
        params=cache_params)
    # TODO: abort()
    container_resp.raise_for_status()
    container_body = container_resp.json()
    stats['container'] = {"total": container_body['hits']['total']}

    return stats
def get_elastic_container_stats(ident, issnl=None):
    """
    Counts the releases of a single container, broken down by a few flags.

    `ident` is matched against the release "container_id" field; `issnl` is
    not used in the query and is only copied into the result.

    Returns dict:
        ident
        issnl (optional)
        total
        in_web
        in_kbart
        is_preserved

    Any non-success HTTP response is raised via raise_for_status().
    """

    flag_names = ('in_web', 'in_kbart', 'is_preserved')
    # one named filter bucket per boolean flag
    flag_filters = {}
    for name in flag_names:
        flag_filters[name] = {"term": {name: "true"}}

    query = {
        "size": 0,
        "query": {"term": {"container_id": ident}},
        "aggs": {"container_stats": {"filters": {"filters": flag_filters}}},
    }
    search_url = "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND'])
    resp = requests.get(search_url, json=query, params={"request_cache": "true"})
    # TODO: abort()
    resp.raise_for_status()
    body = resp.json()

    buckets = body['aggregations']['container_stats']['buckets']
    stats = {
        'ident': ident,
        'issnl': issnl,
        'total': body['hits']['total'],
    }
    for name in flag_names:
        stats[name] = buckets[name]['doc_count']
    return stats
def get_elastic_container_random_releases(ident, limit=5):
    """
    Returns a list of release documents (hit '_source' dicts) from the
    container.

    Despite the name, the query is sorted: "in_web" descending first, then
    "release_date" descending. Releases with a "release_year" after the
    current year are excluded.

    `limit` must be between 1 and 100 (checked with an assert).
    Any non-success HTTP response is raised via raise_for_status().
    """

    assert 0 < limit <= 100

    in_container = {"term": {"container_id": ident}}
    not_in_future = {
        "range": {"release_year": {"lte": datetime.datetime.today().year}},
    }
    query = {
        "size": int(limit),
        "sort": [
            {"in_web": {"order": "desc"}},
            {"release_date": {"order": "desc"}},
        ],
        "query": {"bool": {"must": [in_container, not_in_future]}},
    }

    search_url = "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND'])
    resp = requests.get(search_url, json=query, params={"request_cache": "true"})
    # TODO: abort()
    resp.raise_for_status()
    body = resp.json()

    hits = [hit['_source'] for hit in body['hits']['hits']]
    for doc in hits:
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        cleaned = {
            key: value.encode('utf8', 'ignore').decode('utf8')
            for key, value in doc.items()
            if type(value) is str
        }
        doc.update(cleaned)
    return hits
def get_elastic_container_histogram(ident):
    """
    Fetches a stacked histogram of a container's releases, bucketed by
    release year and the "in_ia" flag.

    Filters to the past 500 years (at most), or about 1000 values (which is
    also the size of the composite aggregation).

    Returns a sorted list of tuples:
        (year, in_ia, count)

    Any non-success HTTP response is raised via raise_for_status().
    """

    # one composite bucket per (release year, in_ia) pair
    year_source = {
        "year": {"histogram": {"field": "release_year", "interval": 1}},
    }
    in_ia_source = {
        "in_ia": {"terms": {"field": "in_ia"}},
    }
    aggregations = {
        "year_in_ia": {
            "composite": {
                "size": 1000,
                "sources": [year_source, in_ia_source],
            },
        },
    }

    year_window = {
        "range": {
            "release_year": {
                "gte": datetime.datetime.today().year - 499,
                "lte": datetime.datetime.today().year,
            },
        },
    }
    container_match = {
        "bool": {
            "should": [{"match": {"container_id": ident}}],
            "minimum_should_match": 1,
        },
    }
    query = {
        "aggs": aggregations,
        "size": 0,
        "query": {
            "bool": {
                "must": [year_window],
                "filter": [container_match],
            },
        },
    }

    search_url = "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND'])
    resp = requests.get(search_url, json=query, params={"request_cache": "true"})
    resp.raise_for_status()
    # TODO: abort()
    body = resp.json()

    buckets = body['aggregations']['year_in_ia']['buckets']
    return sorted(
        (bucket['key']['year'], bucket['key']['in_ia'], bucket['doc_count'])
        for bucket in buckets)