""" Helpers to make elasticsearch queries. TODO: switch to using elasticsearch-dsl library instead of requests+json. already have a WIP branch for this in fatcat repo. """ import json import datetime import requests from flask import abort, flash from fatcat_covid19.webface import app def do_search(index, request, limit=30, offset=0, deep_page_limit=2000): # Sanity checks if limit > 100: limit = 100 if offset < 0: offset = 0 if offset > deep_page_limit: # Avoid deep paging problem. offset = deep_page_limit request["size"] = int(limit) request["from"] = int(offset) # print(request) resp = requests.get("%s/%s/_search" % (app.config['ELASTICSEARCH_BACKEND'], index), json=request) if resp.status_code == 400: print("elasticsearch 400: " + str(resp.content)) #flash("Search query failed to parse; you might need to use quotes.

{}".format(resp.content)) abort(resp.status_code) elif resp.status_code != 200: print("elasticsearch non-200 status code: " + str(resp.status_code)) print(resp.content) abort(resp.status_code) content = resp.json() #print(json.dumps(content, indent=2)) results = [] for h in content['hits']['hits']: r = h['_source'] r['_highlights'] = [] highlights = h.get('highlight', {}) for k in highlights: r['_highlights'] += highlights[k] results.append(r) #print(json.dumps(results, indent=2)) for h in results: # Handle surrogate strings that elasticsearch returns sometimes, # probably due to mangled data processing in some pipeline. # "Crimes against Unicode"; production workaround for key in h: if type(h[key]) is str: h[key] = h[key].encode('utf8', 'ignore').decode('utf8') return {"count_returned": len(results), "count_found": content['hits']['total'], "results": results, "offset": offset, "deep_page_limit": deep_page_limit} def do_fulltext_search(q, limit=30, offset=0): #print("Search hit: " + q) if limit > 100: # Sanity check limit = 100 # Convert raw DOIs to DOI queries if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: q = 'doi:"{}"'.format(q) search_request = { "query": { "query_string": { "query": q, "default_operator": "AND", "analyze_wildcard": True, "lenient": True, "fields": [ "everything", "abstract", "fulltext.body", "fulltext.annex", ], }, }, "highlight" : { "number_of_fragments" : 3, "fragment_size" : 150, "fields" : { "abstract": { }, "fulltext.body": { }, "fulltext.annex": { }, #"everything": { "number_of_fragments" : 3 }, #"fulltext.abstract": { "number_of_fragments" : 3 }, #"fulltext.body": { "number_of_fragments" : 3 }, #"fulltext.annex": { "number_of_fragments" : 3 }, }, }, } resp = do_search(app.config['ELASTICSEARCH_FULLTEXT_INDEX'], search_request, offset=offset) for h in resp['results']: # Ensure 'contrib_names' is a list, not a single string if type(h['contrib_names']) is not list: h['contrib_names'] = [h['contrib_names'], ] h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] resp["query"] = { "q": q } resp["limit"] = limit return resp