"""
Helpers to make elasticsearch queries.

Queries are built with the elasticsearch-dsl ``Search`` object and executed
against the client attached to the Flask app (``app.es_client``).

NOTE(review): an older TODO here proposed switching from requests+json to the
elasticsearch-dsl library; the code below already uses elasticsearch-dsl, so
that TODO looks resolved -- confirm and drop this note.
"""

import json
import datetime
from flask import abort, flash
from fatcat_covid19.webface import app

import elasticsearch
from elasticsearch_dsl import Search, Q


def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
    """
    Run an elasticsearch-dsl search with pagination applied and return the
    hits as plain python dicts.

    Args:
        search: elasticsearch-dsl ``Search`` object, already carrying its
            query, filters and highlight configuration.
        limit: maximum number of hits to return; capped at 100.
        offset: index of the first hit to return; clamped to the range
            ``0..deep_page_limit``.
        deep_page_limit: largest offset that will be sent to elasticsearch.

    Returns:
        dict with keys ``count_returned``, ``count_found``, ``results``
        (list of hit dicts, each with an added ``_highlights`` list),
        ``offset``, ``limit``, ``deep_page_limit`` and ``query_time_ms``.
        ``offset`` and ``limit`` are the values actually used, after
        coercion and clamping.

    On an elasticsearch ``RequestError`` or ``TransportError`` the current
    request is aborted (``flask.abort``) with the error's status code.
    """

    # Coerce before comparing, so int-like values (eg, strings taken from
    # query parameters) pass through the sanity checks instead of failing on
    # the comparison.
    limit = int(limit)
    offset = int(offset)

    # Sanity checks
    if limit > 100:
        limit = 100
    if offset < 0:
        offset = 0
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    search = search[offset:offset + limit]

    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e:
        # this is a "user" error
        print("elasticsearch 400: " + str(e.info))
        # (disabled) a flash() telling the user the query failed to parse,
        # including e.info['error']['root_cause'][0]['reason']
        abort(e.status_code)
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        print("elasticsearch non-200 status code: {}".format(e.info))
        flash("Elasticsearch error: {}".format(e.error))
        abort(e.status_code)

    # convert from objects to python dicts
    results = []
    for h in resp:
        r = h._d_
        # flatten the per-field highlight fragments into a single list
        r['_highlights'] = []
        if 'highlight' in dir(h.meta):
            highlights = h.meta.highlight._d_
            for k in highlights:
                r['_highlights'] += highlights[k]
        results.append(r)

    for h in results:
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key in h:
            if isinstance(h[key], str):
                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')

    return {
        "count_returned": len(results),
        "count_found": int(resp.hits.total),
        "results": results,
        "offset": offset,
        "limit": limit,
        "deep_page_limit": deep_page_limit,
        "query_time_ms": int(resp.took),
    }


def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
    """
    Run a fulltext query against the index named by the app config key
    ``ELASTICSEARCH_FULLTEXT_INDEX``.

    Args:
        q: query string, passed to an elasticsearch ``query_string`` query.
            A single token that looks like a raw DOI (starts with ``10.``
            and contains a ``/``) is rewritten to a ``doi:"..."`` query.
        limit: maximum number of hits to return (see
            ``generic_search_execute`` for the cap).
        offset: index of the first hit to return.
        filter_time: one of ``"past_week"``, ``"this_year"``,
            ``"since_2000"``, ``"before_1925"``, ``"all_time"`` or None.
        filter_type: one of ``"papers"``, ``"reports"``, ``"datasets"``,
            ``"everything"`` or None.

    Returns:
        The dict produced by ``generic_search_execute``, with an extra
        ``query`` key holding ``{"q": q}`` (the possibly rewritten query).
        Every result has ``contrib_names`` as a list of strings.

    Any other ``filter_time`` or ``filter_type`` value aborts the request
    with HTTP 400.
    """

    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_FULLTEXT_INDEX'])

    # type filters
    if filter_type == "papers":
        search = search.filter("terms", release_type=[
            "article-journal",
            "paper-conference",
            "chapter",
        ])
    elif filter_type == "reports":
        search = search.filter("terms", release_type=[
            "report",
            "standard",
        ])
    elif filter_type == "datasets":
        search = search.filter("terms", release_type=[
            "dataset",
            "software",
        ])
    elif filter_type == "everything" or filter_type is None:
        pass
    else:
        abort(400)

    # time filters
    if filter_time == "past_week":
        week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
        search = search.filter("range", release_date=dict(gte=week_ago_date))
    elif filter_time == "this_year":
        search = search.filter("term", release_year=datetime.date.today().year)
    elif filter_time == "since_2000":
        search = search.filter("range", release_year=dict(gte=2000))
    elif filter_time == "before_1925":
        search = search.filter("range", release_year=dict(lte=1924))
    elif filter_time == "all_time" or filter_time is None:
        pass
    else:
        abort(400)

    search = search.query(
        'query_string',
        query=q,
        default_operator="AND",
        analyze_wildcard=True,
        lenient=True,
        fields=[
            "everything",
            "abstract",
            "fulltext.body",
            "fulltext.annex",
        ],
    )
    search = search.highlight(
        "abstract",
        "fulltext.body",
        "fulltext.annex",
        number_of_fragments=3,
        fragment_size=150,
    )

    # Forward the caller's limit as well as the offset; previously only the
    # offset was passed on, so the limit argument had no effect.
    resp = generic_search_execute(search, limit=limit, offset=offset)

    for h in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string. A missing
        # or null field becomes an empty list rather than raising.
        names = h.get('contrib_names')
        if names is None:
            names = []
        elif not isinstance(names, list):
            names = [names, ]
        # same surrogate scrubbing as for top-level string fields
        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in names]

    resp["query"] = {"q": q}
    return resp