From 8be6905af1d3637742b76b8f0de44471b2b90759 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 8 Apr 2020 12:35:27 -0700
Subject: refactor search to use elasticsearch-dsl

---
 fatcat_covid19/search.py  | 130 ++++++++++++++++++++++------------------------
 fatcat_covid19/webface.py |   2 +
 2 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/fatcat_covid19/search.py b/fatcat_covid19/search.py
index 0baeb6d..82d5c60 100644
--- a/fatcat_covid19/search.py
+++ b/fatcat_covid19/search.py
@@ -8,11 +8,14 @@ already have a WIP branch for this in fatcat repo.
 import json
 import datetime
-import requests
-from flask import abort, flash
+from flask import abort
 
 from fatcat_covid19.webface import app
 
-def do_search(index, request, limit=25, offset=0, deep_page_limit=2000):
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+
+
+def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
 
     # Sanity checks
     if limit > 100:
@@ -23,33 +26,33 @@ def do_search(index, request, limit=25, offset=0, deep_page_limit=2000):
         # Avoid deep paging problem.
         offset = deep_page_limit
 
-    request["size"] = int(limit)
-    request["from"] = int(offset)
-    # print(request)
-    resp = requests.get("%s/%s/_search" %
-            (app.config['ELASTICSEARCH_BACKEND'], index),
-        json=request)
-
-    if resp.status_code == 400:
-        print("elasticsearch 400: " + str(resp.content))
-        #flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
-        abort(resp.status_code)
-    elif resp.status_code != 200:
-        print("elasticsearch non-200 status code: " + str(resp.status_code))
-        print(resp.content)
-        abort(resp.status_code)
-
-    content = resp.json()
-    #print(json.dumps(content, indent=2))
+    search = search[int(offset):int(offset)+int(limit)]
+
+    try:
+        resp = search.execute()
+    except elasticsearch.exceptions.RequestError as e:
+        # this is a "user" error
+        print("elasticsearch 400: " + str(e.info))
+        #flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason']))
+        abort(e.status_code)
+    except elasticsearch.exceptions.TransportError as e:
+        # all other errors
+        print("elasticsearch non-200 status code: {}".format(e.info))
+        flash("Elasticsearch error: {}".format(e.error))
+        abort(e.status_code)
+
+    # convert from objects to python dicts
     results = []
-    for h in content['hits']['hits']:
-        r = h['_source']
+    for h in resp:
+        r = h._d_
+        #print(json.dumps(h.meta._d_, indent=2))
         r['_highlights'] = []
-        highlights = h.get('highlight', {})
-        for k in highlights:
-            r['_highlights'] += highlights[k]
+        if 'highlight' in dir(h.meta):
+            highlights = h.meta.highlight._d_
+            for k in highlights:
+                r['_highlights'] += highlights[k]
         results.append(r)
-    #print(json.dumps(results, indent=2))
+
     for h in results:
         # Handle surrogate strings that elasticsearch returns sometimes,
         # probably due to mangled data processing in some pipeline.
@@ -58,60 +61,51 @@ def do_search(index, request, limit=25, offset=0, deep_page_limit=2000):
             if type(h[key]) is str:
                 h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
 
-    return {"count_returned": len(results),
-            "count_found": content['hits']['total'],
-            "results": results,
-            "offset": offset,
-            "deep_page_limit": deep_page_limit}
+    return {
+        "count_returned": len(results),
+        "count_found": int(resp.hits.total),
+        "results": results,
+        "offset": offset,
+        "limit": limit,
+        "deep_page_limit": deep_page_limit,
+        "query_time_ms": int(resp.took),
+    }
 
 def do_fulltext_search(q, limit=25, offset=0):
-    #print("Search hit: " + q)
-    if limit > 100:
-        # Sanity check
-        limit = 100
-
     # Convert raw DOIs to DOI queries
     if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
         q = 'doi:"{}"'.format(q)
 
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_FULLTEXT_INDEX'])
+    search = search.query(
+        'query_string',
+        query=q,
+        default_operator="AND",
+        analyze_wildcard=True,
+        lenient=True,
+        fields=[
+            "everything",
+            "abstract",
+            "fulltext.body",
+            "fulltext.annex",
+        ],
+    )
+    search = search.highlight(
+        "abstract",
+        "fulltext.body",
+        "fulltext.annex",
+        number_of_fragments=3,
+        fragment_size=150,
+    )
 
-    search_request = {
-        "query": {
-            "query_string": {
-                "query": q,
-                "default_operator": "AND",
-                "analyze_wildcard": True,
-                "lenient": True,
-                "fields": [
-                    "everything",
-                    "abstract",
-                    "fulltext.body",
-                    "fulltext.annex",
-                ],
-            },
-        },
-        "highlight" : {
-            "number_of_fragments" : 3,
-            "fragment_size" : 150,
-            "fields" : {
-                "abstract": { },
-                "fulltext.body": { },
-                "fulltext.annex": { },
-                #"everything": { "number_of_fragments" : 3 },
-                #"fulltext.abstract": { "number_of_fragments" : 3 },
-                #"fulltext.body": { "number_of_fragments" : 3 },
-                #"fulltext.annex": { "number_of_fragments" : 3 },
-            },
-        },
-    }
+    resp = generic_search_execute(search, offset=offset)
 
-    resp = do_search(app.config['ELASTICSEARCH_FULLTEXT_INDEX'], search_request, offset=offset)
     for h in resp['results']:
         # Ensure 'contrib_names' is a list, not a single string
         if type(h['contrib_names']) is not list:
            h['contrib_names'] = [h['contrib_names'], ]
         h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+
     resp["query"] = { "q": q }
-    resp["limit"] = limit
     return resp
 
diff --git a/fatcat_covid19/webface.py b/fatcat_covid19/webface.py
index eb7d4c0..662a2f0 100644
--- a/fatcat_covid19/webface.py
+++ b/fatcat_covid19/webface.py
@@ -9,6 +9,7 @@ import subprocess
 
 from flask import Flask, Blueprint, g, app, render_template, request
 from flask_babel import Babel, gettext
 from flask.logging import create_logger
+import elasticsearch
 import sentry_sdk
 from sentry_sdk.integrations.flask import FlaskIntegration
@@ -49,6 +50,7 @@ app = Flask(__name__, static_url_path='/static')
 app.config.from_object(BaseConfig())
 app.log = create_logger(app)
 babel = Babel(app)
+app.es_client = elasticsearch.Elasticsearch(app.config['ELASTICSEARCH_BACKEND'])
 
 from fatcat_covid19.search import *
-- 
cgit v1.2.3
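
Note: below is a minimal, self-contained sketch of the elasticsearch-dsl flow this patch adopts, for reference only; it is not part of the commit. The host URL, index name, and query string are placeholders (the webface wires the real values from the ELASTICSEARCH_BACKEND and ELASTICSEARCH_FULLTEXT_INDEX config), and it assumes a 6.x elasticsearch/elasticsearch-dsl client, consistent with the int(resp.hits.total) usage above.

    import elasticsearch
    from elasticsearch_dsl import Search

    # placeholder client; the patch attaches this to the Flask app as app.es_client
    es_client = elasticsearch.Elasticsearch("http://localhost:9200")

    search = (
        Search(using=es_client, index="fulltext")  # placeholder index name
        # the same 'query_string' query that do_fulltext_search() builds
        .query(
            "query_string",
            query="coronavirus",
            default_operator="AND",
            analyze_wildcard=True,
            lenient=True,
            fields=["everything", "abstract", "fulltext.body", "fulltext.annex"],
        )
        .highlight("abstract", "fulltext.body", number_of_fragments=3, fragment_size=150)
    )

    # slicing maps to from/size, replacing the hand-built pagination params
    search = search[0:25]

    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e:
        # 4xx: the query itself was malformed (a "user" error)
        print("elasticsearch 400: {}".format(e.info))
    except elasticsearch.exceptions.TransportError as e:
        # any other backend failure; RequestError subclasses TransportError, so it must come first
        print("elasticsearch error: {}".format(e.info))
    else:
        print("{} hits in {}ms".format(int(resp.hits.total), resp.took))
        for hit in resp:
            row = hit._d_  # the raw _source document as a plain dict
            # hits without highlights raise on attribute access, hence the dir() check
            if "highlight" in dir(hit.meta):
                row["_highlights"] = [frag for frags in hit.meta.highlight._d_.values() for frag in frags]
            print(row.get("title"))

The slicing-based pagination is the main ergonomic win: generic_search_execute() can clamp limit/offset and then apply them uniformly to any Search object, regardless of which index or query built it.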