From 46e67e77cf7709dae7375a22a6fed1ac730355bf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 20 Feb 2019 19:58:29 -0800 Subject: refactor entity searches --- python/fatcat_web/routes.py | 16 +-- python/fatcat_web/search.py | 116 ++++++++++------------ python/fatcat_web/templates/container_search.html | 4 + python/fatcat_web/templates/release_search.html | 4 + 4 files changed, 62 insertions(+), 78 deletions(-) (limited to 'python/fatcat_web') diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 7f10ee2b..115c1981 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -353,18 +353,12 @@ def changelog_entry_view(index): @app.route('/release/search', methods=['GET', 'POST']) def release_search(): - limit = 20 query = request.args.get('q') fulltext_only = bool(request.args.get('fulltext_only')) - # Convert raw DOIs to DOI queries - if query is not None: - if len(query.split()) == 1 and query.startswith("10.") and query.count("/") >= 1: - query = 'doi:"{}"'.format(query) - if 'q' in request.args.keys(): # always do files for HTML - found = do_release_search(query, limit=limit, fulltext_only=fulltext_only) + found = do_release_search(query, fulltext_only=fulltext_only) return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only) else: return render_template('release_search.html', query=query, fulltext_only=fulltext_only) @@ -372,17 +366,11 @@ def release_search(): @app.route('/container/search', methods=['GET', 'POST']) def container_search(): - limit = 20 query = request.args.get('q') - # Convert raw ISSN-L to ISSN-L query - if query is not None: - if len(query.split()) == 1 and len(query) == 9 and isdigit(query[0:4]) and query[4] == '-': - query = 'issnl:"{}"'.format(query) - if 'q' in request.args.keys(): # always do files for HTML - found = do_container_search(query, limit=limit) + found = do_container_search(query) return render_template('container_search.html', found=found, query=query) else: return render_template('container_search.html', query=query) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index a301fcb5..d18416d6 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -10,33 +10,18 @@ the formal API) TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded """ -def do_release_search(q, limit=50, fulltext_only=True): - #print("Search hit: " + q) +def do_search(index, request, limit=30): + if limit > 100: # Sanity check limit = 100 - if fulltext_only: - q += " in_web:true" - - search_request = { - "query": { - "query_string": { - "query": q, - "default_operator": "AND", - "analyze_wildcard": True, - "lenient": True, - "fields": ["title^5", "contrib_names^2", "container_title"] - }, - }, - "size": int(limit), - } - - #print(search_request) + request["size"] = int(limit) + #print(request) resp = requests.get("%s/%s/_search" % - (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_RELEASE_INDEX']), - json=search_request) + (app.config['ELASTICSEARCH_BACKEND'], index), + json=request) if resp.status_code == 400: print("elasticsearch 400: " + str(resp.content)) @@ -48,73 +33,76 @@ def do_release_search(q, limit=50, fulltext_only=True): abort(resp.status_code) content = resp.json() - #print(content) results = [h['_source'] for h in content['hits']['hits']] for h in results: - # Ensure 'contrib_names' is a list, not a single string - if type(h['contrib_names']) is not list: - h['contrib_names'] = [h['contrib_names'], ] # Handle surrogate strings that elasticsearch returns sometimes, # probably due to mangled data processing in some pipeline. # "Crimes against Unicode"; production workaround for key in h: if type(h[key]) is str: h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] - found = content['hits']['total'] - return {"query": { "q": q }, - "count_returned": len(results), - "count_found": found, + return {"count_returned": len(results), + "count_found": content['hits']['total'], "results": results } -def do_container_search(q, limit=50): + +def do_release_search(q, limit=30, fulltext_only=True): #print("Search hit: " + q) if limit > 100: # Sanity check limit = 100 + # Convert raw DOIs to DOI queries + if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: + q = 'doi:"{}"'.format(q) + + + if fulltext_only: + q += " in_web:true" + search_request = { "query": { "query_string": { - "query": q, - "default_operator": "AND", - "analyze_wildcard": True, - "lenient": True, - "fields": ["name^5", "publisher"] + "query": q, + "default_operator": "AND", + "analyze_wildcard": True, + "lenient": True, + "fields": ["title^5", "contrib_names^2", "container_title"], }, }, - "size": int(limit), } - #print(search_request) - resp = requests.get("%s/%s/_search" % - (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_CONTAINER_INDEX']), - json=search_request) + resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request) + for h in resp['results']: + # Ensure 'contrib_names' is a list, not a single string + if type(h['contrib_names']) is not list: + h['contrib_names'] = [h['contrib_names'], ] + h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] + resp["query"] = { "q": q } + return resp - if resp.status_code == 400: - print("elasticsearch 400: " + str(resp.content)) - flash("Search query failed to parse; you might need to use quotes.

{}".format(resp.content)) - abort(resp.status_code) - elif resp.status_code != 200: - print("elasticsearch non-200 status code: " + str(resp.status_code)) - print(resp.content) - abort(resp.status_code) - content = resp.json() - #print(content) - results = [h['_source'] for h in content['hits']['hits']] - for h in results: - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. - # "Crimes against Unicode"; production workaround - for key in h: - if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') +def do_container_search(q, limit=30): + + # Convert raw ISSN-L to ISSN-L query + if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-': + q = 'issnl:"{}"'.format(q) + + search_request = { + "query": { + "query_string": { + "query": q, + "default_operator": "AND", + "analyze_wildcard": True, + "lenient": True, + "fields": ["name^5", "publisher"], + }, + }, + } + + resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit) + resp["query"] = { "q": q } + return resp - found = content['hits']['total'] - return {"query": { "q": q }, - "count_returned": len(results), - "count_found": found, - "results": results } diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html index 9fc7aa98..925fbc64 100644 --- a/python/fatcat_web/templates/container_search.html +++ b/python/fatcat_web/templates/container_search.html @@ -33,6 +33,10 @@ {% endif %} {% endfor %} +{% if found.results|length > 8 %} +
+ Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: {{ found.query.q }} +{% endif %} {% else %}

No results found!

diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html index 961bf54f..a7dc3c80 100644 --- a/python/fatcat_web/templates/release_search.html +++ b/python/fatcat_web/templates/release_search.html @@ -46,6 +46,10 @@ {% endif %}
{% endfor %} +{% if found.results|length > 8 %} +
+ Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: {{ found.query.q }} +{% endif %} {% else %}

No results found!

-- cgit v1.2.3