diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_web/__init__.py | 5 | ||||
-rw-r--r-- | python/fatcat_web/routes.py | 44 | ||||
-rw-r--r-- | python/fatcat_web/search.py | 619 | ||||
-rw-r--r-- | python/fatcat_web/templates/container_search.html | 16 | ||||
-rw-r--r-- | python/fatcat_web/templates/entity_macros.html | 10 | ||||
-rw-r--r-- | python/fatcat_web/templates/release_search.html | 20 | ||||
-rw-r--r-- | python/tests/fixtures.py | 4 | ||||
-rw-r--r-- | python/tests/web_search.py | 161 |
8 files changed, 506 insertions, 373 deletions
diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py index 562ffeb2..487de58a 100644 --- a/python/fatcat_web/__init__.py +++ b/python/fatcat_web/__init__.py @@ -11,6 +11,7 @@ from authlib.flask.client import OAuth from loginpass import create_flask_blueprint, Gitlab, GitHub, ORCiD from raven.contrib.flask import Sentry import fatcat_openapi_client +import elasticsearch from fatcat_web.web_config import Config @@ -71,7 +72,9 @@ mwoauth = MWOAuth( mwoauth.handshaker.user_agent = "fatcat.wiki;python_web_interface" app.register_blueprint(mwoauth.bp, url_prefix='/auth/wikipedia') -from fatcat_web import routes, editing_routes, auth, cors, forms # noqa: E402 +app.es_client = elasticsearch.Elasticsearch(Config.ELASTICSEARCH_BACKEND) + +from fatcat_web import routes, editing_routes, auth, cors, forms # TODO: blocking on ORCID support in loginpass if Config.ORCID_CLIENT_ID: diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 2489ac03..4a66b3c2 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -14,7 +14,7 @@ from fatcat_tools.normal import * from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth from fatcat_web.cors import crossdomain -from fatcat_web.search import * +from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram from fatcat_web.entity_helpers import * from fatcat_web.graphics import * from fatcat_web.kafka import * @@ -706,44 +706,22 @@ def generic_search(): @app.route('/release/search', methods=['GET', 'POST']) def release_search(): - query = request.args.get('q') - if not query: - query = '*' - fulltext_only = bool(request.args.get('fulltext_only')) + if 'q' not in request.args.keys(): + return render_template('release_search.html', query=ReleaseQuery(), found=None) - issnl = request.args.get('container_issnl') - if issnl and query: - query += ' container_issnl:"{}"'.format(issnl) - - container_id = request.args.get('container_id') - if container_id and query: - query += ' container_id:"{}"'.format(container_id) - - offset = request.args.get('offset', '0') - offset = max(0, int(offset)) if offset.isnumeric() else 0 - - if 'q' in request.args.keys(): - # always do files for HTML - found = do_release_search(query, fulltext_only=fulltext_only, offset=offset) - return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only) - else: - return render_template('release_search.html', query=query, fulltext_only=fulltext_only) + query = ReleaseQuery.from_args(request.args) + found = do_release_search(query) + return render_template('release_search.html', query=query, found=found) @app.route('/container/search', methods=['GET', 'POST']) def container_search(): - query = request.args.get('q') - if not query: - query = '*' - offset = request.args.get('offset', '0') - offset = max(0, int(offset)) if offset.isnumeric() else 0 + if 'q' not in request.args.keys(): + return render_template('container_search.html', query=GenericQuery(), found=None) - if 'q' in request.args.keys(): - # always do files for HTML - found = do_container_search(query, offset=offset) - return render_template('container_search.html', found=found, query=query) - else: - return render_template('container_search.html', query=query) + query = GenericQuery.from_args(request.args) + found = do_container_search(query) + return render_template('container_search.html', query=query, found=found) def get_changelog_stats(): stats = {} diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 4a87c735..55caa9c5 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -2,118 +2,257 @@ """ Helpers for doing elasticsearch queries (used in the web interface; not part of the formal API) - -TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded """ +import sys import datetime -import requests -from flask import abort, flash +from dataclasses import dataclass +from typing import List, Optional, Any + +import elasticsearch +from elasticsearch_dsl import Search, Q +import elasticsearch_dsl.response + from fatcat_web import app -def do_search(index, request, limit=30, offset=0, deep_page_limit=2000): +@dataclass +class ReleaseQuery: + q: Optional[str] = None + limit: Optional[int] = None + offset: Optional[int] = None + fulltext_only: bool = False + container_id: Optional[str] = None + + @classmethod + def from_args(cls, args) -> 'ReleaseQuery': + + query_str = args.get('q') or '*' + + container_id = args.get('container_id') + # TODO: as filter, not in query string + if container_id: + query_str += ' container_id:"{}"'.format(container_id) + + # TODO: where are container_issnl queries actually used? + issnl = args.get('container_issnl') + if issnl and query_str: + query_str += ' container_issnl:"{}"'.format(issnl) + + offset = args.get('offset', '0') + offset = max(0, int(offset)) if offset.isnumeric() else 0 + + return ReleaseQuery( + q=query_str, + offset=offset, + fulltext_only=bool(args.get('fulltext_only')), + container_id=container_id, + ) + +@dataclass +class GenericQuery: + q: Optional[str] = None + limit: Optional[int] = None + offset: Optional[int] = None + + @classmethod + def from_args(cls, args) -> 'GenericQuery': + query_str = args.get('q') + if not query_str: + query_str = '*' + offset = args.get('offset', '0') + offset = max(0, int(offset)) if offset.isnumeric() else 0 + + return GenericQuery( + q=query_str, + offset=offset, + ) + +@dataclass +class SearchHits: + count_returned: int + count_found: int + offset: int + limit: int + deep_page_limit: int + query_time_ms: int + results: List[Any] + + +def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]: + """ + Takes a response returns all the hits as JSON objects. - # Sanity checks - if limit > 100: - limit = 100 - if offset < 0: - offset = 0 - if offset > deep_page_limit: - # Avoid deep paging problem. - offset = deep_page_limit + Also handles surrogate strings that elasticsearch returns sometimes, + probably due to mangled data processing in some pipeline. "Crimes against + Unicode"; production workaround + """ + + results = [] + for h in response: + r = h._d_ + # print(h.meta._d_) + results.append(r) - request["size"] = int(limit) - request["from"] = int(offset) - # print(request) - resp = requests.get("%s/%s/_search" % - (app.config['ELASTICSEARCH_BACKEND'], index), - json=request) - - if resp.status_code == 400: - print("elasticsearch 400: " + str(resp.content)) - flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content)) - abort(resp.status_code) - elif resp.status_code != 200: - print("elasticsearch non-200 status code: " + str(resp.status_code)) - print(resp.content) - abort(resp.status_code) - - content = resp.json() - results = [h['_source'] for h in content['hits']['hits']] for h in results: - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. - # "Crimes against Unicode"; production workaround for key in h: if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + h[key] = h[key].encode("utf8", "ignore").decode("utf8") + return results - return {"count_returned": len(results), - "count_found": content['hits']['total'], - "results": results, - "offset": offset, - "deep_page_limit": deep_page_limit} +def wrap_es_execution(search: Search) -> Any: + """ + Executes a Search object, and converts various ES error types into + something we can pretty print to the user. + """ + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e: + # this is a "user" error + print("elasticsearch 400: " + str(e.info), file=sys.stderr) + if e.info.get("error", {}).get("root_cause", {}): + raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) + else: + raise ValueError(str(e.info)) + except elasticsearch.exceptions.TransportError as e: + # all other errors + print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) + raise IOError(str(e.info)) + return resp +def do_container_search( + query: GenericQuery, deep_page_limit: int = 2000 +) -> SearchHits: -def do_release_search(q, limit=30, fulltext_only=True, offset=0): + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) - #print("Search hit: " + q) - if limit > 100: - # Sanity check - limit = 100 + search = search.query( + "query_string", + query=query.q, + default_operator="AND", + analyze_wildcard=True, + allow_leading_wildcard=False, + lenient=True, + fields=["biblio"], + ) - # Convert raw DOIs to DOI queries - if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: - q = 'doi:"{}"'.format(q) + # Sanity checks + limit = min((int(query.limit or 25), 100)) + offset = max((int(query.offset or 0), 0)) + if offset > deep_page_limit: + # Avoid deep paging problem. + offset = deep_page_limit - if fulltext_only: - q += " in_web:true" + search = search[offset : (offset + limit)] + + resp = wrap_es_execution(search) + results = results_to_dict(resp) + + return SearchHits( + count_returned=len(results), + count_found=int(resp.hits.total), + offset=offset, + limit=limit, + deep_page_limit=deep_page_limit, + query_time_ms=int(resp.took), + results=results, + ) + +def do_release_search( + query: ReleaseQuery, deep_page_limit: int = 2000 +) -> SearchHits: + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + + # availability filters + if query.fulltext_only: + search = search.filter("term", in_ia=True) + + # Below, we combine several queries to improve scoring. + + # this query use the fancy built-in query string parser + basic_biblio = Q( + "query_string", + query=query.q, + default_operator="AND", + analyze_wildcard=True, + allow_leading_wildcard=False, + lenient=True, + fields=[ + "title^2", + "biblio", + ], + ) + has_fulltext = Q("term", in_ia=True) + poor_metadata = Q( + "bool", + should=[ + # if these fields aren't set, metadata is poor. The more that do + # not exist, the stronger the signal. + Q("bool", must_not=Q("exists", field="title")), + Q("bool", must_not=Q("exists", field="release_year")), + Q("bool", must_not=Q("exists", field="release_type")), + Q("bool", must_not=Q("exists", field="release_stage")), + ], + ) - search_request = { - "query": { - "query_string": { - "query": q, - "default_operator": "AND", - "analyze_wildcard": True, - "lenient": True, - "fields": ["biblio"], - }, - }, - } + search = search.query( + "boosting", + positive=Q("bool", must=basic_biblio, should=[has_fulltext],), + negative=poor_metadata, + negative_boost=0.5, + ) + + # Sanity checks + limit = min((int(query.limit or 25), 100)) + offset = max((int(query.offset or 0), 0)) + if offset > deep_page_limit: + # Avoid deep paging problem. + offset = deep_page_limit - resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request, offset=offset) - for h in resp['results']: + search = search[offset : (offset + limit)] + + resp = wrap_es_execution(search) + results = results_to_dict(resp) + + for h in results: # Ensure 'contrib_names' is a list, not a single string if type(h['contrib_names']) is not list: h['contrib_names'] = [h['contrib_names'], ] h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] - resp["query"] = { "q": q } - resp["limit"] = limit - return resp + return SearchHits( + count_returned=len(results), + count_found=int(resp.hits.total), + offset=offset, + limit=limit, + deep_page_limit=deep_page_limit, + query_time_ms=int(resp.took), + results=results, + ) -def do_container_search(q, limit=30, offset=0): +def get_elastic_container_random_releases(ident, limit=5): + """ + Returns a list of releases from the container. + """ - # Convert raw ISSN-L to ISSN-L query - if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-': - q = 'issnl:"{}"'.format(q) + assert limit > 0 and limit <= 100 - search_request = { - "query": { - "query_string": { - "query": q, - "default_operator": "AND", - "analyze_wildcard": True, - "lenient": True, - "fields": ["biblio"], - }, - }, - } + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'bool', + must=[ + Q('term', container_id=ident), + Q('range', release_year={ "lte": datetime.datetime.today().year }), + ] + ) + search = search.sort('-in_web', '-release_date') + search = search[:int(limit)] - resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit, offset=offset) - resp["query"] = { "q": q } - resp["limit"] = limit - return resp + search = search.params(request_cache=True) + resp = wrap_es_execution(search) + results = results_to_dict(resp) + + return results def get_elastic_entity_stats(): """ @@ -127,85 +266,73 @@ def get_elastic_entity_stats(): stats = {} - # 2. releases - # - total count - # - total citation records - # - total (paper, chapter, proceeding) - # - " with fulltext on web - # - " open access - # - " not in KBART, in IA - # - # Can do the above with two queries: - # - all releases, aggregate count and sum(ref_count) - # - in-scope works, aggregate count by (fulltext, OA, kbart/ia) - - # 2a. release totals - query = { - "size": 0, - "aggs": { - "release_ref_count": { "sum": { "field": "ref_count" } } - } - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - resp.raise_for_status() - resp = resp.json() + # release totals + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search.aggs.bucket( + 'release_ref_count', + 'sum', + field='ref_count', + ) + search = search[:0] # pylint: disable=unsubscriptable-object + + search = search.params(request_cache=True) + resp = wrap_es_execution(search) + stats['release'] = { - "total": resp['hits']['total'], - "refs_total": int(resp['aggregations']['release_ref_count']['value']), + "total": int(resp.hits.total), + "refs_total": int(resp.aggregations.release_ref_count.value), } - # 2b. paper counts - query = { - "size": 0, - "query": { - "terms": { "release_type": [ - # "chapter", "thesis", - "article-journal", "paper-conference", - ] } }, - "aggs": { "paper_like": { "filters": { "filters": { - "in_web": { "term": { "in_web": "true" } }, - "is_oa": { "term": { "is_oa": "true" } }, - "in_kbart": { "term": { "in_kbart": "true" } }, - "in_web_not_kbart": { "bool": { "filter": [ - { "term": { "in_web": "true" } }, - { "term": { "in_kbart": "false" } } - ]}} - }}}} - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - resp.raise_for_status() - resp = resp.json() - buckets = resp['aggregations']['paper_like']['buckets'] + # paper counts + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'terms', + release_type=[ + "article-journal", + "paper-conference", + # "chapter", + # "thesis", + ], + ) + search.aggs.bucket( + 'paper_like', + 'filters', + filters={ + "in_web": { "term": { "in_web": "true" } }, + "is_oa": { "term": { "is_oa": "true" } }, + "in_kbart": { "term": { "in_kbart": "true" } }, + "in_web_not_kbart": { "bool": { "filter": [ + { "term": { "in_web": "true" } }, + { "term": { "in_kbart": "false" } }, + ]}}, + } + ) + search = search[:0] + + search = search.params(request_cache=True) + resp = wrap_es_execution(search) + buckets = resp.aggregations.paper_like.buckets stats['papers'] = { - 'total': resp['hits']['total'], - 'in_web': buckets['in_web']['doc_count'], - 'is_oa': buckets['is_oa']['doc_count'], - 'in_kbart': buckets['in_kbart']['doc_count'], - 'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'], + 'total': resp.hits.total, + 'in_web': buckets.in_web.doc_count, + 'is_oa': buckets.is_oa.doc_count, + 'in_kbart': buckets.in_kbart.doc_count, + 'in_web_not_kbart': buckets.in_web_not_kbart.doc_count, } - # 3. containers - # => total count - query = { - "size": 0, - } - resp = requests.get( - "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - resp.raise_for_status() - resp = resp.json() + # container counts + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) + search.aggs.bucket( + 'release_ref_count', + 'sum', + field='ref_count', + ) + search = search[:0] # pylint: disable=unsubscriptable-object + + search = search.params(request_cache=True) + resp = wrap_es_execution(search) stats['container'] = { - "total": resp['hits']['total'], + "total": resp.hits.total, } return stats @@ -221,30 +348,36 @@ def get_elastic_container_stats(ident, issnl=None): preserved """ - query = { - "size": 0, - "query": { - "term": { "container_id": ident } + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'term', + container_id=ident, + ) + search.aggs.bucket( + 'container_stats', + 'filters', + filters={ + "in_web": { + "term": { "in_web": True }, + }, + "in_kbart": { + "term": { "in_kbart": True }, + }, + "is_preserved": { + "term": { "is_preserved": True }, + }, }, - "aggs": { "container_stats": { "filters": { "filters": { - "in_web": { "term": { "in_web": "true" } }, - "in_kbart": { "term": { "in_kbart": "true" } }, - "is_preserved": { "term": { "is_preserved": "true" } }, - }}}} - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - #print(resp.json()) - resp.raise_for_status() - resp = resp.json() - buckets = resp['aggregations']['container_stats']['buckets'] + ) + search = search[:0] + + search = search.params(request_cache=True) + resp = wrap_es_execution(search) + + buckets = resp.aggregations.container_stats.buckets stats = { 'ident': ident, 'issnl': issnl, - 'total': resp['hits']['total'], + 'total': resp.hits.total, 'in_web': buckets['in_web']['doc_count'], 'in_kbart': buckets['in_kbart']['doc_count'], 'is_preserved': buckets['is_preserved']['doc_count'], @@ -252,48 +385,6 @@ def get_elastic_container_stats(ident, issnl=None): return stats -def get_elastic_container_random_releases(ident, limit=5): - """ - Returns a list of releases from the container. - """ - - assert limit > 0 and limit <= 100 - - query = { - "size": int(limit), - "sort": [ - { "in_web": {"order": "desc"} }, - { "release_date": {"order": "desc"} }, - ], - "query": { - "bool": { - "must": [ - { "term": { "container_id": ident } }, - { "range": { "release_year": { "lte": datetime.datetime.today().year } } }, - ], - }, - }, - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - #print(resp.json()) - resp.raise_for_status() - resp = resp.json() - #print(resp) - hits = [h['_source'] for h in resp['hits']['hits']] - for h in hits: - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. - # "Crimes against Unicode"; production workaround - for key in h: - if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - - return hits - def get_elastic_container_histogram(ident): """ Fetches a stacked histogram @@ -304,58 +395,46 @@ def get_elastic_container_histogram(ident): (year, in_ia, count) """ - query = { - "aggs": { - "year_in_ia": { - "composite": { - "size": 1000, - "sources": [ - {"year": { - "histogram": { - "field": "release_year", - "interval": 1, - }}}, - {"in_ia": { - "terms": { - "field": "in_ia", - }}}, - ], + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'bool', + must=[ + Q("range", release_year={ + "gte": datetime.datetime.today().year - 499, + "lte": datetime.datetime.today().year, + }), + ], + filter=[ + Q("bool", minimum_should_match=1, should=[ + Q("match", container_id=ident), + ]), + ], + ) + search.aggs.bucket( + 'year_in_ia', + 'composite', + size=1000, + sources=[ + {"year": { + "histogram": { + "field": "release_year", + "interval": 1, }, - }, - }, - "size": 0, - "query": { - "bool": { - "must": [{ - "range": { - "release_year": { - "gte": datetime.datetime.today().year - 499, - "lte": datetime.datetime.today().year, - } - } - }], - "filter": [{ - "bool": { - "should": [{ - "match": { - "container_id": ident - } - }], - "minimum_should_match": 1, - }, - }], - } - } - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - resp.raise_for_status() - # TODO: abort() - resp = resp.json() - #print(resp) + }}, + {"in_ia": { + "terms": { + "field": "in_ia", + }, + }}, + ], + ) + search = search[:0] + + search = search.params(request_cache='true') + resp = wrap_es_execution(search) + + buckets = resp.aggregations.year_in_ia.buckets vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count']) - for h in resp['aggregations']['year_in_ia']['buckets']] + for h in buckets] vals = sorted(vals) return vals diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html index 1a804595..2566f542 100644 --- a/python/fatcat_web/templates/container_search.html +++ b/python/fatcat_web/templates/container_search.html @@ -2,8 +2,8 @@ {% extends "base.html" %} {% block title %} -{% if query %} - Search: {{ query }} +{% if query.q %} + Search: {{ query.q }} {% else %} Release Search {% endif %} @@ -18,9 +18,9 @@ <form class="" role="search" action="/container/search" method="get"> <div class="ui form"> <div class="ui action input huge fluid"> - <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button> + <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button> </div> - <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query or "" }}">releases</a></b>. + <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b>. </div> </form> </div> @@ -32,7 +32,7 @@ {% if found %} {% if found.results %} - {{ entity_macros.top_results(found) }} + {{ entity_macros.top_results(query, found) }} {% for entity in found.results %} <div> @@ -55,13 +55,13 @@ {% if found.results|length > 8 %} <div class="ui divider"></div> <div style="text-align: center"> - {{ entity_macros.bottom_results(found, endpoint='container_search') }} + {{ entity_macros.bottom_results(query, found, endpoint='container_search') }} </div> {% endif %} {% else %} - Raw query was: <i>{{ found.query.q }}</i> + Raw query was: <i>{{ query.q }}</i> <div class="ui centered stackable grid" style="padding-top: 15%;"> <div class="row"> @@ -72,7 +72,7 @@ <h2>No results found!</h2> <p>You could try elsewhere:</p> <ul> - <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li> + <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li> </ul> </div> </div> diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index c22eb106..0e7f135a 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -262,7 +262,7 @@ yellow {% endif %} {%- endmacro %} -{% macro top_results(found) -%} +{% macro top_results(query, found) -%} <i>Showing {% if found.offset == 0 %} @@ -278,13 +278,13 @@ yellow {%- endmacro %} -{% macro bottom_results(found, endpoint='release_search') -%} +{% macro bottom_results(query, found, endpoint='release_search') -%} {% if found.offset > 0 %} {% if found.offset - found.limit < 0 %} - <a href="{{ url_for(endpoint, q=found.query.q, offset=0) }}">« Previous</a> + <a href="{{ url_for(endpoint, q=query.q, offset=0) }}">« Previous</a> {% else %} - <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset - found.limit) }}">« Previous</a> + <a href="{{ url_for(endpoint, q=query.q, offset=found.offset - found.limit) }}">« Previous</a> {% endif %} {% else %} <span style="color:gray">« Previous</span> @@ -294,7 +294,7 @@ yellow found.count_returned }} out of {{ found.count_found }} results</i> {% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %} - <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset + found.limit) }}">Next »</a> + <a href="{{ url_for(endpoint, q=query.q, offset=found.offset + found.limit) }}">Next »</a> {% else %} <span style="color:gray">Next »</span> {% endif %} diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html index a600f1b2..58aa35d6 100644 --- a/python/fatcat_web/templates/release_search.html +++ b/python/fatcat_web/templates/release_search.html @@ -2,8 +2,8 @@ {% extends "base.html" %} {% block title %} -{% if query %} - Search: {{ query }} +{% if query.q %} + Search: {{ query.q }} {% else %} Release Search {% endif %} @@ -18,14 +18,14 @@ <form class="" role="search" action="/release/search" method="get"> <div class="ui form"> <div class="ui action input huge fluid"> - <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search release metadata"> + <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search release metadata"> <button class="ui primary button">Search</button> </div> <div class="ui checkbox" style="float: right; margin: 1em;"> <input type="checkbox" name="fulltext_only" id="fulltext_only" value="true" {% if fulltext_only %}checked{% endif %}> <label for="fulltext_only">Fulltext Available Only</label> </div> - <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query or "" }}">containers</a></b> (eg, journals). + <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals). </div> </form> </div> @@ -37,7 +37,7 @@ {% if found %} {% if found.results %} - {{ entity_macros.top_results(found) }} + {{ entity_macros.top_results(query, found) }} {% for paper in found.results %} {{ entity_macros.release_search_result_row(paper) }} @@ -46,13 +46,13 @@ {% if found.results|length > 8 %} <div class="ui divider"></div> <div style="text-align: center"> - {{ entity_macros.bottom_results(found, endpoint='release_search') }} + {{ entity_macros.bottom_results(query, found, endpoint='release_search') }} </div> {% endif %} {% else %} - Raw query was: <i>{{ found.query.q }}</i> + Raw query was: <i>{{ query.q }}</i> <div class="ui centered stackable grid" style="padding-top: 15%;"> <div class="row"> @@ -63,9 +63,9 @@ <h2>No results found!</h2> <p>You could try elsewhere:</p> <ul> - <li>Search <a href="https://dissem.in/search?q={{ found.query.q | urlencode }}">dissem.in</a></li> - <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ found.query.q | urlencode }}">BASE</a></li> - <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li> + <li>Search <a href="https://dissem.in/search?q={{ query.q | urlencode }}">dissem.in</a></li> + <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ query.q | urlencode }}">BASE</a></li> + <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li> </ul> </div> </div> diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index 44c7be63..3263f243 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -1,9 +1,10 @@ import pytest from dotenv import load_dotenv +import elasticsearch + import fatcat_web import fatcat_openapi_client - from fatcat_openapi_client import * from fatcat_tools import authenticated_api @@ -13,6 +14,7 @@ def full_app(): fatcat_web.app.testing = True fatcat_web.app.debug = False fatcat_web.app.config['WTF_CSRF_ENABLED'] = False + fatcat_web.app.es_client = elasticsearch.Elasticsearch("mockbackend") return fatcat_web.app @pytest.fixture diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 7647bcf5..55e90d56 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -1,31 +1,36 @@ import json -import responses +import pytest +from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram +from fatcat_openapi_client.rest import ApiException from fixtures import * -@responses.activate -def test_release_search(app): + +def test_release_search(app, mocker): with open('tests/files/elastic_release_search.json') as f: elastic_resp=json.loads(f.read()) - responses.add(responses.GET, 'http://localhost:9200/fatcat_release/_search', - json=elastic_resp, status=200) + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/release/search?q=blood') assert rv.status_code == 200 assert b"Showing" in rv.data assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data -@responses.activate -def test_container_search(app): +def test_container_search(app, mocker): with open('tests/files/elastic_container_search.json') as f: elastic_resp=json.loads(f.read()) - responses.add(responses.GET, 'http://localhost:9200/fatcat_container/_search', - json=elastic_resp, status=200) + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/container/search?q=blood') assert rv.status_code == 200 @@ -33,6 +38,20 @@ def test_container_search(app): assert b"European Instructional Course Lectures" in rv.data assert b"British Editorial Society of Bone and Joint Surger" in rv.data +def test_random_releases(app, mocker): + + with open('tests/files/elastic_release_search.json') as f: + elastic_resp=json.loads(f.read()) + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] + + resp = get_elastic_container_random_releases("123") + assert len(resp) >= 1 + + elastic_resp1 = { 'timed_out': False, 'aggregations': { @@ -60,39 +79,34 @@ elastic_resp3 = { 'took': 0 } -@responses.activate -def test_stats(app): - - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp1.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp2.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_container/_search?request_cache=true', - json=elastic_resp3.copy(), status=200) +def test_stats(app, mocker): + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + (200, {}, json.dumps(elastic_resp2)), + (200, {}, json.dumps(elastic_resp3)), + ] + rv = app.get('/stats') assert rv.status_code == 200 - # TODO: robe these responses better - -@responses.activate -def test_stats_json(app): - - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp1.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp2.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_container/_search?request_cache=true', - json=elastic_resp3.copy(), status=200) + assert b"80,578,584" in rv.data + +def test_stats_json(app, mocker): + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + (200, {}, json.dumps(elastic_resp2)), + (200, {}, json.dumps(elastic_resp3)), + ] + rv = app.get('/stats.json') assert rv.status_code == 200 + assert rv.json['papers']['in_kbart'] == 51594200 + assert rv.json['release']['refs_total'] == 8031459 -@responses.activate -def test_container_stats(app): +def test_container_stats(app, mocker): elastic_resp = { 'timed_out': False, @@ -106,14 +120,71 @@ def test_container_stats(app): 'took': 50 } - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp, status=200) + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/container/issnl/1234-5678/stats.json') + #print(rv.json) + assert rv.status_code == 200 + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json') + assert rv.status_code == 200 + +def test_container_coverage(app, mocker): + + elastic_resp1 = { + 'timed_out': False, + 'aggregations': { + 'container_stats': {'buckets': { + 'is_preserved': {'doc_count': 461939}, + 'in_kbart': {'doc_count': 461939}, + 'in_web': {'doc_count': 2797}}}}, + 'hits': {'total': 461939, 'hits': [], 'max_score': 0.0}, + '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, + 'took': 50 + } + + elastic_resp2 = { + 'took': 294, + 'timed_out': False, + '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, + 'hits': {'total': 4327, 'max_score': 0.0, 'hits': []}, + 'aggregations': {'year_in_ia': { + 'after_key': {'year': 2020.0, 'in_ia': True}, + 'buckets': [ + {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4}, + {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68}, + {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26}, + {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428}, + {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14}, + {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487}, + {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13}, + {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345}, + ], + }}, + } + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage') assert rv.status_code == 200 - # TODO: probe this response better -# TODO: container stats -# TODO: container ISSN-L query -# TODO: release DOI query -# TODO: release fulltext (filter) query + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp2)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json') + assert rv.status_code == 200 + + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp2)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg') + assert rv.status_code == 200 |