diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_web/__init__.py | 5 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 44 | ||||
| -rw-r--r-- | python/fatcat_web/search.py | 619 | ||||
| -rw-r--r-- | python/fatcat_web/templates/container_search.html | 16 | ||||
| -rw-r--r-- | python/fatcat_web/templates/entity_macros.html | 10 | ||||
| -rw-r--r-- | python/fatcat_web/templates/release_search.html | 20 | ||||
| -rw-r--r-- | python/tests/fixtures.py | 4 | ||||
| -rw-r--r-- | python/tests/web_search.py | 161 | 
8 files changed, 506 insertions, 373 deletions
| diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py index 562ffeb2..487de58a 100644 --- a/python/fatcat_web/__init__.py +++ b/python/fatcat_web/__init__.py @@ -11,6 +11,7 @@ from authlib.flask.client import OAuth  from loginpass import create_flask_blueprint, Gitlab, GitHub, ORCiD  from raven.contrib.flask import Sentry  import fatcat_openapi_client +import elasticsearch  from fatcat_web.web_config import Config @@ -71,7 +72,9 @@ mwoauth = MWOAuth(  mwoauth.handshaker.user_agent = "fatcat.wiki;python_web_interface"  app.register_blueprint(mwoauth.bp, url_prefix='/auth/wikipedia') -from fatcat_web import routes, editing_routes, auth, cors, forms  # noqa: E402 +app.es_client = elasticsearch.Elasticsearch(Config.ELASTICSEARCH_BACKEND) + +from fatcat_web import routes, editing_routes, auth, cors, forms  # TODO: blocking on ORCID support in loginpass  if Config.ORCID_CLIENT_ID: diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 2489ac03..4a66b3c2 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -14,7 +14,7 @@ from fatcat_tools.normal import *  from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config  from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth  from fatcat_web.cors import crossdomain -from fatcat_web.search import * +from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram  from fatcat_web.entity_helpers import *  from fatcat_web.graphics import *  from fatcat_web.kafka import * @@ -706,44 +706,22 @@ def generic_search():  @app.route('/release/search', methods=['GET', 'POST'])  def release_search(): -    query = request.args.get('q') -    if not query: -        query = '*' -    fulltext_only = bool(request.args.get('fulltext_only')) +    if 'q' not in request.args.keys(): +        return render_template('release_search.html', query=ReleaseQuery(), found=None) -    issnl = request.args.get('container_issnl') -    if issnl and query: -        query += ' container_issnl:"{}"'.format(issnl) - -    container_id = request.args.get('container_id') -    if container_id and query: -        query += ' container_id:"{}"'.format(container_id) - -    offset = request.args.get('offset', '0') -    offset = max(0, int(offset)) if offset.isnumeric() else 0 - -    if 'q' in request.args.keys(): -        # always do files for HTML -        found = do_release_search(query, fulltext_only=fulltext_only, offset=offset) -        return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only) -    else: -        return render_template('release_search.html', query=query, fulltext_only=fulltext_only) +    query = ReleaseQuery.from_args(request.args) +    found = do_release_search(query) +    return render_template('release_search.html', query=query, found=found)  @app.route('/container/search', methods=['GET', 'POST'])  def container_search(): -    query = request.args.get('q') -    if not query: -        query = '*' -    offset = request.args.get('offset', '0') -    offset = max(0, int(offset)) if offset.isnumeric() else 0 +    if 'q' not in request.args.keys(): +        return render_template('container_search.html', query=GenericQuery(), found=None) -    if 'q' in request.args.keys(): -        # always do files for HTML -        found = do_container_search(query, offset=offset) -        return render_template('container_search.html', found=found, query=query) -    else: -        return render_template('container_search.html', query=query) +    query = GenericQuery.from_args(request.args) +    found = do_container_search(query) +    return render_template('container_search.html', query=query, found=found)  def get_changelog_stats():      stats = {} diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 4a87c735..55caa9c5 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -2,118 +2,257 @@  """  Helpers for doing elasticsearch queries (used in the web interface; not part of  the formal API) - -TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded  """ +import sys  import datetime -import requests -from flask import abort, flash +from dataclasses import dataclass +from typing import List, Optional, Any + +import elasticsearch +from elasticsearch_dsl import Search, Q +import elasticsearch_dsl.response +  from fatcat_web import app -def do_search(index, request, limit=30, offset=0, deep_page_limit=2000): +@dataclass +class ReleaseQuery: +    q: Optional[str] = None +    limit: Optional[int] = None +    offset: Optional[int] = None +    fulltext_only: bool = False +    container_id: Optional[str] = None + +    @classmethod +    def from_args(cls, args) -> 'ReleaseQuery': + +        query_str = args.get('q') or '*' + +        container_id = args.get('container_id') +        # TODO: as filter, not in query string +        if container_id: +            query_str += ' container_id:"{}"'.format(container_id) + +        # TODO: where are container_issnl queries actually used? +        issnl = args.get('container_issnl') +        if issnl and query_str: +            query_str += ' container_issnl:"{}"'.format(issnl) + +        offset = args.get('offset', '0') +        offset = max(0, int(offset)) if offset.isnumeric() else 0 + +        return ReleaseQuery( +            q=query_str, +            offset=offset, +            fulltext_only=bool(args.get('fulltext_only')), +            container_id=container_id, +        ) + +@dataclass +class GenericQuery: +    q: Optional[str] = None +    limit: Optional[int] = None +    offset: Optional[int] = None + +    @classmethod +    def from_args(cls, args) -> 'GenericQuery': +        query_str = args.get('q') +        if not query_str: +            query_str = '*' +        offset = args.get('offset', '0') +        offset = max(0, int(offset)) if offset.isnumeric() else 0 + +        return GenericQuery( +            q=query_str, +            offset=offset, +        ) + +@dataclass +class SearchHits: +    count_returned: int +    count_found: int +    offset: int +    limit: int +    deep_page_limit: int +    query_time_ms: int +    results: List[Any] + + +def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]: +    """ +    Takes a response returns all the hits as JSON objects. -    # Sanity checks -    if limit > 100: -        limit = 100 -    if offset < 0: -        offset = 0 -    if offset > deep_page_limit: -        # Avoid deep paging problem. -        offset = deep_page_limit +    Also handles surrogate strings that elasticsearch returns sometimes, +    probably due to mangled data processing in some pipeline. "Crimes against +    Unicode"; production workaround +    """ + +    results = [] +    for h in response: +        r = h._d_ +        # print(h.meta._d_) +        results.append(r) -    request["size"] = int(limit) -    request["from"] = int(offset) -    # print(request) -    resp = requests.get("%s/%s/_search" % -            (app.config['ELASTICSEARCH_BACKEND'], index), -        json=request) - -    if resp.status_code == 400: -        print("elasticsearch 400: " + str(resp.content)) -        flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content)) -        abort(resp.status_code) -    elif resp.status_code != 200: -        print("elasticsearch non-200 status code: " + str(resp.status_code)) -        print(resp.content) -        abort(resp.status_code) - -    content = resp.json() -    results = [h['_source'] for h in content['hits']['hits']]      for h in results: -        # Handle surrogate strings that elasticsearch returns sometimes, -        # probably due to mangled data processing in some pipeline. -        # "Crimes against Unicode"; production workaround          for key in h:              if type(h[key]) is str: -                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') +                h[key] = h[key].encode("utf8", "ignore").decode("utf8") +    return results -    return {"count_returned": len(results), -            "count_found": content['hits']['total'], -            "results": results, -            "offset": offset, -            "deep_page_limit": deep_page_limit} +def wrap_es_execution(search: Search) -> Any: +    """ +    Executes a Search object, and converts various ES error types into +    something we can pretty print to the user. +    """ +    try: +        resp = search.execute() +    except elasticsearch.exceptions.RequestError as e: +        # this is a "user" error +        print("elasticsearch 400: " + str(e.info), file=sys.stderr) +        if e.info.get("error", {}).get("root_cause", {}): +            raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) +        else: +            raise ValueError(str(e.info)) +    except elasticsearch.exceptions.TransportError as e: +        # all other errors +        print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) +        raise IOError(str(e.info)) +    return resp +def do_container_search( +    query: GenericQuery, deep_page_limit: int = 2000 +) -> SearchHits: -def do_release_search(q, limit=30, fulltext_only=True, offset=0): +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) -    #print("Search hit: " + q) -    if limit > 100: -        # Sanity check -        limit = 100 +    search = search.query( +        "query_string", +        query=query.q, +        default_operator="AND", +        analyze_wildcard=True, +        allow_leading_wildcard=False, +        lenient=True, +        fields=["biblio"], +    ) -    # Convert raw DOIs to DOI queries -    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: -        q = 'doi:"{}"'.format(q) +    # Sanity checks +    limit = min((int(query.limit or 25), 100)) +    offset = max((int(query.offset or 0), 0)) +    if offset > deep_page_limit: +        # Avoid deep paging problem. +        offset = deep_page_limit -    if fulltext_only: -        q += " in_web:true" +    search = search[offset : (offset + limit)] + +    resp = wrap_es_execution(search) +    results = results_to_dict(resp) + +    return SearchHits( +        count_returned=len(results), +        count_found=int(resp.hits.total), +        offset=offset, +        limit=limit, +        deep_page_limit=deep_page_limit, +        query_time_ms=int(resp.took), +        results=results, +    ) + +def do_release_search( +    query: ReleaseQuery, deep_page_limit: int = 2000 +) -> SearchHits: + +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + +    # availability filters +    if query.fulltext_only: +        search = search.filter("term", in_ia=True) + +    # Below, we combine several queries to improve scoring. + +    # this query use the fancy built-in query string parser +    basic_biblio = Q( +        "query_string", +        query=query.q, +        default_operator="AND", +        analyze_wildcard=True, +        allow_leading_wildcard=False, +        lenient=True, +        fields=[ +            "title^2", +            "biblio", +        ], +    ) +    has_fulltext = Q("term", in_ia=True) +    poor_metadata = Q( +        "bool", +        should=[ +            # if these fields aren't set, metadata is poor. The more that do +            # not exist, the stronger the signal. +            Q("bool", must_not=Q("exists", field="title")), +            Q("bool", must_not=Q("exists", field="release_year")), +            Q("bool", must_not=Q("exists", field="release_type")), +            Q("bool", must_not=Q("exists", field="release_stage")), +        ], +    ) -    search_request = { -        "query": { -            "query_string": { -                "query": q, -                "default_operator": "AND", -                "analyze_wildcard": True, -                "lenient": True, -                "fields": ["biblio"], -            }, -        }, -    } +    search = search.query( +        "boosting", +        positive=Q("bool", must=basic_biblio, should=[has_fulltext],), +        negative=poor_metadata, +        negative_boost=0.5, +    ) + +    # Sanity checks +    limit = min((int(query.limit or 25), 100)) +    offset = max((int(query.offset or 0), 0)) +    if offset > deep_page_limit: +        # Avoid deep paging problem. +        offset = deep_page_limit -    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request, offset=offset) -    for h in resp['results']: +    search = search[offset : (offset + limit)] + +    resp = wrap_es_execution(search) +    results = results_to_dict(resp) + +    for h in results:          # Ensure 'contrib_names' is a list, not a single string          if type(h['contrib_names']) is not list:              h['contrib_names'] = [h['contrib_names'], ]          h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] -    resp["query"] = { "q": q } -    resp["limit"] = limit -    return resp +    return SearchHits( +        count_returned=len(results), +        count_found=int(resp.hits.total), +        offset=offset, +        limit=limit, +        deep_page_limit=deep_page_limit, +        query_time_ms=int(resp.took), +        results=results, +    ) -def do_container_search(q, limit=30, offset=0): +def get_elastic_container_random_releases(ident, limit=5): +    """ +    Returns a list of releases from the container. +    """ -    # Convert raw ISSN-L to ISSN-L query -    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-': -        q = 'issnl:"{}"'.format(q) +    assert limit > 0 and limit <= 100 -    search_request = { -        "query": { -            "query_string": { -                "query": q, -                "default_operator": "AND", -                "analyze_wildcard": True, -                "lenient": True, -                "fields": ["biblio"], -            }, -        }, -    } +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'bool', +        must=[ +            Q('term', container_id=ident), +            Q('range', release_year={ "lte": datetime.datetime.today().year }), +        ] +    ) +    search = search.sort('-in_web', '-release_date') +    search = search[:int(limit)] -    resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit, offset=offset) -    resp["query"] = { "q": q } -    resp["limit"] = limit -    return resp +    search = search.params(request_cache=True) +    resp = wrap_es_execution(search) +    results = results_to_dict(resp) + +    return results  def get_elastic_entity_stats():      """ @@ -127,85 +266,73 @@ def get_elastic_entity_stats():      stats = {} -    # 2. releases -    #  - total count -    #  - total citation records -    #  - total (paper, chapter, proceeding) -    #  - " with fulltext on web -    #  - " open access -    #  - " not in KBART, in IA -    # -    # Can do the above with two queries: -    #  - all releases, aggregate count and sum(ref_count) -    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia) - -    # 2a. release totals -    query = { -        "size": 0, -        "aggs": { -            "release_ref_count": { "sum": { "field": "ref_count" } } -        } -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    resp.raise_for_status() -    resp = resp.json() +    # release totals +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search.aggs.bucket( +        'release_ref_count', +        'sum', +        field='ref_count', +    ) +    search = search[:0]  # pylint: disable=unsubscriptable-object + +    search = search.params(request_cache=True) +    resp = wrap_es_execution(search) +      stats['release'] = { -        "total": resp['hits']['total'], -        "refs_total": int(resp['aggregations']['release_ref_count']['value']), +        "total": int(resp.hits.total), +        "refs_total": int(resp.aggregations.release_ref_count.value),      } -    # 2b. paper counts -    query = { -        "size": 0, -        "query": { -            "terms": { "release_type": [ -                # "chapter", "thesis", -                "article-journal", "paper-conference", -            ] } }, -        "aggs": { "paper_like": { "filters": { "filters": { -                "in_web": { "term": { "in_web": "true" } }, -                "is_oa": { "term": { "is_oa": "true" } }, -                "in_kbart": { "term": { "in_kbart": "true" } }, -                "in_web_not_kbart": { "bool": { "filter": [ -                        { "term": { "in_web": "true" } }, -                        { "term": { "in_kbart": "false" } } -                ]}} -        }}}} -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    resp.raise_for_status() -    resp = resp.json() -    buckets = resp['aggregations']['paper_like']['buckets'] +    # paper counts +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'terms', +        release_type=[ +            "article-journal", +            "paper-conference", +            # "chapter", +            # "thesis", +        ], +    ) +    search.aggs.bucket( +        'paper_like', +        'filters', +        filters={ +            "in_web": { "term": { "in_web": "true" } }, +            "is_oa": { "term": { "is_oa": "true" } }, +            "in_kbart": { "term": { "in_kbart": "true" } }, +            "in_web_not_kbart": { "bool": { "filter": [ +                { "term": { "in_web": "true" } }, +                { "term": { "in_kbart": "false" } }, +            ]}}, +        } +    ) +    search = search[:0] + +    search = search.params(request_cache=True) +    resp = wrap_es_execution(search) +    buckets = resp.aggregations.paper_like.buckets      stats['papers'] = { -        'total': resp['hits']['total'], -        'in_web': buckets['in_web']['doc_count'], -        'is_oa': buckets['is_oa']['doc_count'], -        'in_kbart': buckets['in_kbart']['doc_count'], -        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'], +        'total': resp.hits.total, +        'in_web': buckets.in_web.doc_count, +        'is_oa': buckets.is_oa.doc_count, +        'in_kbart': buckets.in_kbart.doc_count, +        'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,      } -    # 3. containers -    #   => total count -    query = { -        "size": 0, -    } -    resp = requests.get( -        "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    resp.raise_for_status() -    resp = resp.json() +    # container counts +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) +    search.aggs.bucket( +        'release_ref_count', +        'sum', +        field='ref_count', +    ) +    search = search[:0]  # pylint: disable=unsubscriptable-object + +    search = search.params(request_cache=True) +    resp = wrap_es_execution(search)      stats['container'] = { -        "total": resp['hits']['total'], +        "total": resp.hits.total,      }      return stats @@ -221,30 +348,36 @@ def get_elastic_container_stats(ident, issnl=None):          preserved      """ -    query = { -        "size": 0, -        "query": { -            "term": { "container_id": ident } +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'term', +        container_id=ident, +    ) +    search.aggs.bucket( +        'container_stats', +        'filters', +        filters={ +            "in_web": { +                "term": { "in_web": True }, +            }, +            "in_kbart": { +                "term": { "in_kbart": True }, +            }, +            "is_preserved": { +                "term": { "is_preserved": True }, +            },          }, -        "aggs": { "container_stats": { "filters": { "filters": { -                "in_web": { "term": { "in_web": "true" } }, -                "in_kbart": { "term": { "in_kbart": "true" } }, -                "is_preserved": { "term": { "is_preserved": "true" } }, -        }}}} -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    #print(resp.json()) -    resp.raise_for_status() -    resp = resp.json() -    buckets = resp['aggregations']['container_stats']['buckets'] +    ) +    search = search[:0] + +    search = search.params(request_cache=True) +    resp = wrap_es_execution(search) + +    buckets = resp.aggregations.container_stats.buckets      stats = {          'ident': ident,          'issnl': issnl, -        'total': resp['hits']['total'], +        'total': resp.hits.total,          'in_web': buckets['in_web']['doc_count'],          'in_kbart': buckets['in_kbart']['doc_count'],          'is_preserved': buckets['is_preserved']['doc_count'], @@ -252,48 +385,6 @@ def get_elastic_container_stats(ident, issnl=None):      return stats -def get_elastic_container_random_releases(ident, limit=5): -    """ -    Returns a list of releases from the container. -    """ - -    assert limit > 0 and limit <= 100 - -    query = { -        "size": int(limit), -        "sort": [ -            { "in_web": {"order": "desc"} }, -            { "release_date": {"order": "desc"} }, -        ], -        "query": { -            "bool": { -                "must": [ -                    { "term": { "container_id": ident } }, -                    { "range": { "release_year": { "lte": datetime.datetime.today().year } } }, -                ], -            }, -        }, -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    #print(resp.json()) -    resp.raise_for_status() -    resp = resp.json() -    #print(resp) -    hits = [h['_source'] for h in resp['hits']['hits']] -    for h in hits: -        # Handle surrogate strings that elasticsearch returns sometimes, -        # probably due to mangled data processing in some pipeline. -        # "Crimes against Unicode"; production workaround -        for key in h: -            if type(h[key]) is str: -                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - -    return hits -  def get_elastic_container_histogram(ident):      """      Fetches a stacked histogram @@ -304,58 +395,46 @@ def get_elastic_container_histogram(ident):          (year, in_ia, count)      """ -    query = { -        "aggs": { -            "year_in_ia": { -                "composite": { -                    "size": 1000, -                    "sources": [ -                        {"year": { -                            "histogram": { -                                "field": "release_year", -                                "interval": 1, -                        }}}, -                        {"in_ia": { -                            "terms": { -                                "field": "in_ia", -                        }}}, -                    ], +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'bool', +        must=[ +            Q("range", release_year={ +                "gte": datetime.datetime.today().year - 499, +                "lte": datetime.datetime.today().year, +            }), +        ], +        filter=[ +            Q("bool", minimum_should_match=1, should=[ +                Q("match", container_id=ident), +            ]), +        ], +    ) +    search.aggs.bucket( +        'year_in_ia', +        'composite', +        size=1000, +        sources=[ +            {"year": { +                "histogram": { +                    "field": "release_year", +                    "interval": 1,                  }, -            }, -        }, -        "size": 0, -        "query": { -            "bool": { -                "must": [{ -                    "range": { -                        "release_year": { -                            "gte": datetime.datetime.today().year - 499, -                            "lte": datetime.datetime.today().year, -                        } -                    } -                }], -                "filter": [{ -                    "bool": { -                        "should": [{ -                            "match": { -                                "container_id": ident -                            } -                        }], -                        "minimum_should_match": 1, -                    }, -                }], -            } -        } -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    resp.raise_for_status() -    # TODO: abort() -    resp = resp.json() -    #print(resp) +            }}, +            {"in_ia": { +                "terms": { +                    "field": "in_ia", +                }, +            }}, +        ], +    ) +    search = search[:0] + +    search = search.params(request_cache='true') +    resp = wrap_es_execution(search) + +    buckets = resp.aggregations.year_in_ia.buckets      vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count']) -            for h in resp['aggregations']['year_in_ia']['buckets']] +            for h in buckets]      vals = sorted(vals)      return vals diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html index 1a804595..2566f542 100644 --- a/python/fatcat_web/templates/container_search.html +++ b/python/fatcat_web/templates/container_search.html @@ -2,8 +2,8 @@  {% extends "base.html" %}  {% block title %} -{% if query %} -  Search: {{ query }} +{% if query.q %} +  Search: {{ query.q }}  {% else %}    Release Search  {% endif %} @@ -18,9 +18,9 @@      <form class="" role="search" action="/container/search" method="get">        <div class="ui form">          <div class="ui action input huge fluid"> -          <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button> +          <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button>          </div> -        <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query or "" }}">releases</a></b>. +        <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b>.        </div>      </form>    </div> @@ -32,7 +32,7 @@  {% if found %}    {% if found.results %} -    {{ entity_macros.top_results(found) }} +    {{ entity_macros.top_results(query, found) }}      {% for entity in found.results %}        <div> @@ -55,13 +55,13 @@      {% if found.results|length > 8 %}        <div class="ui divider"></div>        <div style="text-align: center"> -      {{ entity_macros.bottom_results(found, endpoint='container_search') }} +      {{ entity_macros.bottom_results(query, found, endpoint='container_search') }}        </div>      {% endif %}    {% else %} -    Raw query was: <i>{{ found.query.q }}</i> +    Raw query was: <i>{{ query.q }}</i>      <div class="ui centered stackable grid" style="padding-top: 15%;">        <div class="row"> @@ -72,7 +72,7 @@            <h2>No results found!</h2>            <p>You could try elsewhere:</p>            <ul> -            <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li> +            <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>            </ul>          </div>        </div> diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index c22eb106..0e7f135a 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -262,7 +262,7 @@ yellow  {% endif %}  {%- endmacro %} -{% macro top_results(found) -%} +{% macro top_results(query, found) -%}  <i>Showing    {% if found.offset == 0 %} @@ -278,13 +278,13 @@ yellow  {%- endmacro %} -{% macro bottom_results(found, endpoint='release_search') -%} +{% macro bottom_results(query, found, endpoint='release_search') -%}  {% if found.offset > 0 %}    {% if found.offset - found.limit < 0 %} -    <a href="{{ url_for(endpoint, q=found.query.q, offset=0) }}">« Previous</a> +    <a href="{{ url_for(endpoint, q=query.q, offset=0) }}">« Previous</a>    {% else %} -    <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset - found.limit) }}">« Previous</a> +    <a href="{{ url_for(endpoint, q=query.q, offset=found.offset - found.limit) }}">« Previous</a>    {% endif %}  {% else %}    <span style="color:gray">« Previous</span> @@ -294,7 +294,7 @@ yellow  found.count_returned }} out of {{ found.count_found }} results</i>    {% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %} -  <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset + found.limit) }}">Next »</a> +  <a href="{{ url_for(endpoint, q=query.q, offset=found.offset + found.limit) }}">Next »</a>    {% else %}    <span style="color:gray">Next »</span>  {% endif %} diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html index a600f1b2..58aa35d6 100644 --- a/python/fatcat_web/templates/release_search.html +++ b/python/fatcat_web/templates/release_search.html @@ -2,8 +2,8 @@  {% extends "base.html" %}  {% block title %} -{% if query %} -  Search: {{ query }} +{% if query.q %} +  Search: {{ query.q }}  {% else %}    Release Search  {% endif %} @@ -18,14 +18,14 @@      <form class="" role="search" action="/release/search" method="get">        <div class="ui form">          <div class="ui action input huge fluid"> -          <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search release metadata"> +          <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search release metadata">            <button class="ui primary button">Search</button>          </div>          <div class="ui checkbox" style="float: right; margin: 1em;">            <input type="checkbox" name="fulltext_only" id="fulltext_only" value="true" {% if fulltext_only %}checked{% endif %}>            <label for="fulltext_only">Fulltext Available Only</label>          </div> -        <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query or "" }}">containers</a></b> (eg, journals). +        <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals).        </div>      </form>    </div> @@ -37,7 +37,7 @@  {% if found %}    {% if found.results %} -    {{ entity_macros.top_results(found) }} +    {{ entity_macros.top_results(query, found) }}      {% for paper in found.results %}        {{ entity_macros.release_search_result_row(paper) }} @@ -46,13 +46,13 @@      {% if found.results|length > 8 %}        <div class="ui divider"></div>        <div style="text-align: center"> -      {{ entity_macros.bottom_results(found, endpoint='release_search') }} +      {{ entity_macros.bottom_results(query, found, endpoint='release_search') }}        </div>      {% endif %}    {% else %} -    Raw query was: <i>{{ found.query.q }}</i> +    Raw query was: <i>{{ query.q }}</i>      <div class="ui centered stackable grid" style="padding-top: 15%;">        <div class="row"> @@ -63,9 +63,9 @@            <h2>No results found!</h2>            <p>You could try elsewhere:</p>            <ul> -            <li>Search <a href="https://dissem.in/search?q={{ found.query.q | urlencode }}">dissem.in</a></li> -            <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ found.query.q | urlencode }}">BASE</a></li> -            <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li> +            <li>Search <a href="https://dissem.in/search?q={{ query.q | urlencode }}">dissem.in</a></li> +            <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ query.q | urlencode }}">BASE</a></li> +            <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>            </ul>          </div>        </div> diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index 44c7be63..3263f243 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -1,9 +1,10 @@  import pytest  from dotenv import load_dotenv +import elasticsearch +  import fatcat_web  import fatcat_openapi_client -  from fatcat_openapi_client import *  from fatcat_tools import authenticated_api @@ -13,6 +14,7 @@ def full_app():      fatcat_web.app.testing = True      fatcat_web.app.debug = False      fatcat_web.app.config['WTF_CSRF_ENABLED'] = False +    fatcat_web.app.es_client = elasticsearch.Elasticsearch("mockbackend")      return fatcat_web.app  @pytest.fixture diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 7647bcf5..55e90d56 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -1,31 +1,36 @@  import json -import responses +import pytest +from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram +from fatcat_openapi_client.rest import ApiException  from fixtures import * -@responses.activate -def test_release_search(app): + +def test_release_search(app, mocker):      with open('tests/files/elastic_release_search.json') as f:          elastic_resp=json.loads(f.read()) -    responses.add(responses.GET, 'http://localhost:9200/fatcat_release/_search', -        json=elastic_resp, status=200) +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp)), +    ]      rv = app.get('/release/search?q=blood')      assert rv.status_code == 200      assert b"Showing" in rv.data      assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data -@responses.activate -def test_container_search(app): +def test_container_search(app, mocker):      with open('tests/files/elastic_container_search.json') as f:          elastic_resp=json.loads(f.read()) -    responses.add(responses.GET, 'http://localhost:9200/fatcat_container/_search', -        json=elastic_resp, status=200) +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp)), +    ]      rv = app.get('/container/search?q=blood')      assert rv.status_code == 200 @@ -33,6 +38,20 @@ def test_container_search(app):      assert b"European Instructional Course Lectures" in rv.data      assert b"British Editorial Society of Bone and Joint Surger" in rv.data +def test_random_releases(app, mocker): + +    with open('tests/files/elastic_release_search.json') as f: +        elastic_resp=json.loads(f.read()) + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp)), +    ] + +    resp = get_elastic_container_random_releases("123") +    assert len(resp) >= 1 + +  elastic_resp1 = {      'timed_out': False,      'aggregations': { @@ -60,39 +79,34 @@ elastic_resp3 = {      'took': 0  } -@responses.activate -def test_stats(app): - -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_release/_search?request_cache=true', -        json=elastic_resp1.copy(), status=200) -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_release/_search?request_cache=true', -        json=elastic_resp2.copy(), status=200) -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_container/_search?request_cache=true', -        json=elastic_resp3.copy(), status=200) +def test_stats(app, mocker): + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp1)), +        (200, {}, json.dumps(elastic_resp2)), +        (200, {}, json.dumps(elastic_resp3)), +    ] +      rv = app.get('/stats')      assert rv.status_code == 200 -    # TODO: robe these responses better - -@responses.activate -def test_stats_json(app): - -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_release/_search?request_cache=true', -        json=elastic_resp1.copy(), status=200) -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_release/_search?request_cache=true', -        json=elastic_resp2.copy(), status=200) -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_container/_search?request_cache=true', -        json=elastic_resp3.copy(), status=200) +    assert b"80,578,584" in rv.data + +def test_stats_json(app, mocker): + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp1)), +        (200, {}, json.dumps(elastic_resp2)), +        (200, {}, json.dumps(elastic_resp3)), +    ] +      rv = app.get('/stats.json')      assert rv.status_code == 200 +    assert rv.json['papers']['in_kbart'] == 51594200 +    assert rv.json['release']['refs_total'] == 8031459 -@responses.activate -def test_container_stats(app): +def test_container_stats(app, mocker):      elastic_resp = {          'timed_out': False, @@ -106,14 +120,71 @@ def test_container_stats(app):          'took': 50      } -    responses.add(responses.GET, -        'http://localhost:9200/fatcat_release/_search?request_cache=true', -        json=elastic_resp, status=200) +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp)), +        (200, {}, json.dumps(elastic_resp)), +    ]      rv = app.get('/container/issnl/1234-5678/stats.json') +    #print(rv.json) +    assert rv.status_code == 200 + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json') +    assert rv.status_code == 200 + +def test_container_coverage(app, mocker): + +    elastic_resp1 = { +        'timed_out': False, +        'aggregations': { +            'container_stats': {'buckets': { +              'is_preserved': {'doc_count': 461939}, +              'in_kbart': {'doc_count': 461939}, +              'in_web': {'doc_count': 2797}}}}, +        'hits': {'total': 461939, 'hits': [], 'max_score': 0.0}, +        '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, +        'took': 50 +    } + +    elastic_resp2 = { +        'took': 294, +        'timed_out': False, +        '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, +        'hits': {'total': 4327, 'max_score': 0.0, 'hits': []}, +        'aggregations': {'year_in_ia': { +            'after_key': {'year': 2020.0, 'in_ia': True}, +            'buckets': [ +                {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4}, +                {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68}, +                {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26}, +                {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428}, +                {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14}, +                {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487}, +                {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13}, +                {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345}, +            ], +        }}, +    } + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp1)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage')      assert rv.status_code == 200 -    # TODO: probe this response better -# TODO: container stats -# TODO: container ISSN-L query -# TODO: release DOI query -# TODO: release fulltext (filter) query +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp2)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json') +    assert rv.status_code == 200 + +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp2)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg') +    assert rv.status_code == 200 | 
