diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 14:23:31 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 14:23:41 -0700 | 
| commit | 01ae25c1bf24c8d9f7721f49122a15bf522bdbb4 (patch) | |
| tree | 36add4d6c2c7050f646a14c2a0ffc5f03436ad03 | |
| parent | 6c4f539463074bcb563675b6e4f19464339e5641 (diff) | |
| download | fatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.tar.gz fatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.zip | |
search query improvements
- wire up most of the filters and sort order
- query sticks around in search box
- crude error message (needs work)
| -rw-r--r-- | fatcat_scholar/search.py | 228 | ||||
| -rw-r--r-- | fatcat_scholar/templates/base.html | 13 | ||||
| -rw-r--r-- | fatcat_scholar/templates/search.html | 70 | ||||
| -rw-r--r-- | fatcat_scholar/templates/search_macros.html | 24 | ||||
| -rw-r--r-- | fatcat_scholar/web.py | 46 | 
5 files changed, 236 insertions, 145 deletions
| diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index f94c403..f8dd7fb 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -3,112 +3,123 @@  Helpers to make elasticsearch queries.  """ +import sys  import json  import datetime -  import elasticsearch -from elasticsearch_dsl import Search, Q +from pydantic import BaseModel  from dynaconf import settings - - -def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000): - -    # Sanity checks -    if limit > 100: -        limit = 100 -    if offset < 0: -        offset = 0 -    if offset > deep_page_limit: -        # Avoid deep paging problem. -        offset = deep_page_limit - -    search = search[int(offset):int(offset)+int(limit)] - -    try: -        resp = search.execute() -    except elasticsearch.exceptions.RequestError as e: -        # this is a "user" error -        print("elasticsearch 400: " + str(e.info)) -        #flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason'])) -        # XXX: abort(e.status_code) -        raise e -    except elasticsearch.exceptions.TransportError as e: -        # all other errors -        print("elasticsearch non-200 status code: {}".format(e.info)) -        # XXX: abort(e.status_code) -        raise e - -    # convert from objects to python dicts -    results = [] -    for h in resp: -        r = h._d_ -        #print(json.dumps(h.meta._d_, indent=2)) -        r['_highlights'] = [] -        if 'highlight' in dir(h.meta): -            highlights = h.meta.highlight._d_ -            for k in highlights: -                r['_highlights'] += highlights[k] -        results.append(r) - -    for h in results: -        # Handle surrogate strings that elasticsearch returns sometimes, -        # probably due to mangled data processing in some pipeline. -        # "Crimes against Unicode"; production workaround -        for key in h: -            if type(h[key]) is str: -                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - -    return { -        "count_returned": len(results), -        "count_found": int(resp.hits.total), -        "results": results, -        "offset": offset, -        "limit": limit, -        "deep_page_limit": deep_page_limit, -        "query_time_ms": int(resp.took), +from dataclasses import dataclass +from elasticsearch_dsl import Search, Q +from typing import List, Dict, Tuple, Optional, Any, Sequence + + +class FulltextQuery(BaseModel): +    q: Optional[str] = None +    limit: Optional[int] = None +    offset: Optional[int] = None +    filter_time: Optional[str] = None +    filter_type: Optional[str] = None +    filter_availability: Optional[str] = None +    sort_order: Optional[str] = None +    time_options: Any = { +        "label": "Release Date", +        "slug": "filter_time", +        "default": "all_time", +        "list": [ +            {"label": "All Time", "slug": "all_time"}, +            {"label": "Past Week", "slug": "past_week"}, +            {"label": "Past Year", "slug": "past_year"}, +            {"label": "Since 2000", "slug": "since_2000"}, +            {"label": "Before 1925", "slug": "before_1925"}, +        ], +    } +    type_options: Any = { +        "label": "Resource Type", +        "slug": "filter_type", +        "default": "papers", +        "list": [ +            {"label": "Papers", "slug": "papers"}, +            {"label": "Reports", "slug": "reports"}, +            {"label": "Datasets", "slug": "datasets"}, +            {"label": "Everything", "slug": "everything"}, +        ], +    } +    availability_options: Any = { +        "label": "Availability", +        "slug": "filter_availability", +        "default": "everything", +        "list": [ +            {"label": "Everything", "slug": "everything"}, +            {"label": "Fulltext", "slug": "fulltext"}, +            {"label": "Open Access", "slug": "oa"}, +        ], +    } +    sort_options: Any = { +        "label": "Sort Order", +        "slug": "sort_order", +        "default": "relevancy", +        "list": [ +            {"label": "All Time", "slug": "relevancy"}, +            {"label": "Recent First", "slug": "time_desc"}, +            {"label": "Oldest First", "slug": "time_asc"}, +        ],      } -def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None): +class FulltextHits(BaseModel): +    count_returned: int +    count_found: int +    offset: int +    limit: int +    deep_page_limit: int +    query_time_ms: int  +    results: List[Any] -    # Convert raw DOIs to DOI queries -    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: -        q = 'doi:"{}"'.format(q) + +def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:      es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)      search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) +    # Convert raw DOIs to DOI queries +    if len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1: +        search = search.filter("terms", doi=query.q) +        query.q = "*" +      # type filters -    if filter_type == "papers": +    if query.filter_type == "papers":          search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ]) -    elif filter_type == "reports": +    elif query.filter_type == "reports":          search = search.filter("terms", type=[ "report", "standard", ]) -    elif filter_type == "datasets": +    elif query.filter_type == "datasets":          search = search.filter("terms", type=[ "dataset", "software", ]) -    elif filter_type == "everything" or filter_type == None: +    elif query.filter_type == "everything" or query.filter_type == None:          pass      else: -        # XXX: abort(400) -        raise Exception() +        raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'")      # time filters -    if filter_time == "past_week": +    if query.filter_time == "past_week":          week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))          search = search.filter("range", date=dict(gte=week_ago_date)) -    elif filter_time == "this_year": -        search = search.filter("term", year=datetime.date.today().year) -    elif filter_time == "since_2000": +    elif query.filter_time == "past_year": +        # (date in the past year) or (year is this year) +        # the later to catch papers which don't have release_date defined +        year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365)) +        this_year = datetime.date.today().year +        search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year)) +    elif query.filter_time == "since_2000":          search = search.filter("range", year=dict(gte=2000)) -    elif filter_time == "before_1925": +    elif query.filter_time == "before_1925":          search = search.filter("range", year=dict(lt=1925)) -    elif filter_time == "all_time" or filter_time == None: +    elif query.filter_time == "all_time" or query.filter_time == None:          pass      else: -        # XXX: abort(400) -        raise Exception() +        raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")      search = search.query(          'query_string', -        query=q, +        query=query.q,          default_operator="AND",          analyze_wildcard=True,          lenient=True, @@ -127,7 +138,62 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None          fragment_size=250,      ) -    resp = generic_search_execute(search, offset=offset) +    # sort order +    if query.sort_order == "time_asc": +        search = search.sort("year", "date") +    elif query.sort_order == "time_desc": +        search = search.sort("-year", "-date") +    elif query.sort_order == "relevancy" or query.sort_order == None: +        pass +    else: +        raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'") + +    # Sanity checks +    limit = min((int(query.limit or 25), 100)) +    offset = max((int(query.offset or 0), 0)) +    if offset > deep_page_limit: +        # Avoid deep paging problem. +        offset = deep_page_limit + +    search = search[offset:offset+limit] + +    try: +        resp = search.execute() +    except elasticsearch.exceptions.RequestError as e: +        # this is a "user" error +        print("elasticsearch 400: " + str(e.info), file=sys.stderr) +        raise ValueError(str(e.info)) +    except elasticsearch.exceptions.TransportError as e: +        # all other errors +        print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) +        raise IOError(str(e.info)) -    resp["query"] = { "q": q } -    return resp +    # convert from objects to python dicts +    results = [] +    for h in resp: +        r = h._d_ +        #print(json.dumps(h.meta._d_, indent=2)) +        r['_highlights'] = [] +        if 'highlight' in dir(h.meta): +            highlights = h.meta.highlight._d_ +            for k in highlights: +                r['_highlights'] += highlights[k] +        results.append(r) + +    for h in results: +        # Handle surrogate strings that elasticsearch returns sometimes, +        # probably due to mangled data processing in some pipeline. +        # "Crimes against Unicode"; production workaround +        for key in h: +            if type(h[key]) is str: +                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + +    return FulltextHits( +        count_returned=len(results), +        count_found=int(resp.hits.total), +        offset=offset, +        limit=limit, +        deep_page_limit=deep_page_limit, +        query_time_ms=int(resp.took), +        results=results, +    ) diff --git a/fatcat_scholar/templates/base.html b/fatcat_scholar/templates/base.html index fffdadd..a9eedd4 100644 --- a/fatcat_scholar/templates/base.html +++ b/fatcat_scholar/templates/base.html @@ -58,6 +58,15 @@      .ui.card a:hover {        opacity: 0.75;      } + +    .text-button { +      border: none; +      background-color: inherit; +      padding: 0; +      font-family: inherit; +      cursor: pointer; +      display: inline-block; +    }    </style>    <title>{%- block title -%}scholar.archive.org{%- endblock %}</title>    <link rel="stylesheet" @@ -116,11 +125,11 @@          </div>        </div>        <div class="ui twelve wide column"> -        <form class="" id="fulltext_query" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction"> +        <form class="" id="search_form" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">            <meta itemprop="target" content="https://{{ settings.FATCAT_DOMAIN }}/fulltext/search?q={q}"/>            <div class="ui form">              <div class="ui action input large fluid"> -              <input type="text" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;"> +              <input type="search" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;" {% if query and query.q %}value="{{ query.q }}"{% endif %}>                <button class="ui green button" style="border-radius: 0; background-color: #44a25a; font-size: 1.2rem;">{{ _("Search") }}</button>              </div>            </div> diff --git a/fatcat_scholar/templates/search.html b/fatcat_scholar/templates/search.html index c2cd3ea..6d9fec2 100644 --- a/fatcat_scholar/templates/search.html +++ b/fatcat_scholar/templates/search.html @@ -5,68 +5,40 @@  <div class="ui equal height divided grid" style="margin-top: 1em;">    <div class="ui two wide column"> -    {% if found %} -    <div class="ui tiny statistic" style="width: 100%; text-align: center;"> -      <div class="value"> -        {{ "{:,}".format(found.count_found) }} -      </div> -      <div class="label"> -        Hits -      </div> +    {% if hits %} +    <div style="width: 100%; text-align: right;"> +      {# <h2>{{ "{:,}".format(hits.count_found) }}</h2> #} +      <h3 style="font-size: {% if hits.count_found >= 10000000 %}1.0em{% elif hits.count_found >= 1000 %}1.5em{% else %}2.0em{% endif %};">{{ "{:,}".format(hits.count_found) }}</h3> +      Hits +    </div> +    <div style="text-align: right;"> +      <span style="color: rgba(0,0,0,0.4);">in {{ "{:0.2}".format(hits.query_time_ms/1000.0) }}sec</span>      </div>      <div class="ui clearing divider"></div> -    {% else %} -      Maybe some filters, facets, counts over here?      {% endif %}      <div style="text-align: right;"> -      <span style="color: rgba(0,0,0,0.4);">Release Date</span> -      <div class="ui link list" style="margin-top: 0.3em;"> -        <a class="active item"><b>All Time</b></a> -        <a class="item" style="color: rgba(0,0,0);">Past Week</a> -        <a class="item" style="color: rgba(0,0,0);">Past Year</a> -        <a class="item" style="color: rgba(0,0,0);">Since 2000</a> -        <a class="item" style="color: rgba(0,0,0);">Before 1925</a> -      </div> - -      <br> -      <span style="color: rgba(0,0,0,0.4);">Resource Type</span> -      <div class="ui link list" style="margin-top: 0.3em;"> -        <a class="active item"><b>Papers</b></a> -        <a class="item" style="color: rgba(0,0,0);">Reports</a> -        <a class="item" style="color: rgba(0,0,0);">Datasets</a> -        <a class="item" style="color: rgba(0,0,0);">Everything</a> -      </div> - -      <br> -      <span style="color: rgba(0,0,0,0.4);">Availability</span> -      <div class="ui link list" style="margin-top: 0.3em;"> -        <a class="active item"><b>Everything</b></a> -        <a class="item" style="color: rgba(0,0,0);">Open Access</a> -        <a class="item" style="color: rgba(0,0,0);">Lending</a> -        <a class="item" style="color: rgba(0,0,0);">Paywall</a> -      </div> - -      <br> -      <span style="color: rgba(0,0,0,0.4);">Sort Order</span> -      <div class="ui link list" style="margin-top: 0.3em;"> -        <a class="active item"><b>Relevancy</b></a> -        <a class="item" style="color: rgba(0,0,0);">Recent First</a> -        <a class="item" style="color: rgba(0,0,0);">Oldest First</a> -      </div> +      {{ search_macros.query_option(query.time_options, query.filter_time) }} +      {{ search_macros.query_option(query.type_options, query.filter_type) }} +      {{ search_macros.query_option(query.availability_options, query.filter_availability) }} +      {{ search_macros.query_option(query.sort_options, query.sort_order) }}      </div>    </div>    <div class="ui thirteen wide column"> -    {% if found %} -      {% if found.results %} -        {% for paper in found.results %} +    {% if search_error %} +      <div class="ui error message"> +        <div class="header">Query Error</div> +        <p>{{ search_error }}</p> +      </div> +    {% elif hits %} +      {% if hits.results %} +        {% for paper in hits.results %}            {{ search_macros.fulltext_search_result_row(paper) }}          {% endfor %}        {% endif %}      {% else %} -    Some other message here when there is no search? Like a bunch of examples? -    Or does that ever happen... we can just run query on "*". +    No search submitted? Some message should go here.      {% endif %}    </div>  </div> diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index f5c4d85..bb96b61 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -1,6 +1,5 @@  {% macro fulltext_search_result_row(paper) -%} -{% set lang_code = "en" %}  <div class="ui grid">  <div class="thirteen wide column">    {# ### TITLE ROW #} @@ -175,3 +174,26 @@  </div>  </div>  {% endmacro %} + +{% macro query_option(options, selected) -%} +<span style="color: rgba(0,0,0,0.4);">{{ options.label }}</span> +<div class="ui link list" style="margin-top: 0.3em;"> +  {% if selected %} +    <input form="search_form" type="hidden" name="{{ options.slug }}" value="{{ selected }}"> +  {% endif %} +  {% for opt in options.list %} +    <button class="text-button" form="search_form" type="submit" name="{{ options.slug }}" value="{{ opt.slug }}"> +      {% if selected == opt.slug or (not selected and opt.slug == options.default) %} +        <span style="font-weight: bold;"> +      {% else %} +        <span> +      {% endif %} +      {{ opt.label }} +      </span> +    </button> +    <br> +    </span> +  {% endfor %} +</div> +<br> +{% endmacro %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index a148c8b..06d6a02 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -4,25 +4,23 @@ This contains the FastAPI web application and RESTful API.  So far there are few endpoints, so we just put them all here!  """ +import sys  from enum import Enum -  import babel.support  from fastapi import FastAPI, APIRouter, Request, Depends, Header  from fastapi.staticfiles import StaticFiles  from fastapi.responses import HTMLResponse -from pydantic import BaseModel  from dynaconf import settings +from typing import List, Dict, Tuple, Optional, Any, Sequence  from fatcat_scholar.hacks import Jinja2Templates -from fatcat_scholar.search import do_fulltext_search +from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits -print(settings.as_dict()) +print(f"dynaconf settings: {settings.as_dict()}", file=sys.stderr)  I18N_LANG_TRANSLATIONS = ["de", "zh"]  I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,] -class SearchParams(BaseModel): -    q: str = ""  class LangPrefix:      """ @@ -61,7 +59,7 @@ async def home():      return {"endpoints": {"/": "this", "/search": "fulltext search"}}  @api.get("/search", operation_id="get_search") -async def search(query: SearchParams = Depends(SearchParams)): +async def search(query: FulltextQuery = Depends(FulltextQuery)):      return {"message": "search results would go here, I guess"}  web = APIRouter() @@ -115,24 +113,48 @@ def load_i18n_templates():  i18n_templates = load_i18n_templates() +  @web.get("/", include_in_schema=False)  async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):      if content.mimetype == "application/json":          return await home()      return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) +  @web.get("/about", include_in_schema=False)  async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)):      return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) +  @web.get("/search", include_in_schema=False) -async def web_search(request: Request, query: SearchParams = Depends(SearchParams), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): +async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): +      if content.mimetype == "application/json":          return await search(query) -    found = None -    if query.q: -        found = do_fulltext_search(query.q) -    return i18n_templates[lang.code].TemplateResponse("search.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix, "found": found}) +    hits : Optional[FulltextHits] = None +    search_error: Optional[str] = None +    status_code: int = 200 +    if query.q is not None: +        try: +            hits = do_fulltext_search(query) +        except ValueError as e: +            search_error = str(e) +            status_code = 400 +        except IOError as e: +            search_error = str(e) +            status_code = 500 +    return i18n_templates[lang.code].TemplateResponse( +        "search.html", +        { +            "request": request, +            "locale": lang.code, +            "lang_prefix": lang.prefix, +            "hits": hits, +            "search_error": search_error, +            "query": query, +        }, +        status_code=status_code, +    )  app = FastAPI( | 
