diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 14:23:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 14:23:41 -0700 |
commit | 01ae25c1bf24c8d9f7721f49122a15bf522bdbb4 (patch) | |
tree | 36add4d6c2c7050f646a14c2a0ffc5f03436ad03 | |
parent | 6c4f539463074bcb563675b6e4f19464339e5641 (diff) | |
download | fatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.tar.gz fatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.zip |
search query improvements
- wire up most of the filters and sort order
- query sticks around in search box
- crude error message (needs work)
-rw-r--r-- | fatcat_scholar/search.py | 228 | ||||
-rw-r--r-- | fatcat_scholar/templates/base.html | 13 | ||||
-rw-r--r-- | fatcat_scholar/templates/search.html | 70 | ||||
-rw-r--r-- | fatcat_scholar/templates/search_macros.html | 24 | ||||
-rw-r--r-- | fatcat_scholar/web.py | 46 |
5 files changed, 236 insertions, 145 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index f94c403..f8dd7fb 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -3,112 +3,123 @@ Helpers to make elasticsearch queries. """ +import sys import json import datetime - import elasticsearch -from elasticsearch_dsl import Search, Q +from pydantic import BaseModel from dynaconf import settings - - -def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000): - - # Sanity checks - if limit > 100: - limit = 100 - if offset < 0: - offset = 0 - if offset > deep_page_limit: - # Avoid deep paging problem. - offset = deep_page_limit - - search = search[int(offset):int(offset)+int(limit)] - - try: - resp = search.execute() - except elasticsearch.exceptions.RequestError as e: - # this is a "user" error - print("elasticsearch 400: " + str(e.info)) - #flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason'])) - # XXX: abort(e.status_code) - raise e - except elasticsearch.exceptions.TransportError as e: - # all other errors - print("elasticsearch non-200 status code: {}".format(e.info)) - # XXX: abort(e.status_code) - raise e - - # convert from objects to python dicts - results = [] - for h in resp: - r = h._d_ - #print(json.dumps(h.meta._d_, indent=2)) - r['_highlights'] = [] - if 'highlight' in dir(h.meta): - highlights = h.meta.highlight._d_ - for k in highlights: - r['_highlights'] += highlights[k] - results.append(r) - - for h in results: - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. - # "Crimes against Unicode"; production workaround - for key in h: - if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - - return { - "count_returned": len(results), - "count_found": int(resp.hits.total), - "results": results, - "offset": offset, - "limit": limit, - "deep_page_limit": deep_page_limit, - "query_time_ms": int(resp.took), +from dataclasses import dataclass +from elasticsearch_dsl import Search, Q +from typing import List, Dict, Tuple, Optional, Any, Sequence + + +class FulltextQuery(BaseModel): + q: Optional[str] = None + limit: Optional[int] = None + offset: Optional[int] = None + filter_time: Optional[str] = None + filter_type: Optional[str] = None + filter_availability: Optional[str] = None + sort_order: Optional[str] = None + time_options: Any = { + "label": "Release Date", + "slug": "filter_time", + "default": "all_time", + "list": [ + {"label": "All Time", "slug": "all_time"}, + {"label": "Past Week", "slug": "past_week"}, + {"label": "Past Year", "slug": "past_year"}, + {"label": "Since 2000", "slug": "since_2000"}, + {"label": "Before 1925", "slug": "before_1925"}, + ], + } + type_options: Any = { + "label": "Resource Type", + "slug": "filter_type", + "default": "papers", + "list": [ + {"label": "Papers", "slug": "papers"}, + {"label": "Reports", "slug": "reports"}, + {"label": "Datasets", "slug": "datasets"}, + {"label": "Everything", "slug": "everything"}, + ], + } + availability_options: Any = { + "label": "Availability", + "slug": "filter_availability", + "default": "everything", + "list": [ + {"label": "Everything", "slug": "everything"}, + {"label": "Fulltext", "slug": "fulltext"}, + {"label": "Open Access", "slug": "oa"}, + ], + } + sort_options: Any = { + "label": "Sort Order", + "slug": "sort_order", + "default": "relevancy", + "list": [ + {"label": "All Time", "slug": "relevancy"}, + {"label": "Recent First", "slug": "time_desc"}, + {"label": "Oldest First", "slug": "time_asc"}, + ], } -def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None): +class FulltextHits(BaseModel): + count_returned: int + count_found: int + offset: int + limit: int + deep_page_limit: int + query_time_ms: int + results: List[Any] - # Convert raw DOIs to DOI queries - if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: - q = 'doi:"{}"'.format(q) + +def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits: es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND) search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) + # Convert raw DOIs to DOI queries + if len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1: + search = search.filter("terms", doi=query.q) + query.q = "*" + # type filters - if filter_type == "papers": + if query.filter_type == "papers": search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ]) - elif filter_type == "reports": + elif query.filter_type == "reports": search = search.filter("terms", type=[ "report", "standard", ]) - elif filter_type == "datasets": + elif query.filter_type == "datasets": search = search.filter("terms", type=[ "dataset", "software", ]) - elif filter_type == "everything" or filter_type == None: + elif query.filter_type == "everything" or query.filter_type == None: pass else: - # XXX: abort(400) - raise Exception() + raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'") # time filters - if filter_time == "past_week": + if query.filter_time == "past_week": week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7)) search = search.filter("range", date=dict(gte=week_ago_date)) - elif filter_time == "this_year": - search = search.filter("term", year=datetime.date.today().year) - elif filter_time == "since_2000": + elif query.filter_time == "past_year": + # (date in the past year) or (year is this year) + # the later to catch papers which don't have release_date defined + year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365)) + this_year = datetime.date.today().year + search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year)) + elif query.filter_time == "since_2000": search = search.filter("range", year=dict(gte=2000)) - elif filter_time == "before_1925": + elif query.filter_time == "before_1925": search = search.filter("range", year=dict(lt=1925)) - elif filter_time == "all_time" or filter_time == None: + elif query.filter_time == "all_time" or query.filter_time == None: pass else: - # XXX: abort(400) - raise Exception() + raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'") search = search.query( 'query_string', - query=q, + query=query.q, default_operator="AND", analyze_wildcard=True, lenient=True, @@ -127,7 +138,62 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None fragment_size=250, ) - resp = generic_search_execute(search, offset=offset) + # sort order + if query.sort_order == "time_asc": + search = search.sort("year", "date") + elif query.sort_order == "time_desc": + search = search.sort("-year", "-date") + elif query.sort_order == "relevancy" or query.sort_order == None: + pass + else: + raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'") + + # Sanity checks + limit = min((int(query.limit or 25), 100)) + offset = max((int(query.offset or 0), 0)) + if offset > deep_page_limit: + # Avoid deep paging problem. + offset = deep_page_limit + + search = search[offset:offset+limit] + + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e: + # this is a "user" error + print("elasticsearch 400: " + str(e.info), file=sys.stderr) + raise ValueError(str(e.info)) + except elasticsearch.exceptions.TransportError as e: + # all other errors + print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) + raise IOError(str(e.info)) - resp["query"] = { "q": q } - return resp + # convert from objects to python dicts + results = [] + for h in resp: + r = h._d_ + #print(json.dumps(h.meta._d_, indent=2)) + r['_highlights'] = [] + if 'highlight' in dir(h.meta): + highlights = h.meta.highlight._d_ + for k in highlights: + r['_highlights'] += highlights[k] + results.append(r) + + for h in results: + # Handle surrogate strings that elasticsearch returns sometimes, + # probably due to mangled data processing in some pipeline. + # "Crimes against Unicode"; production workaround + for key in h: + if type(h[key]) is str: + h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + + return FulltextHits( + count_returned=len(results), + count_found=int(resp.hits.total), + offset=offset, + limit=limit, + deep_page_limit=deep_page_limit, + query_time_ms=int(resp.took), + results=results, + ) diff --git a/fatcat_scholar/templates/base.html b/fatcat_scholar/templates/base.html index fffdadd..a9eedd4 100644 --- a/fatcat_scholar/templates/base.html +++ b/fatcat_scholar/templates/base.html @@ -58,6 +58,15 @@ .ui.card a:hover { opacity: 0.75; } + + .text-button { + border: none; + background-color: inherit; + padding: 0; + font-family: inherit; + cursor: pointer; + display: inline-block; + } </style> <title>{%- block title -%}scholar.archive.org{%- endblock %}</title> <link rel="stylesheet" @@ -116,11 +125,11 @@ </div> </div> <div class="ui twelve wide column"> - <form class="" id="fulltext_query" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction"> + <form class="" id="search_form" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction"> <meta itemprop="target" content="https://{{ settings.FATCAT_DOMAIN }}/fulltext/search?q={q}"/> <div class="ui form"> <div class="ui action input large fluid"> - <input type="text" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;"> + <input type="search" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;" {% if query and query.q %}value="{{ query.q }}"{% endif %}> <button class="ui green button" style="border-radius: 0; background-color: #44a25a; font-size: 1.2rem;">{{ _("Search") }}</button> </div> </div> diff --git a/fatcat_scholar/templates/search.html b/fatcat_scholar/templates/search.html index c2cd3ea..6d9fec2 100644 --- a/fatcat_scholar/templates/search.html +++ b/fatcat_scholar/templates/search.html @@ -5,68 +5,40 @@ <div class="ui equal height divided grid" style="margin-top: 1em;"> <div class="ui two wide column"> - {% if found %} - <div class="ui tiny statistic" style="width: 100%; text-align: center;"> - <div class="value"> - {{ "{:,}".format(found.count_found) }} - </div> - <div class="label"> - Hits - </div> + {% if hits %} + <div style="width: 100%; text-align: right;"> + {# <h2>{{ "{:,}".format(hits.count_found) }}</h2> #} + <h3 style="font-size: {% if hits.count_found >= 10000000 %}1.0em{% elif hits.count_found >= 1000 %}1.5em{% else %}2.0em{% endif %};">{{ "{:,}".format(hits.count_found) }}</h3> + Hits + </div> + <div style="text-align: right;"> + <span style="color: rgba(0,0,0,0.4);">in {{ "{:0.2}".format(hits.query_time_ms/1000.0) }}sec</span> </div> <div class="ui clearing divider"></div> - {% else %} - Maybe some filters, facets, counts over here? {% endif %} <div style="text-align: right;"> - <span style="color: rgba(0,0,0,0.4);">Release Date</span> - <div class="ui link list" style="margin-top: 0.3em;"> - <a class="active item"><b>All Time</b></a> - <a class="item" style="color: rgba(0,0,0);">Past Week</a> - <a class="item" style="color: rgba(0,0,0);">Past Year</a> - <a class="item" style="color: rgba(0,0,0);">Since 2000</a> - <a class="item" style="color: rgba(0,0,0);">Before 1925</a> - </div> - - <br> - <span style="color: rgba(0,0,0,0.4);">Resource Type</span> - <div class="ui link list" style="margin-top: 0.3em;"> - <a class="active item"><b>Papers</b></a> - <a class="item" style="color: rgba(0,0,0);">Reports</a> - <a class="item" style="color: rgba(0,0,0);">Datasets</a> - <a class="item" style="color: rgba(0,0,0);">Everything</a> - </div> - - <br> - <span style="color: rgba(0,0,0,0.4);">Availability</span> - <div class="ui link list" style="margin-top: 0.3em;"> - <a class="active item"><b>Everything</b></a> - <a class="item" style="color: rgba(0,0,0);">Open Access</a> - <a class="item" style="color: rgba(0,0,0);">Lending</a> - <a class="item" style="color: rgba(0,0,0);">Paywall</a> - </div> - - <br> - <span style="color: rgba(0,0,0,0.4);">Sort Order</span> - <div class="ui link list" style="margin-top: 0.3em;"> - <a class="active item"><b>Relevancy</b></a> - <a class="item" style="color: rgba(0,0,0);">Recent First</a> - <a class="item" style="color: rgba(0,0,0);">Oldest First</a> - </div> + {{ search_macros.query_option(query.time_options, query.filter_time) }} + {{ search_macros.query_option(query.type_options, query.filter_type) }} + {{ search_macros.query_option(query.availability_options, query.filter_availability) }} + {{ search_macros.query_option(query.sort_options, query.sort_order) }} </div> </div> <div class="ui thirteen wide column"> - {% if found %} - {% if found.results %} - {% for paper in found.results %} + {% if search_error %} + <div class="ui error message"> + <div class="header">Query Error</div> + <p>{{ search_error }}</p> + </div> + {% elif hits %} + {% if hits.results %} + {% for paper in hits.results %} {{ search_macros.fulltext_search_result_row(paper) }} {% endfor %} {% endif %} {% else %} - Some other message here when there is no search? Like a bunch of examples? - Or does that ever happen... we can just run query on "*". + No search submitted? Some message should go here. {% endif %} </div> </div> diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index f5c4d85..bb96b61 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -1,6 +1,5 @@ {% macro fulltext_search_result_row(paper) -%} -{% set lang_code = "en" %} <div class="ui grid"> <div class="thirteen wide column"> {# ### TITLE ROW #} @@ -175,3 +174,26 @@ </div> </div> {% endmacro %} + +{% macro query_option(options, selected) -%} +<span style="color: rgba(0,0,0,0.4);">{{ options.label }}</span> +<div class="ui link list" style="margin-top: 0.3em;"> + {% if selected %} + <input form="search_form" type="hidden" name="{{ options.slug }}" value="{{ selected }}"> + {% endif %} + {% for opt in options.list %} + <button class="text-button" form="search_form" type="submit" name="{{ options.slug }}" value="{{ opt.slug }}"> + {% if selected == opt.slug or (not selected and opt.slug == options.default) %} + <span style="font-weight: bold;"> + {% else %} + <span> + {% endif %} + {{ opt.label }} + </span> + </button> + <br> + </span> + {% endfor %} +</div> +<br> +{% endmacro %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index a148c8b..06d6a02 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -4,25 +4,23 @@ This contains the FastAPI web application and RESTful API. So far there are few endpoints, so we just put them all here! """ +import sys from enum import Enum - import babel.support from fastapi import FastAPI, APIRouter, Request, Depends, Header from fastapi.staticfiles import StaticFiles from fastapi.responses import HTMLResponse -from pydantic import BaseModel from dynaconf import settings +from typing import List, Dict, Tuple, Optional, Any, Sequence from fatcat_scholar.hacks import Jinja2Templates -from fatcat_scholar.search import do_fulltext_search +from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits -print(settings.as_dict()) +print(f"dynaconf settings: {settings.as_dict()}", file=sys.stderr) I18N_LANG_TRANSLATIONS = ["de", "zh"] I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,] -class SearchParams(BaseModel): - q: str = "" class LangPrefix: """ @@ -61,7 +59,7 @@ async def home(): return {"endpoints": {"/": "this", "/search": "fulltext search"}} @api.get("/search", operation_id="get_search") -async def search(query: SearchParams = Depends(SearchParams)): +async def search(query: FulltextQuery = Depends(FulltextQuery)): return {"message": "search results would go here, I guess"} web = APIRouter() @@ -115,24 +113,48 @@ def load_i18n_templates(): i18n_templates = load_i18n_templates() + @web.get("/", include_in_schema=False) async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): if content.mimetype == "application/json": return await home() return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + @web.get("/about", include_in_schema=False) async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)): return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + @web.get("/search", include_in_schema=False) -async def web_search(request: Request, query: SearchParams = Depends(SearchParams), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): +async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): + if content.mimetype == "application/json": return await search(query) - found = None - if query.q: - found = do_fulltext_search(query.q) - return i18n_templates[lang.code].TemplateResponse("search.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix, "found": found}) + hits : Optional[FulltextHits] = None + search_error: Optional[str] = None + status_code: int = 200 + if query.q is not None: + try: + hits = do_fulltext_search(query) + except ValueError as e: + search_error = str(e) + status_code = 400 + except IOError as e: + search_error = str(e) + status_code = 500 + return i18n_templates[lang.code].TemplateResponse( + "search.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "hits": hits, + "search_error": search_error, + "query": query, + }, + status_code=status_code, + ) app = FastAPI( |