aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-21 14:23:31 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-21 14:23:41 -0700
commit01ae25c1bf24c8d9f7721f49122a15bf522bdbb4 (patch)
tree36add4d6c2c7050f646a14c2a0ffc5f03436ad03
parent6c4f539463074bcb563675b6e4f19464339e5641 (diff)
downloadfatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.tar.gz
fatcat-scholar-01ae25c1bf24c8d9f7721f49122a15bf522bdbb4.zip
search query improvements
- wire up most of the filters and sort order - query sticks around in search box - crude error message (needs work)
-rw-r--r--fatcat_scholar/search.py228
-rw-r--r--fatcat_scholar/templates/base.html13
-rw-r--r--fatcat_scholar/templates/search.html70
-rw-r--r--fatcat_scholar/templates/search_macros.html24
-rw-r--r--fatcat_scholar/web.py46
5 files changed, 236 insertions, 145 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index f94c403..f8dd7fb 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -3,112 +3,123 @@
Helpers to make elasticsearch queries.
"""
+import sys
import json
import datetime
-
import elasticsearch
-from elasticsearch_dsl import Search, Q
+from pydantic import BaseModel
from dynaconf import settings
-
-
-def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
-
- # Sanity checks
- if limit > 100:
- limit = 100
- if offset < 0:
- offset = 0
- if offset > deep_page_limit:
- # Avoid deep paging problem.
- offset = deep_page_limit
-
- search = search[int(offset):int(offset)+int(limit)]
-
- try:
- resp = search.execute()
- except elasticsearch.exceptions.RequestError as e:
- # this is a "user" error
- print("elasticsearch 400: " + str(e.info))
- #flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason']))
- # XXX: abort(e.status_code)
- raise e
- except elasticsearch.exceptions.TransportError as e:
- # all other errors
- print("elasticsearch non-200 status code: {}".format(e.info))
- # XXX: abort(e.status_code)
- raise e
-
- # convert from objects to python dicts
- results = []
- for h in resp:
- r = h._d_
- #print(json.dumps(h.meta._d_, indent=2))
- r['_highlights'] = []
- if 'highlight' in dir(h.meta):
- highlights = h.meta.highlight._d_
- for k in highlights:
- r['_highlights'] += highlights[k]
- results.append(r)
-
- for h in results:
- # Handle surrogate strings that elasticsearch returns sometimes,
- # probably due to mangled data processing in some pipeline.
- # "Crimes against Unicode"; production workaround
- for key in h:
- if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-
- return {
- "count_returned": len(results),
- "count_found": int(resp.hits.total),
- "results": results,
- "offset": offset,
- "limit": limit,
- "deep_page_limit": deep_page_limit,
- "query_time_ms": int(resp.took),
+from dataclasses import dataclass
+from elasticsearch_dsl import Search, Q
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+
+
+class FulltextQuery(BaseModel):
+    """
+    Search parameters for a fulltext query, plus the option tables the
+    templates use to render the filter and sort controls.
+
+    Each ``*_options`` table has:
+
+    - ``label``: heading shown above the group of choices
+    - ``slug``: name of the query parameter the group controls
+    - ``default``: slug of the choice highlighted when none is selected
+    - ``list``: the selectable choices, each with its own label and slug
+    """
+
+    # Request parameters; None means the parameter was not supplied.
+    q: Optional[str] = None
+    limit: Optional[int] = None
+    offset: Optional[int] = None
+    filter_time: Optional[str] = None
+    filter_type: Optional[str] = None
+    # NOTE(review): no filter in this diff reads filter_availability yet.
+    filter_availability: Optional[str] = None
+    sort_order: Optional[str] = None
+
+    time_options: Any = {
+        "label": "Release Date",
+        "slug": "filter_time",
+        "default": "all_time",
+        "list": [
+            {"label": "All Time", "slug": "all_time"},
+            {"label": "Past Week", "slug": "past_week"},
+            {"label": "Past Year", "slug": "past_year"},
+            {"label": "Since 2000", "slug": "since_2000"},
+            {"label": "Before 1925", "slug": "before_1925"},
+        ],
+    }
+    type_options: Any = {
+        "label": "Resource Type",
+        "slug": "filter_type",
+        "default": "papers",
+        "list": [
+            {"label": "Papers", "slug": "papers"},
+            {"label": "Reports", "slug": "reports"},
+            {"label": "Datasets", "slug": "datasets"},
+            {"label": "Everything", "slug": "everything"},
+        ],
+    }
+    availability_options: Any = {
+        "label": "Availability",
+        "slug": "filter_availability",
+        "default": "everything",
+        "list": [
+            {"label": "Everything", "slug": "everything"},
+            {"label": "Fulltext", "slug": "fulltext"},
+            {"label": "Open Access", "slug": "oa"},
+        ],
+    }
+    sort_options: Any = {
+        "label": "Sort Order",
+        "slug": "sort_order",
+        "default": "relevancy",
+        "list": [
+            # The label here read "All Time", a copy/paste from time_options.
+            {"label": "Relevancy", "slug": "relevancy"},
+            {"label": "Recent First", "slug": "time_desc"},
+            {"label": "Oldest First", "slug": "time_asc"},
+        ],
     }
-def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
+class FulltextHits(BaseModel):
+ """
+ One page of search results, as returned by do_fulltext_search().
+ """
+ # number of hits included in `results`
+ count_returned: int
+ # total hit count reported by elasticsearch for the query
+ count_found: int
+ # paging window actually applied, after clamping
+ offset: int
+ limit: int
+ # largest offset the search will page to
+ deep_page_limit: int
+ # the response's `took` value
+ query_time_ms: int
+ # one dict per hit, each with an added '_highlights' list
+ results: List[Any]
- # Convert raw DOIs to DOI queries
- if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
- q = 'doi:"{}"'.format(q)
+
+def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)
search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
+    # Convert raw DOIs to DOI queries: a single token such as "10.1234/abcd"
+    # is matched exactly against the 'doi' field rather than parsed as a
+    # query string. A 'term' filter takes one value; 'terms' needs a list and
+    # is rejected by elasticsearch when handed a bare string.
+    # The leading `query.q and` skips detection when q is unset, instead of
+    # raising AttributeError on None.
+    # NOTE(review): overwriting query.q means the template echoes "*" back
+    # into the search box instead of the DOI the user typed.
+    if query.q and len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1:
+        search = search.filter("term", doi=query.q)
+        query.q = "*"
+
# type filters
- if filter_type == "papers":
+    # NOTE(review): type_options declares "papers" as the default choice and
+    # the UI highlights it when nothing is selected, but a missing
+    # filter_type applies no type filter at all. Confirm which is intended.
+    if query.filter_type == "papers":
         search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ])
-    elif filter_type == "reports":
+    elif query.filter_type == "reports":
         search = search.filter("terms", type=[ "report", "standard", ])
-    elif filter_type == "datasets":
+    elif query.filter_type == "datasets":
         search = search.filter("terms", type=[ "dataset", "software", ])
-    elif filter_type == "everything" or filter_type == None:
+    elif query.filter_type == "everything" or query.filter_type is None:
         pass
     else:
-        # XXX: abort(400)
-        raise Exception()
+        # ValueError is what the web layer maps to an HTTP 400 response
+        raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'")
     # time filters
-    if filter_time == "past_week":
+    if query.filter_time == "past_week":
         week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
         search = search.filter("range", date=dict(gte=week_ago_date))
-    elif filter_time == "this_year":
-        search = search.filter("term", year=datetime.date.today().year)
-    elif filter_time == "since_2000":
+    elif query.filter_time == "past_year":
+        # (date in the past year) or (year is this year)
+        # the latter to catch papers which don't have release_date defined
+        year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365))
+        this_year = datetime.date.today().year
+        search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year))
+    elif query.filter_time == "since_2000":
         search = search.filter("range", year=dict(gte=2000))
-    elif filter_time == "before_1925":
+    elif query.filter_time == "before_1925":
         search = search.filter("range", year=dict(lt=1925))
-    elif filter_time == "all_time" or filter_time == None:
+    elif query.filter_time == "all_time" or query.filter_time is None:
         pass
     else:
-        # XXX: abort(400)
-        raise Exception()
+        raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
search = search.query(
'query_string',
- query=q,
+ query=query.q,
default_operator="AND",
analyze_wildcard=True,
lenient=True,
@@ -127,7 +138,62 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
fragment_size=250,
)
- resp = generic_search_execute(search, offset=offset)
+    # sort order; "relevancy" (or no value) keeps elasticsearch's own ordering
+    if query.sort_order == "time_asc":
+        search = search.sort("year", "date")
+    elif query.sort_order == "time_desc":
+        search = search.sort("-year", "-date")
+    elif query.sort_order == "relevancy" or query.sort_order is None:
+        pass
+    else:
+        raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'")
+
+    # Sanity checks: page size stays within 1..100 (default 25), and the
+    # offset within 0..deep_page_limit. The lower bound on limit mirrors the
+    # offset clamp, so a negative value cannot produce a negative slice.
+    limit = max(1, min(int(query.limit or 25), 100))
+    offset = max(int(query.offset or 0), 0)
+    if offset > deep_page_limit:
+        # Avoid deep paging problem.
+        offset = deep_page_limit
+
+    search = search[offset:offset+limit]
+
+    # Translate elasticsearch failures into the two exception types the web
+    # layer handles, chaining the original error so tracebacks keep the cause.
+    try:
+        resp = search.execute()
+    except elasticsearch.exceptions.RequestError as e:
+        # this is a "user" error
+        print("elasticsearch 400: " + str(e.info), file=sys.stderr)
+        raise ValueError(str(e.info)) from e
+    except elasticsearch.exceptions.TransportError as e:
+        # all other errors
+        print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr)
+        raise IOError(str(e.info)) from e
- resp["query"] = { "q": q }
- return resp
+    # Flatten each hit object into a plain dict. Highlight fragments from
+    # every highlighted field are pooled into one '_highlights' list.
+    results = []
+    for hit in resp:
+        doc = hit._d_
+        doc['_highlights'] = []
+        if 'highlight' in dir(hit.meta):
+            for fragments in hit.meta.highlight._d_.values():
+                doc['_highlights'] += fragments
+
+        # Handle surrogate strings that elasticsearch returns sometimes,
+        # probably due to mangled data processing in some pipeline.
+        # "Crimes against Unicode"; production workaround: round-trip each
+        # top-level string through UTF-8, dropping what cannot be encoded.
+        for field in doc:
+            if type(doc[field]) is str:
+                doc[field] = doc[field].encode('utf8', 'ignore').decode('utf8')
+
+        results.append(doc)
+
+    return FulltextHits(
+        results=results,
+        count_returned=len(results),
+        count_found=int(resp.hits.total),
+        query_time_ms=int(resp.took),
+        offset=offset,
+        limit=limit,
+        deep_page_limit=deep_page_limit,
+    )
diff --git a/fatcat_scholar/templates/base.html b/fatcat_scholar/templates/base.html
index fffdadd..a9eedd4 100644
--- a/fatcat_scholar/templates/base.html
+++ b/fatcat_scholar/templates/base.html
@@ -58,6 +58,15 @@
.ui.card a:hover {
opacity: 0.75;
}
+
+ .text-button {
+ border: none;
+ background-color: inherit;
+ padding: 0;
+ font-family: inherit;
+ cursor: pointer;
+ display: inline-block;
+ }
</style>
<title>{%- block title -%}scholar.archive.org{%- endblock %}</title>
<link rel="stylesheet"
@@ -116,11 +125,11 @@
</div>
</div>
<div class="ui twelve wide column">
- <form class="" id="fulltext_query" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">
+ {# The filter/sort buttons rendered by search_macros.query_option attach to this form through form="search_form". The action value previously had a stray `") }}` after it. #}
+ <form class="" id="search_form" action="{{ lang_prefix }}/search" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">
<meta itemprop="target" content="https://{{ settings.FATCAT_DOMAIN }}/fulltext/search?q={q}"/>
<div class="ui form">
<div class="ui action input large fluid">
- <input type="text" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;">
+ <input type="search" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;" {% if query and query.q %}value="{{ query.q }}"{% endif %}>
<button class="ui green button" style="border-radius: 0; background-color: #44a25a; font-size: 1.2rem;">{{ _("Search") }}</button>
</div>
</div>
diff --git a/fatcat_scholar/templates/search.html b/fatcat_scholar/templates/search.html
index c2cd3ea..6d9fec2 100644
--- a/fatcat_scholar/templates/search.html
+++ b/fatcat_scholar/templates/search.html
@@ -5,68 +5,40 @@
<div class="ui equal height divided grid" style="margin-top: 1em;">
<div class="ui two wide column">
- {% if found %}
- <div class="ui tiny statistic" style="width: 100%; text-align: center;">
- <div class="value">
- {{ "{:,}".format(found.count_found) }}
- </div>
- <div class="label">
- Hits
- </div>
+ {% if hits %}
+ <div style="width: 100%; text-align: right;">
+ {# <h2>{{ "{:,}".format(hits.count_found) }}</h2> #}
+ <h3 style="font-size: {% if hits.count_found >= 10000000 %}1.0em{% elif hits.count_found >= 1000 %}1.5em{% else %}2.0em{% endif %};">{{ "{:,}".format(hits.count_found) }}</h3>
+ Hits
+ </div>
+ <div style="text-align: right;">
+ {# Fixed-point with two decimals. A bare ".2" precision means two significant digits and switches to exponent notation (e.g. "1.2e+01") once the query takes ten seconds or more. #}
+ <span style="color: rgba(0,0,0,0.4);">in {{ "{:0.2f}".format(hits.query_time_ms/1000.0) }}sec</span>
</div>
<div class="ui clearing divider"></div>
- {% else %}
- Maybe some filters, facets, counts over here?
{% endif %}
<div style="text-align: right;">
- <span style="color: rgba(0,0,0,0.4);">Release Date</span>
- <div class="ui link list" style="margin-top: 0.3em;">
- <a class="active item"><b>All Time</b></a>
- <a class="item" style="color: rgba(0,0,0);">Past Week</a>
- <a class="item" style="color: rgba(0,0,0);">Past Year</a>
- <a class="item" style="color: rgba(0,0,0);">Since 2000</a>
- <a class="item" style="color: rgba(0,0,0);">Before 1925</a>
- </div>
-
- <br>
- <span style="color: rgba(0,0,0,0.4);">Resource Type</span>
- <div class="ui link list" style="margin-top: 0.3em;">
- <a class="active item"><b>Papers</b></a>
- <a class="item" style="color: rgba(0,0,0);">Reports</a>
- <a class="item" style="color: rgba(0,0,0);">Datasets</a>
- <a class="item" style="color: rgba(0,0,0);">Everything</a>
- </div>
-
- <br>
- <span style="color: rgba(0,0,0,0.4);">Availability</span>
- <div class="ui link list" style="margin-top: 0.3em;">
- <a class="active item"><b>Everything</b></a>
- <a class="item" style="color: rgba(0,0,0);">Open Access</a>
- <a class="item" style="color: rgba(0,0,0);">Lending</a>
- <a class="item" style="color: rgba(0,0,0);">Paywall</a>
- </div>
-
- <br>
- <span style="color: rgba(0,0,0,0.4);">Sort Order</span>
- <div class="ui link list" style="margin-top: 0.3em;">
- <a class="active item"><b>Relevancy</b></a>
- <a class="item" style="color: rgba(0,0,0);">Recent First</a>
- <a class="item" style="color: rgba(0,0,0);">Oldest First</a>
- </div>
+ {{ search_macros.query_option(query.time_options, query.filter_time) }}
+ {{ search_macros.query_option(query.type_options, query.filter_type) }}
+ {{ search_macros.query_option(query.availability_options, query.filter_availability) }}
+ {{ search_macros.query_option(query.sort_options, query.sort_order) }}
</div>
</div>
<div class="ui thirteen wide column">
- {% if found %}
- {% if found.results %}
- {% for paper in found.results %}
+ {% if search_error %}
+ <div class="ui error message">
+ <div class="header">Query Error</div>
+ <p>{{ search_error }}</p>
+ </div>
+ {% elif hits %}
+ {% if hits.results %}
+ {% for paper in hits.results %}
{{ search_macros.fulltext_search_result_row(paper) }}
{% endfor %}
{% endif %}
{% else %}
- Some other message here when there is no search? Like a bunch of examples?
- Or does that ever happen... we can just run query on "*".
+ No search submitted? Some message should go here.
{% endif %}
</div>
</div>
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index f5c4d85..bb96b61 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -1,6 +1,5 @@
{% macro fulltext_search_result_row(paper) -%}
-{% set lang_code = "en" %}
<div class="ui grid">
<div class="thirteen wide column">
{# ### TITLE ROW #}
@@ -175,3 +174,26 @@
</div>
</div>
{% endmacro %}
+
+{% macro query_option(options, selected) -%}
+{# Render one group of filter/sort choices as submit buttons tied to the
+   search form. `options` is a table with label, slug, default and list
+   entries; `selected` is the current value of that parameter, if any. #}
+<span style="color: rgba(0,0,0,0.4);">{{ options.label }}</span>
+<div class="ui link list" style="margin-top: 0.3em;">
+  {# resubmit the current choice along with the rest of the form #}
+  {% if selected %}
+    <input form="search_form" type="hidden" name="{{ options.slug }}" value="{{ selected }}">
+  {% endif %}
+  {% for opt in options.list %}
+    <button class="text-button" form="search_form" type="submit" name="{{ options.slug }}" value="{{ opt.slug }}">
+      {# bold marks the active choice, or the default when none is selected #}
+      {% if selected == opt.slug or (not selected and opt.slug == options.default) %}
+        <span style="font-weight: bold;">
+      {% else %}
+        <span>
+      {% endif %}
+        {{ opt.label }}
+      </span>
+    </button>
+    <br>
+  {% endfor %}
+</div>
+<br>
+{% endmacro %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index a148c8b..06d6a02 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -4,25 +4,23 @@ This contains the FastAPI web application and RESTful API.
So far there are few endpoints, so we just put them all here!
"""
+import sys
from enum import Enum
-
import babel.support
from fastapi import FastAPI, APIRouter, Request, Depends, Header
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse
-from pydantic import BaseModel
from dynaconf import settings
+from typing import List, Dict, Tuple, Optional, Any, Sequence
from fatcat_scholar.hacks import Jinja2Templates
-from fatcat_scholar.search import do_fulltext_search
+from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits
-print(settings.as_dict())
+print(f"dynaconf settings: {settings.as_dict()}", file=sys.stderr)
I18N_LANG_TRANSLATIONS = ["de", "zh"]
I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,]
-class SearchParams(BaseModel):
- q: str = ""
class LangPrefix:
"""
@@ -61,7 +59,7 @@ async def home():
return {"endpoints": {"/": "this", "/search": "fulltext search"}}
@api.get("/search", operation_id="get_search")
-async def search(query: SearchParams = Depends(SearchParams)):
+async def search(query: FulltextQuery = Depends(FulltextQuery)):
return {"message": "search results would go here, I guess"}
web = APIRouter()
@@ -115,24 +113,48 @@ def load_i18n_templates():
i18n_templates = load_i18n_templates()
+
@web.get("/", include_in_schema=False)
async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):
if content.mimetype == "application/json":
return await home()
return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+
@web.get("/about", include_in_schema=False)
async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)):
return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+
@web.get("/search", include_in_schema=False)
-async def web_search(request: Request, query: SearchParams = Depends(SearchParams), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):
+async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):
+
if content.mimetype == "application/json":
return await search(query)
- found = None
- if query.q:
- found = do_fulltext_search(query.q)
- return i18n_templates[lang.code].TemplateResponse("search.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix, "found": found})
+ # hits stays None when no 'q' parameter was supplied; search.html then
+ # renders its "no search submitted" branch.
+ hits : Optional[FulltextHits] = None
+ search_error: Optional[str] = None
+ status_code: int = 200
+ if query.q is not None:
+ try:
+ hits = do_fulltext_search(query)
+ except ValueError as e:
+ # unknown filter/sort value, or a query elasticsearch rejected
+ search_error = str(e)
+ status_code = 400
+ except IOError as e:
+ # any other elasticsearch transport failure
+ search_error = str(e)
+ status_code = 500
+ # The page is rendered on errors too, so the message appears in the normal
+ # layout; only the HTTP status code differs.
+ return i18n_templates[lang.code].TemplateResponse(
+ "search.html",
+ {
+ "request": request,
+ "locale": lang.code,
+ "lang_prefix": lang.prefix,
+ "hits": hits,
+ "search_error": search_error,
+ "query": query,
+ },
+ status_code=status_code,
+ )
app = FastAPI(