author     Bryan Newbold <bnewbold@archive.org>    2020-05-21 14:23:31 -0700
committer  Bryan Newbold <bnewbold@archive.org>    2020-05-21 14:23:41 -0700
commit     01ae25c1bf24c8d9f7721f49122a15bf522bdbb4 (patch)
tree       36add4d6c2c7050f646a14c2a0ffc5f03436ad03 /fatcat_scholar/search.py
parent     6c4f539463074bcb563675b6e4f19464339e5641 (diff)
search query improvements
- wire up most of the filters and sort order
- query sticks around in search box
- crude error message (needs work)
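For context, a minimal sketch of how a caller might drive the new interface (the handler function and parameter plumbing below are illustrative, not part of this commit; only FulltextQuery, FulltextHits, and do_fulltext_search come from fatcat_scholar/search.py):

```python
from fatcat_scholar.search import FulltextQuery, do_fulltext_search

def handle_search(params: dict) -> dict:
    # Hypothetical web-handler glue: build a typed query object from
    # raw request parameters; unset fields fall back to model defaults.
    query = FulltextQuery(
        q=params.get("q") or "*",  # do_fulltext_search assumes q is set
        filter_time=params.get("filter_time"),
        filter_type=params.get("filter_type"),
        sort_order=params.get("sort_order"),
    )
    try:
        hits = do_fulltext_search(query)
    except ValueError as e:
        # the "crude error message" path: bad filter values and
        # unparsable query strings both surface as ValueError
        return {"query": query, "error": str(e)}
    # returning `query` alongside the hits is what lets the template
    # keep the query string in the search box
    return {"query": query, "hits": hits}
```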
Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r--   fatcat_scholar/search.py | 228
1 file changed, 147 insertions(+), 81 deletions(-)
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index f94c403..f8dd7fb 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -3,112 +3,123 @@ Helpers to make elasticsearch queries.
 """
 
+import sys
 import json
 import datetime
-
 import elasticsearch
-from elasticsearch_dsl import Search, Q
+from pydantic import BaseModel
 from dynaconf import settings
-
-
-def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
-
-    # Sanity checks
-    if limit > 100:
-        limit = 100
-    if offset < 0:
-        offset = 0
-    if offset > deep_page_limit:
-        # Avoid deep paging problem.
-        offset = deep_page_limit
-
-    search = search[int(offset):int(offset)+int(limit)]
-
-    try:
-        resp = search.execute()
-    except elasticsearch.exceptions.RequestError as e:
-        # this is a "user" error
-        print("elasticsearch 400: " + str(e.info))
-        #flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason']))
-        # XXX: abort(e.status_code)
-        raise e
-    except elasticsearch.exceptions.TransportError as e:
-        # all other errors
-        print("elasticsearch non-200 status code: {}".format(e.info))
-        # XXX: abort(e.status_code)
-        raise e
-
-    # convert from objects to python dicts
-    results = []
-    for h in resp:
-        r = h._d_
-        #print(json.dumps(h.meta._d_, indent=2))
-        r['_highlights'] = []
-        if 'highlight' in dir(h.meta):
-            highlights = h.meta.highlight._d_
-            for k in highlights:
-                r['_highlights'] += highlights[k]
-        results.append(r)
-
-    for h in results:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-
-    return {
-        "count_returned": len(results),
-        "count_found": int(resp.hits.total),
-        "results": results,
-        "offset": offset,
-        "limit": limit,
-        "deep_page_limit": deep_page_limit,
-        "query_time_ms": int(resp.took),
+from dataclasses import dataclass
+from elasticsearch_dsl import Search, Q
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+
+
+class FulltextQuery(BaseModel):
+    q: Optional[str] = None
+    limit: Optional[int] = None
+    offset: Optional[int] = None
+    filter_time: Optional[str] = None
+    filter_type: Optional[str] = None
+    filter_availability: Optional[str] = None
+    sort_order: Optional[str] = None
+    time_options: Any = {
+        "label": "Release Date",
+        "slug": "filter_time",
+        "default": "all_time",
+        "list": [
+            {"label": "All Time", "slug": "all_time"},
+            {"label": "Past Week", "slug": "past_week"},
+            {"label": "Past Year", "slug": "past_year"},
+            {"label": "Since 2000", "slug": "since_2000"},
+            {"label": "Before 1925", "slug": "before_1925"},
+        ],
+    }
+    type_options: Any = {
+        "label": "Resource Type",
+        "slug": "filter_type",
+        "default": "papers",
+        "list": [
+            {"label": "Papers", "slug": "papers"},
+            {"label": "Reports", "slug": "reports"},
+            {"label": "Datasets", "slug": "datasets"},
+            {"label": "Everything", "slug": "everything"},
+        ],
+    }
+    availability_options: Any = {
+        "label": "Availability",
+        "slug": "filter_availability",
+        "default": "everything",
+        "list": [
+            {"label": "Everything", "slug": "everything"},
+            {"label": "Fulltext", "slug": "fulltext"},
+            {"label": "Open Access", "slug": "oa"},
+        ],
+    }
+    sort_options: Any = {
+        "label": "Sort Order",
+        "slug": "sort_order",
+        "default": "relevancy",
+        "list": [
+            {"label": "All Time", "slug": "relevancy"},
+            {"label": "Recent First", "slug": "time_desc"},
+            {"label": "Oldest First", "slug": "time_asc"},
+        ],
     }
 
 
-def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
+class FulltextHits(BaseModel):
+    count_returned: int
+    count_found: int
+    offset: int
+    limit: int
+    deep_page_limit: int
+    query_time_ms: int
+    results: List[Any]
 
-    # Convert raw DOIs to DOI queries
-    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
-        q = 'doi:"{}"'.format(q)
+
+def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
 
     es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)
     search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
 
+    # Convert raw DOIs to DOI queries
+    if len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1:
+        search = search.filter("terms", doi=query.q)
+        query.q = "*"
+
     # type filters
-    if filter_type == "papers":
+    if query.filter_type == "papers":
         search = search.filter("terms", type=[
             "article-journal",
             "paper-conference",
             "chapter",
         ])
-    elif filter_type == "reports":
+    elif query.filter_type == "reports":
         search = search.filter("terms", type=[
             "report",
             "standard",
         ])
-    elif filter_type == "datasets":
+    elif query.filter_type == "datasets":
         search = search.filter("terms", type=[
             "dataset",
             "software",
         ])
-    elif filter_type == "everything" or filter_type == None:
+    elif query.filter_type == "everything" or query.filter_type == None:
         pass
     else:
-        # XXX: abort(400)
-        raise Exception()
+        raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'")
 
     # time filters
-    if filter_time == "past_week":
+    if query.filter_time == "past_week":
         week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
         search = search.filter("range", date=dict(gte=week_ago_date))
-    elif filter_time == "this_year":
-        search = search.filter("term", year=datetime.date.today().year)
-    elif filter_time == "since_2000":
+    elif query.filter_time == "past_year":
+        # (date in the past year) or (year is this year)
+        # the later to catch papers which don't have release_date defined
+        year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365))
+        this_year = datetime.date.today().year
+        search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year))
+    elif query.filter_time == "since_2000":
         search = search.filter("range", year=dict(gte=2000))
-    elif filter_time == "before_1925":
+    elif query.filter_time == "before_1925":
         search = search.filter("range", year=dict(lt=1925))
-    elif filter_time == "all_time" or filter_time == None:
+    elif query.filter_time == "all_time" or query.filter_time == None:
         pass
     else:
-        # XXX: abort(400)
-        raise Exception()
+        raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
 
     search = search.query(
         'query_string',
-        query=q,
+        query=query.q,
         default_operator="AND",
         analyze_wildcard=True,
         lenient=True,
@@ -127,7 +138,62 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
         fragment_size=250,
     )
 
-    resp = generic_search_execute(search, offset=offset)
+    # sort order
+    if query.sort_order == "time_asc":
+        search = search.sort("year", "date")
+    elif query.sort_order == "time_desc":
+        search = search.sort("-year", "-date")
+    elif query.sort_order == "relevancy" or query.sort_order == None:
+        pass
+    else:
+        raise ValueError(f"Unknown 'sort_order' parameter value: '{query.sort_order}'")
+
+    # Sanity checks
+    limit = min((int(query.limit or 25), 100))
+    offset = max((int(query.offset or 0), 0))
+    if offset > deep_page_limit:
+        # Avoid deep paging problem.
+        offset = deep_page_limit
+
+    search = search[offset:offset+limit]
+
+    try:
+        resp = search.execute()
+    except elasticsearch.exceptions.RequestError as e:
+        # this is a "user" error
+        print("elasticsearch 400: " + str(e.info), file=sys.stderr)
+        raise ValueError(str(e.info))
+    except elasticsearch.exceptions.TransportError as e:
+        # all other errors
+        print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr)
+        raise IOError(str(e.info))
 
-    resp["query"] = { "q": q }
-    return resp
+    # convert from objects to python dicts
+    results = []
+    for h in resp:
+        r = h._d_
+        #print(json.dumps(h.meta._d_, indent=2))
+        r['_highlights'] = []
+        if 'highlight' in dir(h.meta):
+            highlights = h.meta.highlight._d_
+            for k in highlights:
+                r['_highlights'] += highlights[k]
+        results.append(r)
+
+    for h in results:
+        # Handle surrogate strings that elasticsearch returns sometimes,
+        # probably due to mangled data processing in some pipeline.
+        # "Crimes against Unicode"; production workaround
+        for key in h:
+            if type(h[key]) is str:
+                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+
+    return FulltextHits(
+        count_returned=len(results),
+        count_found=int(resp.hits.total),
+        offset=offset,
+        limit=limit,
+        deep_page_limit=deep_page_limit,
+        query_time_ms=int(resp.took),
+        results=results,
+    )
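Worth noting in the new past_year branch: combining two Q objects with `|` in elasticsearch-dsl compiles to a single bool query with should clauses, so a hit needs either a date within the last 365 days or a year equal to the current year (the fallback for records with no release_date). A standalone sketch, using a placeholder index name since to_dict() needs no live cluster:

```python
import datetime
from elasticsearch_dsl import Search, Q

year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365))
this_year = datetime.date.today().year

# No client attached: we only inspect the generated query body.
s = Search(index="example_fulltext")
s = s.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year))

# Produces, in filter (non-scoring) context:
# {"query": {"bool": {"filter": [{"bool": {"should": [
#     {"range": {"date": {"gte": "<year_ago_date>"}}},
#     {"term": {"year": <this_year>}}
# ]}}]}}}
print(s.to_dict())
```

Because the inner bool has only should clauses, Elasticsearch defaults minimum_should_match to 1, which gives exactly the either/or behavior the inline comment in the diff describes.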