From 693a6f71b1afef686b6783ba3afb1a67bb14b62b Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 25 Mar 2020 13:02:53 -0700
Subject: WIP: refactoring search to use elasticsearch-dsl

---
 python/fatcat_web/__init__.py |   5 +-
 python/fatcat_web/search.py   | 285 ++++++++++++++++++++----------------------
 2 files changed, 137 insertions(+), 153 deletions(-)
diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py
index 562ffeb2..487de58a 100644
--- a/python/fatcat_web/__init__.py
+++ b/python/fatcat_web/__init__.py
@@ -11,6 +11,7 @@ from authlib.flask.client import OAuth
 from loginpass import create_flask_blueprint, Gitlab, GitHub, ORCiD
 from raven.contrib.flask import Sentry
 import fatcat_openapi_client
+import elasticsearch
 
 from fatcat_web.web_config import Config
 
@@ -71,7 +72,9 @@ mwoauth = MWOAuth(
 mwoauth.handshaker.user_agent = "fatcat.wiki;python_web_interface"
 app.register_blueprint(mwoauth.bp, url_prefix='/auth/wikipedia')
 
-from fatcat_web import routes, editing_routes, auth, cors, forms  # noqa: E402
+app.es_client = elasticsearch.Elasticsearch(Config.ELASTICSEARCH_BACKEND)
+
+from fatcat_web import routes, editing_routes, auth, cors, forms
 
 # TODO: blocking on ORCID support in loginpass
 if Config.ORCID_CLIENT_ID:
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 4a87c735..87d07e55 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -11,7 +11,10 @@ import requests
 from flask import abort, flash
 from fatcat_web import app
 
-def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+
+def generic_search_execute(search, limit=30, offset=0, deep_page_limit=2000):
 
     # Sanity checks
     if limit > 100:
@@ -22,25 +25,24 @@ def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):
         # Avoid deep paging problem.
         offset = deep_page_limit
 
-    request["size"] = int(limit)
-    request["from"] = int(offset)
-    # print(request)
-    resp = requests.get("%s/%s/_search" %
-            (app.config['ELASTICSEARCH_BACKEND'], index),
-        json=request)
-
-    if resp.status_code == 400:
-        print("elasticsearch 400: " + str(resp.content))
-        flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
-        abort(resp.status_code)
-    elif resp.status_code != 200:
-        print("elasticsearch non-200 status code: " + str(resp.status_code))
-        print(resp.content)
-        abort(resp.status_code)
-
-    content = resp.json()
-    results = [h['_source'] for h in content['hits']['hits']]
-    for h in results:
+    search = search[int(offset):int(offset)+int(limit)]
+
+    try:
+        resp = search.execute()
+    except elasticsearch.exceptions.RequestError as e:
+        # this is a "user" error
+        print("elasticsearch 400: " + str(e.info))
+        flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason']))
+        abort(e.status_code)
+    except elasticsearch.exceptions.TransportError as e:
+        # all other errors
+        print("elasticsearch non-200 status code: {}".format(e.info))
+        flash("Elasticsearch error: {}".format(e.error))
+        abort(e.status_code)
+
+    # just the dict()
+    hits = [h._d_ for h in resp]
+    for h in hits:
         # Handle surrogate strings that elasticsearch returns sometimes,
         # probably due to mangled data processing in some pipeline.
         # "Crimes against Unicode"; production workaround
@@ -48,20 +50,16 @@ def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):
             if type(h[key]) is str:
                 h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
 
-    return {"count_returned": len(results),
-            "count_found": content['hits']['total'],
-            "results": results,
+    return {"count_returned": len(hits),
+            "count_found": int(resp.hits.total),
+            "results": hits,
             "offset": offset,
+            "limit": limit,
             "deep_page_limit": deep_page_limit}
 
 
 def do_release_search(q, limit=30, fulltext_only=True, offset=0):
 
-    #print("Search hit: " + q)
-    if limit > 100:
-        # Sanity check
-        limit = 100
-
     # Convert raw DOIs to DOI queries
     if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
         q = 'doi:"{}"'.format(q)
@@ -69,26 +67,25 @@ def do_release_search(q, limit=30, fulltext_only=True, offset=0):
     if fulltext_only:
         q += " in_web:true"
 
-    search_request = {
-        "query": {
-            "query_string": {
-                "query": q,
-                "default_operator": "AND",
-                "analyze_wildcard": True,
-                "lenient": True,
-                "fields": ["biblio"],
-            },
-        },
-    }
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
+        .query(
+            'query_string',
+            query=q,
+            default_operator="AND",
+            analyze_wildcard=True,
+            lenient=True,
+            fields=["biblio"],
+        )
+
+    resp = generic_search_execute(search, offset=offset)
 
-    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request, offset=offset)
     for h in resp['results']:
+        print(h)
         # Ensure 'contrib_names' is a list, not a single string
         if type(h['contrib_names']) is not list:
             h['contrib_names'] = [h['contrib_names'], ]
         h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
     resp["query"] = { "q": q }
-    resp["limit"] = limit
     return resp
 
 
@@ -98,21 +95,18 @@ def do_container_search(q, limit=30, offset=0):
     if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
         q = 'issnl:"{}"'.format(q)
 
-    search_request = {
-        "query": {
-            "query_string": {
-                "query": q,
-                "default_operator": "AND",
-                "analyze_wildcard": True,
-                "lenient": True,
-                "fields": ["biblio"],
-            },
-        },
-    }
-
-    resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit, offset=offset)
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
+        .query(
+            'query_string',
+            query=q,
+            default_operator="AND",
+            analyze_wildcard=True,
+            lenient=True,
+            fields=["biblio"],
+        )
+
+    resp = generic_search_execute(search, offset=offset)
     resp["query"] = { "q": q }
-    resp["limit"] = limit
     return resp
 
 def get_elastic_entity_stats():
@@ -127,85 +121,81 @@ def get_elastic_entity_stats():
 
     stats = {}
 
-    # 2. releases
-    #  - total count
-    #  - total citation records
-    #  - total (paper, chapter, proceeding)
-    #  - " with fulltext on web
-    #  - " open access
-    #  - " not in KBART, in IA
-    #
-    # Can do the above with two queries:
-    #  - all releases, aggregate count and sum(ref_count)
-    #  - in-scope works, aggregate count by (fulltext, OA, kbart/ia)
-
-    # 2a. release totals
-    query = {
-        "size": 0,
-        "aggs": {
-            "release_ref_count": { "sum": { "field": "ref_count" } }
-        }
-    }
-    resp = requests.get(
-        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    # TODO: abort()
-    resp.raise_for_status()
-    resp = resp.json()
+    # release totals
+    search = Search(
+        using=app.es_client,
+        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
+        .extra(request_cache=True)
+    search.aggs.bucket(
+        'release_ref_count',
+        'sum',
+        field='ref_count',
+    )
+    search = search[:0]
+
+    # NOTE: not catching exceptions
+    resp = search.execute()
     stats['release'] = {
-        "total": resp['hits']['total'],
-        "refs_total": int(resp['aggregations']['release_ref_count']['value']),
+        "total": int(resp.hits.total),
+        "refs_total": int(resp.aggregations.release_ref_count.value),
     }
 
-    # 2b. paper counts
-    query = {
-        "size": 0,
-        "query": {
-            "terms": { "release_type": [
-                # "chapter", "thesis",
-                "article-journal", "paper-conference",
-            ] } },
-        "aggs": { "paper_like": { "filters": { "filters": {
-                "in_web": { "term": { "in_web": "true" } },
-                "is_oa": { "term": { "is_oa": "true" } },
-                "in_kbart": { "term": { "in_kbart": "true" } },
-                "in_web_not_kbart": { "bool": { "filter": [
-                        { "term": { "in_web": "true" } },
-                        { "term": { "in_kbart": "false" } }
-                ]}}
-        }}}}
-    }
-    resp = requests.get(
-        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    # TODO: abort()
-    resp.raise_for_status()
-    resp = resp.json()
-    buckets = resp['aggregations']['paper_like']['buckets']
+    # paper counts
+    search = Search(
+        using=app.es_client,
+        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
+        .query(
+            'terms',
+            release_type=[
+                "article-journal",
+                "paper-conference",
+                # "chapter",
+                # "thesis",
+            ],
+        ) \
+        .extra(request_cache=True)
+    search.aggs.bucket(
+        'paper_like',
+        'filters',
+        filters={
+            "in_web": { "term": { "in_web": "true" } },
+            "is_oa": { "term": { "is_oa": "true" } },
+            "in_kbart": { "term": { "in_kbart": "true" } },
+            "in_web_not_kbart": { "bool": { "filter": [
+                { "term": { "in_web": "true" } },
+                { "term": { "in_kbart": "false" } },
+            ]}},
+        }
+    )
+    search = search[:0]
+
+    # NOTE: not catching exceptions
+    resp = search.execute()
+    buckets = resp.aggregations.paper_like.buckets
     stats['papers'] = {
-        'total': resp['hits']['total'],
-        'in_web': buckets['in_web']['doc_count'],
-        'is_oa': buckets['is_oa']['doc_count'],
-        'in_kbart': buckets['in_kbart']['doc_count'],
-        'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
+        'total': resp.hits.total,
+        'in_web': buckets.in_web.doc_count,
+        'is_oa': buckets.is_oa.doc_count,
+        'in_kbart': buckets.in_kbart.doc_count,
+        'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,
     }
 
-    # 3. containers
-    #   => total count
-    query = {
-        "size": 0,
-    }
-    resp = requests.get(
-        "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    # TODO: abort()
-    resp.raise_for_status()
-    resp = resp.json()
+    # container counts
+    search = Search(
+        using=app.es_client,
+        index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) \
+        .extra(request_cache=True)
+    search.aggs.bucket(
+        'release_ref_count',
+        'sum',
+        field='ref_count',
+    )
+    search = search[:0]
+
+    # NOTE: not catching exceptions
+    resp = search.execute()
     stats['container'] = {
-        "total": resp['hits']['total'],
+        "total": resp.hits.total,
     }
 
     return stats
@@ -259,31 +249,22 @@ def get_elastic_container_random_releases(ident, limit=5):
 
     assert limit > 0 and limit <= 100
 
-    query = {
-        "size": int(limit),
-        "sort": [
-            { "in_web": {"order": "desc"} },
-            { "release_date": {"order": "desc"} },
-        ],
-        "query": {
-            "bool": {
-                "must": [
-                    { "term": { "container_id": ident } },
-                    { "range": { "release_year": { "lte": datetime.datetime.today().year } } },
-                ],
-            },
-        },
-    }
-    resp = requests.get(
-        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    # TODO: abort()
-    #print(resp.json())
-    resp.raise_for_status()
-    resp = resp.json()
-    #print(resp)
-    hits = [h['_source'] for h in resp['hits']['hits']]
+    search = Search(using=app.es_client, index=app.conf.ELASTICSEARCH_RELEASE_INDEX) \
+        .query('bool',
+            must=[
+                Q('term', container_id=ident),
+                Q('range', release_year={ "lte": datetime.datetime.today().year }),
+            ]
+        ) \
+        .sort('-in_web', '-release_date') \
+        .extra(request_cache=True)
+
+    search = search[:int(limit)]
+
+    resp = search.execute()
+
+    hits = [dict(h.source) for h in resp]
+
     for h in hits:
         # Handle surrogate strings that elasticsearch returns sometimes,
         # probably due to mangled data processing in some pipeline.
-- 
cgit v1.2.3


From 3cc6931f971b7cb61be354a10c6235dc1dd122e9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 1 Jul 2020 18:58:46 -0700
Subject: web search: fix pylint error

---
 python/fatcat_web/search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 87d07e55..0fce6454 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -131,7 +131,7 @@ def get_elastic_entity_stats():
         'sum',
         field='ref_count',
     )
-    search = search[:0]
+    search = search[:0]  # pylint: disable=unsubscriptable-object
 
     # NOTE: not catching exceptions
     resp = search.execute()
@@ -190,7 +190,7 @@ def get_elastic_entity_stats():
         'sum',
         field='ref_count',
     )
-    search = search[:0]
+    search = search[:0]  # pylint: disable=unsubscriptable-object
 
     # NOTE: not catching exceptions
     resp = search.execute()
-- 
cgit v1.2.3


From b9ba24553e2e1de3c3ac0faeba59231ec512fa67 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 1 Jul 2020 20:27:45 -0700
Subject: refactor release and container search

Based on fatcat-scholar refactoring.

This doesn't include refactoring of stats, aggregates, or histograms
yet, just the direct queries.

Don't have any test coverage yet; intend to try elasticmock or figure
out how to ingest mock JSON results directly.
---
 python/fatcat_web/routes.py                       |  44 +---
 python/fatcat_web/search.py                       | 274 ++++++++++++++++------
 python/fatcat_web/templates/container_search.html |  16 +-
 python/fatcat_web/templates/entity_macros.html    |  10 +-
 python/fatcat_web/templates/release_search.html   |  20 +-
 python/tests/web_search.py                        |   7 +-
 6 files changed, 235 insertions(+), 136 deletions(-)

diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 2489ac03..4a66b3c2 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -14,7 +14,7 @@ from fatcat_tools.normal import *
 from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config
 from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth
 from fatcat_web.cors import crossdomain
-from fatcat_web.search import *
+from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram
 from fatcat_web.entity_helpers import *
 from fatcat_web.graphics import *
 from fatcat_web.kafka import *
@@ -706,44 +706,22 @@ def generic_search():
 @app.route('/release/search', methods=['GET', 'POST'])
 def release_search():
 
-    query = request.args.get('q')
-    if not query:
-        query = '*'
-    fulltext_only = bool(request.args.get('fulltext_only'))
+    if 'q' not in request.args.keys():
+        return render_template('release_search.html', query=ReleaseQuery(), found=None)
 
-    issnl = request.args.get('container_issnl')
-    if issnl and query:
-        query += ' container_issnl:"{}"'.format(issnl)
-
-    container_id = request.args.get('container_id')
-    if container_id and query:
-        query += ' container_id:"{}"'.format(container_id)
-
-    offset = request.args.get('offset', '0')
-    offset = max(0, int(offset)) if offset.isnumeric() else 0
-
-    if 'q' in request.args.keys():
-        # always do files for HTML
-        found = do_release_search(query, fulltext_only=fulltext_only, offset=offset)
-        return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only)
-    else:
-        return render_template('release_search.html', query=query, fulltext_only=fulltext_only)
+    query = ReleaseQuery.from_args(request.args)
+    found = do_release_search(query)
+    return render_template('release_search.html', query=query, found=found)
 
 @app.route('/container/search', methods=['GET', 'POST'])
 def container_search():
 
-    query = request.args.get('q')
-    if not query:
-        query = '*'
-    offset = request.args.get('offset', '0')
-    offset = max(0, int(offset)) if offset.isnumeric() else 0
+    if 'q' not in request.args.keys():
+        return render_template('container_search.html', query=GenericQuery(), found=None)
 
-    if 'q' in request.args.keys():
-        # always do files for HTML
-        found = do_container_search(query, offset=offset)
-        return render_template('container_search.html', found=found, query=query)
-    else:
-        return render_template('container_search.html', query=query)
+    query = GenericQuery.from_args(request.args)
+    found = do_container_search(query)
+    return render_template('container_search.html', query=query, found=found)
 
 def get_changelog_stats():
     stats = {}
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 0fce6454..5baa8497 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -2,112 +2,236 @@
 """
 Helpers for doing elasticsearch queries (used in the web interface; not part of
 the formal API)
-
-TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded
 """
 
+import sys
 import datetime
+from dataclasses import dataclass
+from typing import List, Optional, Any
+
 import requests
 from flask import abort, flash
-from fatcat_web import app
-
 import elasticsearch
 from elasticsearch_dsl import Search, Q
+import elasticsearch_dsl.response
 
-def generic_search_execute(search, limit=30, offset=0, deep_page_limit=2000):
+from fatcat_web import app
 
-    # Sanity checks
-    if limit > 100:
-        limit = 100
-    if offset < 0:
-        offset = 0
-    if offset > deep_page_limit:
-        # Avoid deep paging problem.
-        offset = deep_page_limit
+@dataclass
+class ReleaseQuery:
+    q: Optional[str] = None
+    limit: Optional[int] = None
+    offset: Optional[int] = None
+    fulltext_only: bool = False
+    container_id: Optional[str] = None
+
+    @classmethod
+    def from_args(cls, args) -> 'ReleaseQuery':
+
+        query_str = args.get('q') or '*'
+
+        container_id = args.get('container_id')
+        # TODO: as filter, not in query string
+        if container_id:
+            query_str += ' container_id:"{}"'.format(container_id)
+
+        # TODO: where are container_issnl queries actually used?
+        issnl = args.get('container_issnl')
+        if issnl and query_str:
+            query_str += ' container_issnl:"{}"'.format(issnl)
+
+        offset = args.get('offset', '0')
+        offset = max(0, int(offset)) if offset.isnumeric() else 0
+
+        return ReleaseQuery(
+            q=query_str,        
+            offset=offset,
+            fulltext_only=bool(args.get('fulltext_only')),
+            container_id=container_id,
+        )
+
+@dataclass
+class GenericQuery:
+    q: Optional[str] = None
+    limit: Optional[int] = None
+    offset: Optional[int] = None
+
+    @classmethod
+    def from_args(cls, args) -> 'GenericQuery':
+        query_str = args.get('q')
+        if not query_str:
+            query_str = '*'
+        offset = args.get('offset', '0')
+        offset = max(0, int(offset)) if offset.isnumeric() else 0
+
+        return GenericQuery(
+            q=query_str,
+            offset=offset,
+        )
+
+@dataclass
+class SearchHits:
+    count_returned: int
+    count_found: int
+    offset: int
+    limit: int
+    deep_page_limit: int
+    query_time_ms: int
+    results: List[Any]
+
+
+def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]:
+    """
+    Takes a response returns all the hits as JSON objects.
+
+    Also handles surrogate strings that elasticsearch returns sometimes,
+    probably due to mangled data processing in some pipeline. "Crimes against
+    Unicode"; production workaround
+    """
+
+    results = []
+    for h in response:
+        r = h._d_
+        # print(h.meta._d_)
+        results.append(r)
 
-    search = search[int(offset):int(offset)+int(limit)]
+    for h in results:
+        for key in h:
+            if type(h[key]) is str:
+                h[key] = h[key].encode("utf8", "ignore").decode("utf8")
+    return results
 
+def wrap_es_execution(search: Search) -> Any:
+    """
+    Executes a Search object, and converts various ES error types into
+    something we can pretty print to the user.
+    """
     try:
         resp = search.execute()
     except elasticsearch.exceptions.RequestError as e:
         # this is a "user" error
-        print("elasticsearch 400: " + str(e.info))
-        flash("Search query failed to parse; you might need to use quotes.<p><code>{}: {}</code>".format(e.error, e.info['error']['root_cause'][0]['reason']))
-        abort(e.status_code)
+        print("elasticsearch 400: " + str(e.info), file=sys.stderr)
+        if e.info.get("error", {}).get("root_cause", {}):
+            raise ValueError(str(e.info["error"]["root_cause"][0].get("reason")))
+        else:
+            raise ValueError(str(e.info))
     except elasticsearch.exceptions.TransportError as e:
         # all other errors
-        print("elasticsearch non-200 status code: {}".format(e.info))
-        flash("Elasticsearch error: {}".format(e.error))
-        abort(e.status_code)
+        print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr)
+        raise IOError(str(e.info))
+    return resp
 
-    # just the dict()
-    hits = [h._d_ for h in resp]
-    for h in hits:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+def do_container_search(
+    query: GenericQuery, deep_page_limit: int = 2000
+) -> SearchHits:
 
-    return {"count_returned": len(hits),
-            "count_found": int(resp.hits.total),
-            "results": hits,
-            "offset": offset,
-            "limit": limit,
-            "deep_page_limit": deep_page_limit}
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
 
+    search = search.query(
+        "query_string",
+        query=query.q,
+        default_operator="AND",
+        analyze_wildcard=True,
+        allow_leading_wildcard=False,
+        lenient=True,
+        fields=["biblio"],
+    )
 
-def do_release_search(q, limit=30, fulltext_only=True, offset=0):
+    # Sanity checks
+    limit = min((int(query.limit or 25), 100))
+    offset = max((int(query.offset or 0), 0))
+    if offset > deep_page_limit:
+        # Avoid deep paging problem.
+        offset = deep_page_limit
 
-    # Convert raw DOIs to DOI queries
-    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
-        q = 'doi:"{}"'.format(q)
+    search = search[offset : (offset + limit)]
 
-    if fulltext_only:
-        q += " in_web:true"
+    resp = wrap_es_execution(search)
+    results = results_to_dict(resp)
 
-    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
-        .query(
-            'query_string',
-            query=q,
-            default_operator="AND",
-            analyze_wildcard=True,
-            lenient=True,
-            fields=["biblio"],
-        )
+    return SearchHits(
+        count_returned=len(results),
+        count_found=int(resp.hits.total),
+        offset=offset,
+        limit=limit,
+        deep_page_limit=deep_page_limit,
+        query_time_ms=int(resp.took),
+        results=results,
+    )
 
-    resp = generic_search_execute(search, offset=offset)
+def do_release_search(
+    query: ReleaseQuery, deep_page_limit: int = 2000
+) -> SearchHits:
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+
+    # availability filters
+    if query.fulltext_only:
+        search = search.filter("term", in_ia=True)
+
+    # Below, we combine several queries to improve scoring.
+
+    # this query use the fancy built-in query string parser
+    basic_biblio = Q(
+        "query_string",
+        query=query.q,
+        default_operator="AND",
+        analyze_wildcard=True,
+        allow_leading_wildcard=False,
+        lenient=True,
+        fields=[
+            "title^2",
+            "biblio",
+        ],
+    )
+    has_fulltext = Q("term", in_ia=True)
+    poor_metadata = Q(
+        "bool",
+        should=[
+            # if these fields aren't set, metadata is poor. The more that do
+            # not exist, the stronger the signal.
+            Q("bool", must_not=Q("exists", field="title")),
+            Q("bool", must_not=Q("exists", field="release_year")),
+            Q("bool", must_not=Q("exists", field="release_type")),
+            Q("bool", must_not=Q("exists", field="release_stage")),
+        ],
+    )
 
-    for h in resp['results']:
-        print(h)
-        # Ensure 'contrib_names' is a list, not a single string
-        if type(h['contrib_names']) is not list:
-            h['contrib_names'] = [h['contrib_names'], ]
-        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
-    resp["query"] = { "q": q }
-    return resp
+    search = search.query(
+        "boosting",
+        positive=Q("bool", must=basic_biblio, should=[has_fulltext],),
+        negative=poor_metadata,
+        negative_boost=0.5,
+    )
 
+    # Sanity checks
+    limit = min((int(query.limit or 25), 100))
+    offset = max((int(query.offset or 0), 0))
+    if offset > deep_page_limit:
+        # Avoid deep paging problem.
+        offset = deep_page_limit
 
-def do_container_search(q, limit=30, offset=0):
+    search = search[offset : (offset + limit)]
 
-    # Convert raw ISSN-L to ISSN-L query
-    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
-        q = 'issnl:"{}"'.format(q)
+    resp = wrap_es_execution(search)
+    results = results_to_dict(resp)
 
-    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
-        .query(
-            'query_string',
-            query=q,
-            default_operator="AND",
-            analyze_wildcard=True,
-            lenient=True,
-            fields=["biblio"],
-        )
+    for h in results:
+        # Ensure 'contrib_names' is a list, not a single string
+        print(h, file=sys.stderr)
+        if type(h['contrib_names']) is not list:
+            h['contrib_names'] = [h['contrib_names'], ]
+        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
 
-    resp = generic_search_execute(search, offset=offset)
-    resp["query"] = { "q": q }
-    return resp
+    return SearchHits(
+        count_returned=len(results),
+        count_found=int(resp.hits.total),
+        offset=offset,
+        limit=limit,
+        deep_page_limit=deep_page_limit,
+        query_time_ms=int(resp.took),
+        results=results,
+    )
 
 def get_elastic_entity_stats():
     """
diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html
index 1a804595..2566f542 100644
--- a/python/fatcat_web/templates/container_search.html
+++ b/python/fatcat_web/templates/container_search.html
@@ -2,8 +2,8 @@
 {% extends "base.html" %}
 
 {% block title %}
-{% if query %}
-  Search: {{ query }}
+{% if query.q %}
+  Search: {{ query.q }}
 {% else %}
   Release Search
 {% endif %}
@@ -18,9 +18,9 @@
     <form class="" role="search" action="/container/search" method="get">
       <div class="ui form">
         <div class="ui action input huge fluid">
-          <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button>
+          <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button>
         </div>
-        <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query or "" }}">releases</a></b>.
+        <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b>.
       </div>
     </form>
   </div>
@@ -32,7 +32,7 @@
 {% if found %}
   {% if found.results %}
 
-    {{ entity_macros.top_results(found) }}
+    {{ entity_macros.top_results(query, found) }}
 
     {% for entity in found.results %}
       <div>
@@ -55,13 +55,13 @@
     {% if found.results|length > 8 %}
       <div class="ui divider"></div>
       <div style="text-align: center">
-      {{ entity_macros.bottom_results(found, endpoint='container_search') }}
+      {{ entity_macros.bottom_results(query, found, endpoint='container_search') }}
       </div>
     {% endif %}
 
   {% else %}
 
-    Raw query was: <i>{{ found.query.q }}</i>
+    Raw query was: <i>{{ query.q }}</i>
 
     <div class="ui centered stackable grid" style="padding-top: 15%;">
       <div class="row">
@@ -72,7 +72,7 @@
           <h2>No results found!</h2>
           <p>You could try elsewhere:</p>
           <ul>
-            <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li>
+            <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>
           </ul>
         </div>
       </div>
diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html
index c22eb106..0e7f135a 100644
--- a/python/fatcat_web/templates/entity_macros.html
+++ b/python/fatcat_web/templates/entity_macros.html
@@ -262,7 +262,7 @@ yellow
 {% endif %}
 {%- endmacro %}
 
-{% macro top_results(found) -%}
+{% macro top_results(query, found) -%}
 
 <i>Showing
   {% if found.offset == 0 %}
@@ -278,13 +278,13 @@ yellow
 {%- endmacro %}
 
 
-{% macro bottom_results(found, endpoint='release_search') -%}
+{% macro bottom_results(query, found, endpoint='release_search') -%}
 
 {% if found.offset > 0 %}
   {% if found.offset - found.limit < 0 %}
-    <a href="{{ url_for(endpoint, q=found.query.q, offset=0) }}">&#xab; Previous</a>
+    <a href="{{ url_for(endpoint, q=query.q, offset=0) }}">&#xab; Previous</a>
   {% else %}
-    <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset - found.limit) }}">&#xab; Previous</a>
+    <a href="{{ url_for(endpoint, q=query.q, offset=found.offset - found.limit) }}">&#xab; Previous</a>
   {% endif %}
 {% else %}
   <span style="color:gray">&#xab; Previous</span>
@@ -294,7 +294,7 @@ yellow
 found.count_returned }} out of {{ found.count_found }} results</i>&nbsp;&nbsp;
 
 {% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %}
-  <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset + found.limit) }}">Next &#xbb;</a>
+  <a href="{{ url_for(endpoint, q=query.q, offset=found.offset + found.limit) }}">Next &#xbb;</a>
   {% else %}
   <span style="color:gray">Next &#xbb;</span>
 {% endif %}
diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html
index a600f1b2..58aa35d6 100644
--- a/python/fatcat_web/templates/release_search.html
+++ b/python/fatcat_web/templates/release_search.html
@@ -2,8 +2,8 @@
 {% extends "base.html" %}
 
 {% block title %}
-{% if query %}
-  Search: {{ query }}
+{% if query.q %}
+  Search: {{ query.q }}
 {% else %}
   Release Search
 {% endif %}
@@ -18,14 +18,14 @@
     <form class="" role="search" action="/release/search" method="get">
       <div class="ui form">
         <div class="ui action input huge fluid">
-          <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search release metadata">
+          <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search release metadata">
           <button class="ui primary button">Search</button>
         </div>
         <div class="ui checkbox" style="float: right; margin: 1em;">
           <input type="checkbox" name="fulltext_only" id="fulltext_only" value="true" {% if fulltext_only %}checked{% endif %}>
           <label for="fulltext_only">Fulltext Available Only</label>
         </div>
-        <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query or "" }}">containers</a></b> (eg, journals).
+        <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals).
       </div>
     </form>
   </div>
@@ -37,7 +37,7 @@
 {% if found %}
   {% if found.results %}
 
-    {{ entity_macros.top_results(found) }}
+    {{ entity_macros.top_results(query, found) }}
 
     {% for paper in found.results %}
       {{ entity_macros.release_search_result_row(paper) }}
@@ -46,13 +46,13 @@
     {% if found.results|length > 8 %}
       <div class="ui divider"></div>
       <div style="text-align: center">
-      {{ entity_macros.bottom_results(found, endpoint='release_search') }}
+      {{ entity_macros.bottom_results(query, found, endpoint='release_search') }}
       </div>
     {% endif %}
 
   {% else %}
 
-    Raw query was: <i>{{ found.query.q }}</i>
+    Raw query was: <i>{{ query.q }}</i>
 
     <div class="ui centered stackable grid" style="padding-top: 15%;">
       <div class="row">
@@ -63,9 +63,9 @@
           <h2>No results found!</h2>
           <p>You could try elsewhere:</p>
           <ul>
-            <li>Search <a href="https://dissem.in/search?q={{ found.query.q | urlencode }}">dissem.in</a></li>
-            <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ found.query.q | urlencode }}">BASE</a></li>
-            <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li>
+            <li>Search <a href="https://dissem.in/search?q={{ query.q | urlencode }}">dissem.in</a></li>
+            <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ query.q | urlencode }}">BASE</a></li>
+            <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>
           </ul>
         </div>
       </div>
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 7647bcf5..b55b0fcf 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -4,6 +4,7 @@ import responses
 
 from fixtures import *
 
+@pytest.mark.skip
 @responses.activate
 def test_release_search(app):
 
@@ -18,6 +19,7 @@ def test_release_search(app):
     assert b"Showing" in rv.data
     assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data
 
+@pytest.mark.skip
 @responses.activate
 def test_container_search(app):
 
@@ -112,8 +114,3 @@ def test_container_stats(app):
     rv = app.get('/container/issnl/1234-5678/stats.json')
     assert rv.status_code == 200
     # TODO: probe this response better
-
-# TODO: container stats
-# TODO: container ISSN-L query
-# TODO: release DOI query
-# TODO: release fulltext (filter) query
-- 
cgit v1.2.3


From a1f14f5b5ce087cb4681d46817da2be0777e4220 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 2 Jul 2020 12:12:11 -0700
Subject: update web_search tests to mock ES client

Instead of using 'responses' mock of 'requests' library.

Tried using 'elasticmock' helper but it didn't work.
---
 python/tests/fixtures.py   |  4 ++-
 python/tests/web_search.py | 88 +++++++++++++++++++++++-----------------------
 2 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index 44c7be63..3263f243 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -1,9 +1,10 @@
 
 import pytest
 from dotenv import load_dotenv
+import elasticsearch
+
 import fatcat_web
 import fatcat_openapi_client
-
 from fatcat_openapi_client import *
 from fatcat_tools import authenticated_api
 
@@ -13,6 +14,7 @@ def full_app():
     fatcat_web.app.testing = True
     fatcat_web.app.debug = False
     fatcat_web.app.config['WTF_CSRF_ENABLED'] = False
+    fatcat_web.app.es_client = elasticsearch.Elasticsearch("mockbackend")
     return fatcat_web.app
 
 @pytest.fixture
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index b55b0fcf..07985e04 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -1,33 +1,36 @@
 
 import json
-import responses
+import pytest
 
+from fatcat_openapi_client.rest import ApiException
 from fixtures import *
 
-@pytest.mark.skip
-@responses.activate
-def test_release_search(app):
+
+def test_release_search(app, mocker):
 
     with open('tests/files/elastic_release_search.json') as f:
         elastic_resp=json.loads(f.read())
 
-    responses.add(responses.GET, 'http://localhost:9200/fatcat_release/_search',
-        json=elastic_resp, status=200)
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp)),
+    ]
 
     rv = app.get('/release/search?q=blood')
     assert rv.status_code == 200
     assert b"Showing" in rv.data
     assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data
 
-@pytest.mark.skip
-@responses.activate
-def test_container_search(app):
+def test_container_search(app, mocker):
 
     with open('tests/files/elastic_container_search.json') as f:
         elastic_resp=json.loads(f.read())
 
-    responses.add(responses.GET, 'http://localhost:9200/fatcat_container/_search',
-        json=elastic_resp, status=200)
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp)),
+    ]
 
     rv = app.get('/container/search?q=blood')
     assert rv.status_code == 200
@@ -62,39 +65,35 @@ elastic_resp3 = {
     'took': 0
 }
 
-@responses.activate
-def test_stats(app):
-
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_release/_search?request_cache=true',
-        json=elastic_resp1.copy(), status=200)
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_release/_search?request_cache=true',
-        json=elastic_resp2.copy(), status=200)
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_container/_search?request_cache=true',
-        json=elastic_resp3.copy(), status=200)
+def test_stats(app, mocker):
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp1)),
+        (200, {}, json.dumps(elastic_resp2)),
+        (200, {}, json.dumps(elastic_resp3)),
+    ]
+
     rv = app.get('/stats')
     assert rv.status_code == 200
-    # TODO: robe these responses better
-
-@responses.activate
-def test_stats_json(app):
-
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_release/_search?request_cache=true',
-        json=elastic_resp1.copy(), status=200)
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_release/_search?request_cache=true',
-        json=elastic_resp2.copy(), status=200)
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_container/_search?request_cache=true',
-        json=elastic_resp3.copy(), status=200)
+    assert b"80,578,584" in rv.data
+
+def test_stats_json(app, mocker):
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp1)),
+        (200, {}, json.dumps(elastic_resp2)),
+        (200, {}, json.dumps(elastic_resp3)),
+    ]
+
     rv = app.get('/stats.json')
     assert rv.status_code == 200
+    assert rv.json['papers']['in_kbart'] == 51594200
+    assert rv.json['release']['refs_total'] == 8031459
 
-@responses.activate
-def test_container_stats(app):
+@pytest.mark.skip
+def test_container_stats(app, mocker):
 
     elastic_resp = {
         'timed_out': False,
@@ -108,9 +107,10 @@ def test_container_stats(app):
         'took': 50
     }
 
-    responses.add(responses.GET,
-        'http://localhost:9200/fatcat_release/_search?request_cache=true',
-        json=elastic_resp, status=200)
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp)),
+    ]
     rv = app.get('/container/issnl/1234-5678/stats.json')
-    assert rv.status_code == 200
-    # TODO: probe this response better
+    #print(rv.json)
+    assert rv.status_code == 201
-- 
cgit v1.2.3


From 94dc508bc54399027c3e2cff0f21e41250c81d89 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 2 Jul 2020 19:14:13 -0700
Subject: finish backend refactoring of search code

---
 python/fatcat_web/search.py | 241 ++++++++++++++++++++------------------------
 python/tests/web_search.py  |  79 ++++++++++++++-
 2 files changed, 185 insertions(+), 135 deletions(-)

diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 5baa8497..ca270110 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -9,7 +9,6 @@ import datetime
 from dataclasses import dataclass
 from typing import List, Optional, Any
 
-import requests
 from flask import abort, flash
 import elasticsearch
 from elasticsearch_dsl import Search, Q
@@ -218,7 +217,6 @@ def do_release_search(
 
     for h in results:
         # Ensure 'contrib_names' is a list, not a single string
-        print(h, file=sys.stderr)
         if type(h['contrib_names']) is not list:
             h['contrib_names'] = [h['contrib_names'], ]
         h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
@@ -233,6 +231,30 @@ def do_release_search(
         results=results,
     )
 
+def get_elastic_container_random_releases(ident, limit=5):
+    """
+    Returns a list of releases from the container.
+    """
+
+    assert limit > 0 and limit <= 100
+
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.query(
+        'bool',
+        must=[
+            Q('term', container_id=ident),
+            Q('range', release_year={ "lte": datetime.datetime.today().year }),
+        ]
+    )
+    search = search.sort('-in_web', '-release_date')
+    search = search.params(request_cache=True)
+    search = search[:int(limit)]
+
+    resp = wrap_es_execution(search)
+    results = results_to_dict(resp)
+
+    return results
+
 def get_elastic_entity_stats():
     """
     TODO: files, filesets, webcaptures (no schema yet)
@@ -246,10 +268,8 @@ def get_elastic_entity_stats():
     stats = {}
 
     # release totals
-    search = Search(
-        using=app.es_client,
-        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
-        .extra(request_cache=True)
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache=True)
     search.aggs.bucket(
         'release_ref_count',
         'sum',
@@ -257,27 +277,25 @@ def get_elastic_entity_stats():
     )
     search = search[:0]  # pylint: disable=unsubscriptable-object
 
-    # NOTE: not catching exceptions
-    resp = search.execute()
+    resp = wrap_es_execution(search)
+
     stats['release'] = {
         "total": int(resp.hits.total),
         "refs_total": int(resp.aggregations.release_ref_count.value),
     }
 
     # paper counts
-    search = Search(
-        using=app.es_client,
-        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
-        .query(
-            'terms',
-            release_type=[
-                "article-journal",
-                "paper-conference",
-                # "chapter",
-                # "thesis",
-            ],
-        ) \
-        .extra(request_cache=True)
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.query(
+        'terms',
+        release_type=[
+            "article-journal",
+            "paper-conference",
+            # "chapter",
+            # "thesis",
+        ],
+    )
+    search = search.params(request_cache=True)
     search.aggs.bucket(
         'paper_like',
         'filters',
@@ -293,8 +311,7 @@ def get_elastic_entity_stats():
     )
     search = search[:0]
 
-    # NOTE: not catching exceptions
-    resp = search.execute()
+    resp = wrap_es_execution(search)
     buckets = resp.aggregations.paper_like.buckets
     stats['papers'] = {
         'total': resp.hits.total,
@@ -305,10 +322,8 @@ def get_elastic_entity_stats():
     }
 
     # container counts
-    search = Search(
-        using=app.es_client,
-        index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) \
-        .extra(request_cache=True)
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
+    search = search.params(request_cache=True)
     search.aggs.bucket(
         'release_ref_count',
         'sum',
@@ -316,8 +331,7 @@ def get_elastic_entity_stats():
     )
     search = search[:0]  # pylint: disable=unsubscriptable-object
 
-    # NOTE: not catching exceptions
-    resp = search.execute()
+    resp = wrap_es_execution(search)
     stats['container'] = {
         "total": resp.hits.total,
     }
@@ -335,30 +349,36 @@ def get_elastic_container_stats(ident, issnl=None):
         preserved
     """
 
-    query = {
-        "size": 0,
-        "query": {
-            "term": { "container_id": ident }
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache=True)
+    search = search.query(
+        'term',
+        container_id=ident,
+    )
+    search.aggs.bucket(
+        'container_stats',
+        'filters',
+        filters={
+            "in_web": {
+                "term": { "in_web": True },
+            },
+            "in_kbart": {
+                "term": { "in_kbart": True },
+            },
+            "is_preserved": {
+                "term": { "is_preserved": True },
+            },
         },
-        "aggs": { "container_stats": { "filters": { "filters": {
-                "in_web": { "term": { "in_web": "true" } },
-                "in_kbart": { "term": { "in_kbart": "true" } },
-                "is_preserved": { "term": { "is_preserved": "true" } },
-        }}}}
-    }
-    resp = requests.get(
-        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    # TODO: abort()
-    #print(resp.json())
-    resp.raise_for_status()
-    resp = resp.json()
-    buckets = resp['aggregations']['container_stats']['buckets']
+    )
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.container_stats.buckets
     stats = {
         'ident': ident,
         'issnl': issnl,
-        'total': resp['hits']['total'],
+        'total': resp.hits.total,
         'in_web': buckets['in_web']['doc_count'],
         'in_kbart': buckets['in_kbart']['doc_count'],
         'is_preserved': buckets['is_preserved']['doc_count'],
@@ -366,39 +386,6 @@ def get_elastic_container_stats(ident, issnl=None):
 
     return stats
 
-def get_elastic_container_random_releases(ident, limit=5):
-    """
-    Returns a list of releases from the container.
-    """
-
-    assert limit > 0 and limit <= 100
-
-    search = Search(using=app.es_client, index=app.conf.ELASTICSEARCH_RELEASE_INDEX) \
-        .query('bool',
-            must=[
-                Q('term', container_id=ident),
-                Q('range', release_year={ "lte": datetime.datetime.today().year }),
-            ]
-        ) \
-        .sort('-in_web', '-release_date') \
-        .extra(request_cache=True)
-
-    search = search[:int(limit)]
-
-    resp = search.execute()
-
-    hits = [dict(h.source) for h in resp]
-
-    for h in hits:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-
-    return hits
-
 def get_elastic_container_histogram(ident):
     """
     Fetches a stacked histogram
@@ -409,58 +396,46 @@ def get_elastic_container_histogram(ident):
         (year, in_ia, count)
     """
 
-    query = {
-        "aggs": {
-            "year_in_ia": {
-                "composite": {
-                    "size": 1000,
-                    "sources": [
-                        {"year": {
-                            "histogram": {
-                                "field": "release_year",
-                                "interval": 1,
-                        }}},
-                        {"in_ia": {
-                            "terms": {
-                                "field": "in_ia",
-                        }}},
-                    ],
+    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+    search = search.params(request_cache='true')
+    search = search.query(
+        'bool',
+        must=[
+            Q("range", release_year={
+                "gte": datetime.datetime.today().year - 499,
+                "lte": datetime.datetime.today().year,
+            }),
+        ],
+        filter=[
+            Q("bool", minimum_should_match=1, should=[
+                Q("match", container_id=ident),
+            ]),
+        ],
+    )
+    search.aggs.bucket(
+        'year_in_ia',
+        'composite',
+        size=1000,
+        sources=[
+            {"year": {
+                "histogram": {
+                    "field": "release_year",
+                    "interval": 1,
                 },
-            },
-        },
-        "size": 0,
-        "query": {
-            "bool": {
-                "must": [{
-                    "range": {
-                        "release_year": {
-                            "gte": datetime.datetime.today().year - 499,
-                            "lte": datetime.datetime.today().year,
-                        }
-                    }
-                }],
-                "filter": [{
-                    "bool": {
-                        "should": [{
-                            "match": {
-                                "container_id": ident
-                            }
-                        }],
-                        "minimum_should_match": 1,
-                    },
-                }],
-            }
-        }
-    }
-    resp = requests.get(
-        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
-        json=query,
-        params=dict(request_cache="true"))
-    resp.raise_for_status()
-    # TODO: abort()
-    resp = resp.json()
-    #print(resp)
+            }},
+            {"in_ia": {
+                "terms": {
+                    "field": "in_ia",
+                },
+            }},
+        ],
+    )
+    search = search[:0]
+
+    resp = wrap_es_execution(search)
+
+    buckets = resp.aggregations.year_in_ia.buckets
     vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count'])
-            for h in resp['aggregations']['year_in_ia']['buckets']]
+            for h in buckets]
     vals = sorted(vals)
     return vals
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 07985e04..460f5ee2 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -2,6 +2,7 @@
 import json
 import pytest
 
+from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram
 from fatcat_openapi_client.rest import ApiException
 from fixtures import *
 
@@ -38,6 +39,20 @@ def test_container_search(app, mocker):
     assert b"European Instructional Course Lectures" in rv.data
     assert b"British Editorial Society of Bone and Joint Surger" in rv.data
 
+def test_random_releases(app, mocker):
+
+    with open('tests/files/elastic_release_search.json') as f:
+        elastic_resp=json.loads(f.read())
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp)),
+    ]
+
+    resp = get_elastic_container_random_releases("123")
+    assert len(resp) >= 1
+
+
 elastic_resp1 = {
     'timed_out': False,
     'aggregations': {
@@ -92,7 +107,6 @@ def test_stats_json(app, mocker):
     assert rv.json['papers']['in_kbart'] == 51594200
     assert rv.json['release']['refs_total'] == 8031459
 
-@pytest.mark.skip
 def test_container_stats(app, mocker):
 
     elastic_resp = {
@@ -110,7 +124,68 @@ def test_container_stats(app, mocker):
     es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
     es_raw.side_effect = [
         (200, {}, json.dumps(elastic_resp)),
+        (200, {}, json.dumps(elastic_resp)),
     ]
     rv = app.get('/container/issnl/1234-5678/stats.json')
     #print(rv.json)
-    assert rv.status_code == 201
+    assert rv.status_code == 200
+
+    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json')
+    assert rv.status_code == 200
+
+def test_container_coverage(app, mocker):
+
+    elastic_resp1 = {
+        'timed_out': False,
+        'aggregations': {
+            'container_stats': {'buckets': {
+              'is_preserved': {'doc_count': 461939},
+              'in_kbart': {'doc_count': 461939},
+              'in_web': {'doc_count': 2797}}}},
+        'hits': {'total': 461939, 'hits': [], 'max_score': 0.0},
+        '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0},
+        'took': 50
+    }
+
+    elastic_resp2 = {
+        'took': 294,
+        'timed_out': False,
+        '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
+        'hits': {'total': 4327, 'max_score': 0.0, 'hits': []},
+        'aggregations': {'year_in_ia': {
+            'after_key': {'year': 2020.0, 'in_ia': True},
+            'buckets': [
+                {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4},
+                {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68},
+                {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26},
+                {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428},
+                {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14},
+                {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487},
+                {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13},
+                {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345},
+            ],
+        }},
+    }
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp1)),
+    ]
+
+    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage')
+    assert rv.status_code == 200
+
+    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp2)),
+    ]
+
+    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json')
+    assert rv.status_code == 200
+
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp2)),
+    ]
+
+    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg')
+    assert rv.status_code == 200
-- 
cgit v1.2.3


From d798ee172294de09ab1621530df4e3498a17640e Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 23 Jul 2020 15:02:27 -0700
Subject: small lint fixes

---
 python/fatcat_web/search.py | 3 +--
 python/tests/web_search.py  | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index ca270110..1165a004 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -9,7 +9,6 @@ import datetime
 from dataclasses import dataclass
 from typing import List, Optional, Any
 
-from flask import abort, flash
 import elasticsearch
 from elasticsearch_dsl import Search, Q
 import elasticsearch_dsl.response
@@ -43,7 +42,7 @@ class ReleaseQuery:
         offset = max(0, int(offset)) if offset.isnumeric() else 0
 
         return ReleaseQuery(
-            q=query_str,        
+            q=query_str,
             offset=offset,
             fulltext_only=bool(args.get('fulltext_only')),
             container_id=container_id,
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 460f5ee2..55e90d56 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -27,7 +27,6 @@ def test_container_search(app, mocker):
     with open('tests/files/elastic_container_search.json') as f:
         elastic_resp=json.loads(f.read())
 
-
     es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
     es_raw.side_effect = [
         (200, {}, json.dumps(elastic_resp)),
-- 
cgit v1.2.3


From 7010abf54fae6a04f4a0700651e64a1fe5b5b2c8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 23 Jul 2020 16:10:40 -0700
Subject: re-order search params to satisfy pylint

Moved all the request_cache=True param calls to just before ES request
execution. The former ordering "just worked", but pylint didn't like
it, and I suppose it was not as idiomatic as it should have been.
---
 python/fatcat_web/search.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 1165a004..55caa9c5 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -246,9 +246,9 @@ def get_elastic_container_random_releases(ident, limit=5):
         ]
     )
     search = search.sort('-in_web', '-release_date')
-    search = search.params(request_cache=True)
     search = search[:int(limit)]
 
+    search = search.params(request_cache=True)
     resp = wrap_es_execution(search)
     results = results_to_dict(resp)
 
@@ -268,7 +268,6 @@ def get_elastic_entity_stats():
 
     # release totals
     search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
-    search = search.params(request_cache=True)
     search.aggs.bucket(
         'release_ref_count',
         'sum',
@@ -276,6 +275,7 @@ def get_elastic_entity_stats():
     )
     search = search[:0]  # pylint: disable=unsubscriptable-object
 
+    search = search.params(request_cache=True)
     resp = wrap_es_execution(search)
 
     stats['release'] = {
@@ -294,7 +294,6 @@ def get_elastic_entity_stats():
             # "thesis",
         ],
     )
-    search = search.params(request_cache=True)
     search.aggs.bucket(
         'paper_like',
         'filters',
@@ -310,6 +309,7 @@ def get_elastic_entity_stats():
     )
     search = search[:0]
 
+    search = search.params(request_cache=True)
     resp = wrap_es_execution(search)
     buckets = resp.aggregations.paper_like.buckets
     stats['papers'] = {
@@ -322,7 +322,6 @@ def get_elastic_entity_stats():
 
     # container counts
     search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
-    search = search.params(request_cache=True)
     search.aggs.bucket(
         'release_ref_count',
         'sum',
@@ -330,6 +329,7 @@ def get_elastic_entity_stats():
     )
     search = search[:0]  # pylint: disable=unsubscriptable-object
 
+    search = search.params(request_cache=True)
     resp = wrap_es_execution(search)
     stats['container'] = {
         "total": resp.hits.total,
@@ -349,7 +349,6 @@ def get_elastic_container_stats(ident, issnl=None):
     """
 
     search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
-    search = search.params(request_cache=True)
     search = search.query(
         'term',
         container_id=ident,
@@ -371,6 +370,7 @@ def get_elastic_container_stats(ident, issnl=None):
     )
     search = search[:0]
 
+    search = search.params(request_cache=True)
     resp = wrap_es_execution(search)
 
     buckets = resp.aggregations.container_stats.buckets
@@ -396,7 +396,6 @@ def get_elastic_container_histogram(ident):
     """
 
     search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
-    search = search.params(request_cache='true')
     search = search.query(
         'bool',
         must=[
@@ -431,6 +430,7 @@ def get_elastic_container_histogram(ident):
     )
     search = search[:0]
 
+    search = search.params(request_cache='true')
     resp = wrap_es_execution(search)
 
     buckets = resp.aggregations.year_in_ia.buckets
-- 
cgit v1.2.3