aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2020-07-24 17:25:35 +0000
committer: bnewbold <bnewbold@archive.org> 2020-07-24 17:25:35 +0000
commit: 655de9edfb4fab6c861332bf60e02adf67ebfac6 (patch)
tree: 5053f9790f5adbab75c010bed988fe3c09838785
parent: 8b00843af1366cf019c896057706ace4afff27ba (diff)
parent: 7010abf54fae6a04f4a0700651e64a1fe5b5b2c8 (diff)
download: fatcat-655de9edfb4fab6c861332bf60e02adf67ebfac6.tar.gz
fatcat-655de9edfb4fab6c861332bf60e02adf67ebfac6.zip
Merge branch 'bnewbold-es-refactor' into 'master'
web ES search refactor See merge request webgroup/fatcat!68
-rw-r--r-- python/fatcat_web/__init__.py 5
-rw-r--r-- python/fatcat_web/routes.py 44
-rw-r--r-- python/fatcat_web/search.py 619
-rw-r--r-- python/fatcat_web/templates/container_search.html 16
-rw-r--r-- python/fatcat_web/templates/entity_macros.html 10
-rw-r--r-- python/fatcat_web/templates/release_search.html 20
-rw-r--r-- python/tests/fixtures.py 4
-rw-r--r-- python/tests/web_search.py 161
8 files changed, 506 insertions, 373 deletions
diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py
index 562ffeb2..487de58a 100644
--- a/python/fatcat_web/__init__.py
+++ b/python/fatcat_web/__init__.py
@@ -11,6 +11,7 @@ from authlib.flask.client import OAuth
from loginpass import create_flask_blueprint, Gitlab, GitHub, ORCiD
from raven.contrib.flask import Sentry
import fatcat_openapi_client
+import elasticsearch
from fatcat_web.web_config import Config
@@ -71,7 +72,9 @@ mwoauth = MWOAuth(
mwoauth.handshaker.user_agent = "fatcat.wiki;python_web_interface"
app.register_blueprint(mwoauth.bp, url_prefix='/auth/wikipedia')
-from fatcat_web import routes, editing_routes, auth, cors, forms # noqa: E402
+app.es_client = elasticsearch.Elasticsearch(Config.ELASTICSEARCH_BACKEND)
+
+from fatcat_web import routes, editing_routes, auth, cors, forms
# TODO: blocking on ORCID support in loginpass
if Config.ORCID_CLIENT_ID:
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 2489ac03..4a66b3c2 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -14,7 +14,7 @@ from fatcat_tools.normal import *
from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config
from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth
from fatcat_web.cors import crossdomain
-from fatcat_web.search import *
+from fatcat_web.search import ReleaseQuery, GenericQuery, do_release_search, do_container_search, get_elastic_entity_stats, get_elastic_container_stats, get_elastic_container_histogram
from fatcat_web.entity_helpers import *
from fatcat_web.graphics import *
from fatcat_web.kafka import *
@@ -706,44 +706,22 @@ def generic_search():
@app.route('/release/search', methods=['GET', 'POST'])
def release_search():
- query = request.args.get('q')
- if not query:
- query = '*'
- fulltext_only = bool(request.args.get('fulltext_only'))
+ if 'q' not in request.args.keys():
+ return render_template('release_search.html', query=ReleaseQuery(), found=None)
- issnl = request.args.get('container_issnl')
- if issnl and query:
- query += ' container_issnl:"{}"'.format(issnl)
-
- container_id = request.args.get('container_id')
- if container_id and query:
- query += ' container_id:"{}"'.format(container_id)
-
- offset = request.args.get('offset', '0')
- offset = max(0, int(offset)) if offset.isnumeric() else 0
-
- if 'q' in request.args.keys():
- # always do files for HTML
- found = do_release_search(query, fulltext_only=fulltext_only, offset=offset)
- return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only)
- else:
- return render_template('release_search.html', query=query, fulltext_only=fulltext_only)
+ query = ReleaseQuery.from_args(request.args)
+ found = do_release_search(query)
+ return render_template('release_search.html', query=query, found=found)
@app.route('/container/search', methods=['GET', 'POST'])
def container_search():
- query = request.args.get('q')
- if not query:
- query = '*'
- offset = request.args.get('offset', '0')
- offset = max(0, int(offset)) if offset.isnumeric() else 0
+ if 'q' not in request.args.keys():
+ return render_template('container_search.html', query=GenericQuery(), found=None)
- if 'q' in request.args.keys():
- # always do files for HTML
- found = do_container_search(query, offset=offset)
- return render_template('container_search.html', found=found, query=query)
- else:
- return render_template('container_search.html', query=query)
+ query = GenericQuery.from_args(request.args)
+ found = do_container_search(query)
+ return render_template('container_search.html', query=query, found=found)
def get_changelog_stats():
stats = {}
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 4a87c735..55caa9c5 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -2,118 +2,257 @@
"""
Helpers for doing elasticsearch queries (used in the web interface; not part of
the formal API)
-
-TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded
"""
+import sys
import datetime
-import requests
-from flask import abort, flash
+from dataclasses import dataclass
+from typing import List, Optional, Any
+
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+import elasticsearch_dsl.response
+
from fatcat_web import app
-def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):
+@dataclass
+class ReleaseQuery:
+ q: Optional[str] = None
+ limit: Optional[int] = None
+ offset: Optional[int] = None
+ fulltext_only: bool = False
+ container_id: Optional[str] = None
+
+ @classmethod
+ def from_args(cls, args) -> 'ReleaseQuery':
+
+ query_str = args.get('q') or '*'
+
+ container_id = args.get('container_id')
+ # TODO: as filter, not in query string
+ if container_id:
+ query_str += ' container_id:"{}"'.format(container_id)
+
+ # TODO: where are container_issnl queries actually used?
+ issnl = args.get('container_issnl')
+ if issnl and query_str:
+ query_str += ' container_issnl:"{}"'.format(issnl)
+
+ offset = args.get('offset', '0')
+ offset = max(0, int(offset)) if offset.isnumeric() else 0
+
+ return ReleaseQuery(
+ q=query_str,
+ offset=offset,
+ fulltext_only=bool(args.get('fulltext_only')),
+ container_id=container_id,
+ )
+
+@dataclass
+class GenericQuery:
+ q: Optional[str] = None
+ limit: Optional[int] = None
+ offset: Optional[int] = None
+
+ @classmethod
+ def from_args(cls, args) -> 'GenericQuery':
+ query_str = args.get('q')
+ if not query_str:
+ query_str = '*'
+ offset = args.get('offset', '0')
+ offset = max(0, int(offset)) if offset.isnumeric() else 0
+
+ return GenericQuery(
+ q=query_str,
+ offset=offset,
+ )
+
+@dataclass
+class SearchHits:
+ count_returned: int
+ count_found: int
+ offset: int
+ limit: int
+ deep_page_limit: int
+ query_time_ms: int
+ results: List[Any]
+
+
+def results_to_dict(response: elasticsearch_dsl.response.Response) -> List[dict]:
+ """
+ Takes a response and returns all the hits as JSON objects.
- # Sanity checks
- if limit > 100:
- limit = 100
- if offset < 0:
- offset = 0
- if offset > deep_page_limit:
- # Avoid deep paging problem.
- offset = deep_page_limit
+ Also handles surrogate strings that elasticsearch returns sometimes,
+ probably due to mangled data processing in some pipeline. "Crimes against
+ Unicode"; production workaround
+ """
+
+ results = []
+ for h in response:
+ r = h._d_
+ # print(h.meta._d_)
+ results.append(r)
- request["size"] = int(limit)
- request["from"] = int(offset)
- # print(request)
- resp = requests.get("%s/%s/_search" %
- (app.config['ELASTICSEARCH_BACKEND'], index),
- json=request)
-
- if resp.status_code == 400:
- print("elasticsearch 400: " + str(resp.content))
- flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
- abort(resp.status_code)
- elif resp.status_code != 200:
- print("elasticsearch non-200 status code: " + str(resp.status_code))
- print(resp.content)
- abort(resp.status_code)
-
- content = resp.json()
- results = [h['_source'] for h in content['hits']['hits']]
for h in results:
- # Handle surrogate strings that elasticsearch returns sometimes,
- # probably due to mangled data processing in some pipeline.
- # "Crimes against Unicode"; production workaround
for key in h:
if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+ h[key] = h[key].encode("utf8", "ignore").decode("utf8")
+ return results
- return {"count_returned": len(results),
- "count_found": content['hits']['total'],
- "results": results,
- "offset": offset,
- "deep_page_limit": deep_page_limit}
+def wrap_es_execution(search: Search) -> Any:
+ """
+ Executes a Search object, and converts various ES error types into
+ something we can pretty print to the user.
+ """
+ try:
+ resp = search.execute()
+ except elasticsearch.exceptions.RequestError as e:
+ # this is a "user" error
+ print("elasticsearch 400: " + str(e.info), file=sys.stderr)
+ if e.info.get("error", {}).get("root_cause", {}):
+ raise ValueError(str(e.info["error"]["root_cause"][0].get("reason")))
+ else:
+ raise ValueError(str(e.info))
+ except elasticsearch.exceptions.TransportError as e:
+ # all other errors
+ print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr)
+ raise IOError(str(e.info))
+ return resp
+def do_container_search(
+ query: GenericQuery, deep_page_limit: int = 2000
+) -> SearchHits:
-def do_release_search(q, limit=30, fulltext_only=True, offset=0):
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
- #print("Search hit: " + q)
- if limit > 100:
- # Sanity check
- limit = 100
+ search = search.query(
+ "query_string",
+ query=query.q,
+ default_operator="AND",
+ analyze_wildcard=True,
+ allow_leading_wildcard=False,
+ lenient=True,
+ fields=["biblio"],
+ )
- # Convert raw DOIs to DOI queries
- if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
- q = 'doi:"{}"'.format(q)
+ # Sanity checks
+ limit = min((int(query.limit or 25), 100))
+ offset = max((int(query.offset or 0), 0))
+ if offset > deep_page_limit:
+ # Avoid deep paging problem.
+ offset = deep_page_limit
- if fulltext_only:
- q += " in_web:true"
+ search = search[offset : (offset + limit)]
+
+ resp = wrap_es_execution(search)
+ results = results_to_dict(resp)
+
+ return SearchHits(
+ count_returned=len(results),
+ count_found=int(resp.hits.total),
+ offset=offset,
+ limit=limit,
+ deep_page_limit=deep_page_limit,
+ query_time_ms=int(resp.took),
+ results=results,
+ )
+
+def do_release_search(
+ query: ReleaseQuery, deep_page_limit: int = 2000
+) -> SearchHits:
+
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+
+ # availability filters
+ if query.fulltext_only:
+ search = search.filter("term", in_ia=True)
+
+ # Below, we combine several queries to improve scoring.
+
+ # this query uses the fancy built-in query string parser
+ basic_biblio = Q(
+ "query_string",
+ query=query.q,
+ default_operator="AND",
+ analyze_wildcard=True,
+ allow_leading_wildcard=False,
+ lenient=True,
+ fields=[
+ "title^2",
+ "biblio",
+ ],
+ )
+ has_fulltext = Q("term", in_ia=True)
+ poor_metadata = Q(
+ "bool",
+ should=[
+ # if these fields aren't set, metadata is poor. The more that do
+ # not exist, the stronger the signal.
+ Q("bool", must_not=Q("exists", field="title")),
+ Q("bool", must_not=Q("exists", field="release_year")),
+ Q("bool", must_not=Q("exists", field="release_type")),
+ Q("bool", must_not=Q("exists", field="release_stage")),
+ ],
+ )
- search_request = {
- "query": {
- "query_string": {
- "query": q,
- "default_operator": "AND",
- "analyze_wildcard": True,
- "lenient": True,
- "fields": ["biblio"],
- },
- },
- }
+ search = search.query(
+ "boosting",
+ positive=Q("bool", must=basic_biblio, should=[has_fulltext],),
+ negative=poor_metadata,
+ negative_boost=0.5,
+ )
+
+ # Sanity checks
+ limit = min((int(query.limit or 25), 100))
+ offset = max((int(query.offset or 0), 0))
+ if offset > deep_page_limit:
+ # Avoid deep paging problem.
+ offset = deep_page_limit
- resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request, offset=offset)
- for h in resp['results']:
+ search = search[offset : (offset + limit)]
+
+ resp = wrap_es_execution(search)
+ results = results_to_dict(resp)
+
+ for h in results:
# Ensure 'contrib_names' is a list, not a single string
if type(h['contrib_names']) is not list:
h['contrib_names'] = [h['contrib_names'], ]
h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
- resp["query"] = { "q": q }
- resp["limit"] = limit
- return resp
+ return SearchHits(
+ count_returned=len(results),
+ count_found=int(resp.hits.total),
+ offset=offset,
+ limit=limit,
+ deep_page_limit=deep_page_limit,
+ query_time_ms=int(resp.took),
+ results=results,
+ )
-def do_container_search(q, limit=30, offset=0):
+def get_elastic_container_random_releases(ident, limit=5):
+ """
+ Returns a list of releases from the container.
+ """
- # Convert raw ISSN-L to ISSN-L query
- if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
- q = 'issnl:"{}"'.format(q)
+ assert limit > 0 and limit <= 100
- search_request = {
- "query": {
- "query_string": {
- "query": q,
- "default_operator": "AND",
- "analyze_wildcard": True,
- "lenient": True,
- "fields": ["biblio"],
- },
- },
- }
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'bool',
+ must=[
+ Q('term', container_id=ident),
+ Q('range', release_year={ "lte": datetime.datetime.today().year }),
+ ]
+ )
+ search = search.sort('-in_web', '-release_date')
+ search = search[:int(limit)]
- resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit, offset=offset)
- resp["query"] = { "q": q }
- resp["limit"] = limit
- return resp
+ search = search.params(request_cache=True)
+ resp = wrap_es_execution(search)
+ results = results_to_dict(resp)
+
+ return results
def get_elastic_entity_stats():
"""
@@ -127,85 +266,73 @@ def get_elastic_entity_stats():
stats = {}
- # 2. releases
- # - total count
- # - total citation records
- # - total (paper, chapter, proceeding)
- # - " with fulltext on web
- # - " open access
- # - " not in KBART, in IA
- #
- # Can do the above with two queries:
- # - all releases, aggregate count and sum(ref_count)
- # - in-scope works, aggregate count by (fulltext, OA, kbart/ia)
-
- # 2a. release totals
- query = {
- "size": 0,
- "aggs": {
- "release_ref_count": { "sum": { "field": "ref_count" } }
- }
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- resp.raise_for_status()
- resp = resp.json()
+ # release totals
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search.aggs.bucket(
+ 'release_ref_count',
+ 'sum',
+ field='ref_count',
+ )
+ search = search[:0] # pylint: disable=unsubscriptable-object
+
+ search = search.params(request_cache=True)
+ resp = wrap_es_execution(search)
+
stats['release'] = {
- "total": resp['hits']['total'],
- "refs_total": int(resp['aggregations']['release_ref_count']['value']),
+ "total": int(resp.hits.total),
+ "refs_total": int(resp.aggregations.release_ref_count.value),
}
- # 2b. paper counts
- query = {
- "size": 0,
- "query": {
- "terms": { "release_type": [
- # "chapter", "thesis",
- "article-journal", "paper-conference",
- ] } },
- "aggs": { "paper_like": { "filters": { "filters": {
- "in_web": { "term": { "in_web": "true" } },
- "is_oa": { "term": { "is_oa": "true" } },
- "in_kbart": { "term": { "in_kbart": "true" } },
- "in_web_not_kbart": { "bool": { "filter": [
- { "term": { "in_web": "true" } },
- { "term": { "in_kbart": "false" } }
- ]}}
- }}}}
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- resp.raise_for_status()
- resp = resp.json()
- buckets = resp['aggregations']['paper_like']['buckets']
+ # paper counts
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'terms',
+ release_type=[
+ "article-journal",
+ "paper-conference",
+ # "chapter",
+ # "thesis",
+ ],
+ )
+ search.aggs.bucket(
+ 'paper_like',
+ 'filters',
+ filters={
+ "in_web": { "term": { "in_web": "true" } },
+ "is_oa": { "term": { "is_oa": "true" } },
+ "in_kbart": { "term": { "in_kbart": "true" } },
+ "in_web_not_kbart": { "bool": { "filter": [
+ { "term": { "in_web": "true" } },
+ { "term": { "in_kbart": "false" } },
+ ]}},
+ }
+ )
+ search = search[:0]
+
+ search = search.params(request_cache=True)
+ resp = wrap_es_execution(search)
+ buckets = resp.aggregations.paper_like.buckets
stats['papers'] = {
- 'total': resp['hits']['total'],
- 'in_web': buckets['in_web']['doc_count'],
- 'is_oa': buckets['is_oa']['doc_count'],
- 'in_kbart': buckets['in_kbart']['doc_count'],
- 'in_web_not_kbart': buckets['in_web_not_kbart']['doc_count'],
+ 'total': resp.hits.total,
+ 'in_web': buckets.in_web.doc_count,
+ 'is_oa': buckets.is_oa.doc_count,
+ 'in_kbart': buckets.in_kbart.doc_count,
+ 'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,
}
- # 3. containers
- # => total count
- query = {
- "size": 0,
- }
- resp = requests.get(
- "{}/fatcat_container/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- resp.raise_for_status()
- resp = resp.json()
+ # container counts
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
+ search.aggs.bucket(
+ 'release_ref_count',
+ 'sum',
+ field='ref_count',
+ )
+ search = search[:0] # pylint: disable=unsubscriptable-object
+
+ search = search.params(request_cache=True)
+ resp = wrap_es_execution(search)
stats['container'] = {
- "total": resp['hits']['total'],
+ "total": resp.hits.total,
}
return stats
@@ -221,30 +348,36 @@ def get_elastic_container_stats(ident, issnl=None):
preserved
"""
- query = {
- "size": 0,
- "query": {
- "term": { "container_id": ident }
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'term',
+ container_id=ident,
+ )
+ search.aggs.bucket(
+ 'container_stats',
+ 'filters',
+ filters={
+ "in_web": {
+ "term": { "in_web": True },
+ },
+ "in_kbart": {
+ "term": { "in_kbart": True },
+ },
+ "is_preserved": {
+ "term": { "is_preserved": True },
+ },
},
- "aggs": { "container_stats": { "filters": { "filters": {
- "in_web": { "term": { "in_web": "true" } },
- "in_kbart": { "term": { "in_kbart": "true" } },
- "is_preserved": { "term": { "is_preserved": "true" } },
- }}}}
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- #print(resp.json())
- resp.raise_for_status()
- resp = resp.json()
- buckets = resp['aggregations']['container_stats']['buckets']
+ )
+ search = search[:0]
+
+ search = search.params(request_cache=True)
+ resp = wrap_es_execution(search)
+
+ buckets = resp.aggregations.container_stats.buckets
stats = {
'ident': ident,
'issnl': issnl,
- 'total': resp['hits']['total'],
+ 'total': resp.hits.total,
'in_web': buckets['in_web']['doc_count'],
'in_kbart': buckets['in_kbart']['doc_count'],
'is_preserved': buckets['is_preserved']['doc_count'],
@@ -252,48 +385,6 @@ def get_elastic_container_stats(ident, issnl=None):
return stats
-def get_elastic_container_random_releases(ident, limit=5):
- """
- Returns a list of releases from the container.
- """
-
- assert limit > 0 and limit <= 100
-
- query = {
- "size": int(limit),
- "sort": [
- { "in_web": {"order": "desc"} },
- { "release_date": {"order": "desc"} },
- ],
- "query": {
- "bool": {
- "must": [
- { "term": { "container_id": ident } },
- { "range": { "release_year": { "lte": datetime.datetime.today().year } } },
- ],
- },
- },
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- #print(resp.json())
- resp.raise_for_status()
- resp = resp.json()
- #print(resp)
- hits = [h['_source'] for h in resp['hits']['hits']]
- for h in hits:
- # Handle surrogate strings that elasticsearch returns sometimes,
- # probably due to mangled data processing in some pipeline.
- # "Crimes against Unicode"; production workaround
- for key in h:
- if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-
- return hits
-
def get_elastic_container_histogram(ident):
"""
Fetches a stacked histogram
@@ -304,58 +395,46 @@ def get_elastic_container_histogram(ident):
(year, in_ia, count)
"""
- query = {
- "aggs": {
- "year_in_ia": {
- "composite": {
- "size": 1000,
- "sources": [
- {"year": {
- "histogram": {
- "field": "release_year",
- "interval": 1,
- }}},
- {"in_ia": {
- "terms": {
- "field": "in_ia",
- }}},
- ],
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'bool',
+ must=[
+ Q("range", release_year={
+ "gte": datetime.datetime.today().year - 499,
+ "lte": datetime.datetime.today().year,
+ }),
+ ],
+ filter=[
+ Q("bool", minimum_should_match=1, should=[
+ Q("match", container_id=ident),
+ ]),
+ ],
+ )
+ search.aggs.bucket(
+ 'year_in_ia',
+ 'composite',
+ size=1000,
+ sources=[
+ {"year": {
+ "histogram": {
+ "field": "release_year",
+ "interval": 1,
},
- },
- },
- "size": 0,
- "query": {
- "bool": {
- "must": [{
- "range": {
- "release_year": {
- "gte": datetime.datetime.today().year - 499,
- "lte": datetime.datetime.today().year,
- }
- }
- }],
- "filter": [{
- "bool": {
- "should": [{
- "match": {
- "container_id": ident
- }
- }],
- "minimum_should_match": 1,
- },
- }],
- }
- }
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- resp.raise_for_status()
- # TODO: abort()
- resp = resp.json()
- #print(resp)
+ }},
+ {"in_ia": {
+ "terms": {
+ "field": "in_ia",
+ },
+ }},
+ ],
+ )
+ search = search[:0]
+
+ search = search.params(request_cache='true')
+ resp = wrap_es_execution(search)
+
+ buckets = resp.aggregations.year_in_ia.buckets
vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count'])
- for h in resp['aggregations']['year_in_ia']['buckets']]
+ for h in buckets]
vals = sorted(vals)
return vals
diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html
index 1a804595..2566f542 100644
--- a/python/fatcat_web/templates/container_search.html
+++ b/python/fatcat_web/templates/container_search.html
@@ -2,8 +2,8 @@
{% extends "base.html" %}
{% block title %}
-{% if query %}
- Search: {{ query }}
+{% if query.q %}
+ Search: {{ query.q }}
{% else %}
Release Search
{% endif %}
@@ -18,9 +18,9 @@
<form class="" role="search" action="/container/search" method="get">
<div class="ui form">
<div class="ui action input huge fluid">
- <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button>
+ <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search container metadata"> <button class="ui button">Search</button>
</div>
- <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query or "" }}">releases</a></b>.
+ <br>Can also lookup by <b><a href="/container/lookup">identifier</a></b> or search <b><a href="/release/search?q={{ query.q or "" }}">releases</a></b>.
</div>
</form>
</div>
@@ -32,7 +32,7 @@
{% if found %}
{% if found.results %}
- {{ entity_macros.top_results(found) }}
+ {{ entity_macros.top_results(query, found) }}
{% for entity in found.results %}
<div>
@@ -55,13 +55,13 @@
{% if found.results|length > 8 %}
<div class="ui divider"></div>
<div style="text-align: center">
- {{ entity_macros.bottom_results(found, endpoint='container_search') }}
+ {{ entity_macros.bottom_results(query, found, endpoint='container_search') }}
</div>
{% endif %}
{% else %}
- Raw query was: <i>{{ found.query.q }}</i>
+ Raw query was: <i>{{ query.q }}</i>
<div class="ui centered stackable grid" style="padding-top: 15%;">
<div class="row">
@@ -72,7 +72,7 @@
<h2>No results found!</h2>
<p>You could try elsewhere:</p>
<ul>
- <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li>
+ <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>
</ul>
</div>
</div>
diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html
index c22eb106..0e7f135a 100644
--- a/python/fatcat_web/templates/entity_macros.html
+++ b/python/fatcat_web/templates/entity_macros.html
@@ -262,7 +262,7 @@ yellow
{% endif %}
{%- endmacro %}
-{% macro top_results(found) -%}
+{% macro top_results(query, found) -%}
<i>Showing
{% if found.offset == 0 %}
@@ -278,13 +278,13 @@ yellow
{%- endmacro %}
-{% macro bottom_results(found, endpoint='release_search') -%}
+{% macro bottom_results(query, found, endpoint='release_search') -%}
{% if found.offset > 0 %}
{% if found.offset - found.limit < 0 %}
- <a href="{{ url_for(endpoint, q=found.query.q, offset=0) }}">&#xab; Previous</a>
+ <a href="{{ url_for(endpoint, q=query.q, offset=0) }}">&#xab; Previous</a>
{% else %}
- <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset - found.limit) }}">&#xab; Previous</a>
+ <a href="{{ url_for(endpoint, q=query.q, offset=found.offset - found.limit) }}">&#xab; Previous</a>
{% endif %}
{% else %}
<span style="color:gray">&#xab; Previous</span>
@@ -294,7 +294,7 @@ yellow
found.count_returned }} out of {{ found.count_found }} results</i>&nbsp;&nbsp;
{% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %}
- <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset + found.limit) }}">Next &#xbb;</a>
+ <a href="{{ url_for(endpoint, q=query.q, offset=found.offset + found.limit) }}">Next &#xbb;</a>
{% else %}
<span style="color:gray">Next &#xbb;</span>
{% endif %}
diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html
index a600f1b2..58aa35d6 100644
--- a/python/fatcat_web/templates/release_search.html
+++ b/python/fatcat_web/templates/release_search.html
@@ -2,8 +2,8 @@
{% extends "base.html" %}
{% block title %}
-{% if query %}
- Search: {{ query }}
+{% if query.q %}
+ Search: {{ query.q }}
{% else %}
Release Search
{% endif %}
@@ -18,14 +18,14 @@
<form class="" role="search" action="/release/search" method="get">
<div class="ui form">
<div class="ui action input huge fluid">
- <input type="text" placeholder="Query..." name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search release metadata">
+ <input type="text" placeholder="Query..." name="q" value="{% if query.q %}{{ query.q }}{% endif %}" aria-label="search release metadata">
<button class="ui primary button">Search</button>
</div>
<div class="ui checkbox" style="float: right; margin: 1em;">
<input type="checkbox" name="fulltext_only" id="fulltext_only" value="true" {% if fulltext_only %}checked{% endif %}>
<label for="fulltext_only">Fulltext Available Only</label>
</div>
- <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query or "" }}">containers</a></b> (eg, journals).
+ <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query.q or "" }}">containers</a></b> (eg, journals).
</div>
</form>
</div>
@@ -37,7 +37,7 @@
{% if found %}
{% if found.results %}
- {{ entity_macros.top_results(found) }}
+ {{ entity_macros.top_results(query, found) }}
{% for paper in found.results %}
{{ entity_macros.release_search_result_row(paper) }}
@@ -46,13 +46,13 @@
{% if found.results|length > 8 %}
<div class="ui divider"></div>
<div style="text-align: center">
- {{ entity_macros.bottom_results(found, endpoint='release_search') }}
+ {{ entity_macros.bottom_results(query, found, endpoint='release_search') }}
</div>
{% endif %}
{% else %}
- Raw query was: <i>{{ found.query.q }}</i>
+ Raw query was: <i>{{ query.q }}</i>
<div class="ui centered stackable grid" style="padding-top: 15%;">
<div class="row">
@@ -63,9 +63,9 @@
<h2>No results found!</h2>
<p>You could try elsewhere:</p>
<ul>
- <li>Search <a href="https://dissem.in/search?q={{ found.query.q | urlencode }}">dissem.in</a></li>
- <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ found.query.q | urlencode }}">BASE</a></li>
- <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li>
+ <li>Search <a href="https://dissem.in/search?q={{ query.q | urlencode }}">dissem.in</a></li>
+ <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ query.q | urlencode }}">BASE</a></li>
+ <li>Search <a href="https://scholar.google.com/scholar?q={{ query.q | urlencode }}">Google Scholar</a></li>
</ul>
</div>
</div>
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index 44c7be63..3263f243 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -1,9 +1,10 @@
import pytest
from dotenv import load_dotenv
+import elasticsearch
+
import fatcat_web
import fatcat_openapi_client
-
from fatcat_openapi_client import *
from fatcat_tools import authenticated_api
@@ -13,6 +14,7 @@ def full_app():
fatcat_web.app.testing = True
fatcat_web.app.debug = False
fatcat_web.app.config['WTF_CSRF_ENABLED'] = False
+ fatcat_web.app.es_client = elasticsearch.Elasticsearch("mockbackend")
return fatcat_web.app
@pytest.fixture
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 7647bcf5..55e90d56 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -1,31 +1,36 @@
import json
-import responses
+import pytest
+from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram
+from fatcat_openapi_client.rest import ApiException
from fixtures import *
-@responses.activate
-def test_release_search(app):
+
+def test_release_search(app, mocker):
with open('tests/files/elastic_release_search.json') as f:
elastic_resp=json.loads(f.read())
- responses.add(responses.GET, 'http://localhost:9200/fatcat_release/_search',
- json=elastic_resp, status=200)
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ ]
rv = app.get('/release/search?q=blood')
assert rv.status_code == 200
assert b"Showing" in rv.data
assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data
-@responses.activate
-def test_container_search(app):
+def test_container_search(app, mocker):
with open('tests/files/elastic_container_search.json') as f:
elastic_resp=json.loads(f.read())
- responses.add(responses.GET, 'http://localhost:9200/fatcat_container/_search',
- json=elastic_resp, status=200)
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ ]
rv = app.get('/container/search?q=blood')
assert rv.status_code == 200
@@ -33,6 +38,20 @@ def test_container_search(app):
assert b"European Instructional Course Lectures" in rv.data
assert b"British Editorial Society of Bone and Joint Surger" in rv.data
+def test_random_releases(app, mocker):
+
+ with open('tests/files/elastic_release_search.json') as f:
+ elastic_resp=json.loads(f.read())
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ ]
+
+ resp = get_elastic_container_random_releases("123")
+ assert len(resp) >= 1
+
+
elastic_resp1 = {
'timed_out': False,
'aggregations': {
@@ -60,39 +79,34 @@ elastic_resp3 = {
'took': 0
}
-@responses.activate
-def test_stats(app):
-
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_release/_search?request_cache=true',
- json=elastic_resp1.copy(), status=200)
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_release/_search?request_cache=true',
- json=elastic_resp2.copy(), status=200)
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_container/_search?request_cache=true',
- json=elastic_resp3.copy(), status=200)
+def test_stats(app, mocker):
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp1)),
+ (200, {}, json.dumps(elastic_resp2)),
+ (200, {}, json.dumps(elastic_resp3)),
+ ]
+
rv = app.get('/stats')
assert rv.status_code == 200
- # TODO: robe these responses better
-
-@responses.activate
-def test_stats_json(app):
-
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_release/_search?request_cache=true',
- json=elastic_resp1.copy(), status=200)
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_release/_search?request_cache=true',
- json=elastic_resp2.copy(), status=200)
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_container/_search?request_cache=true',
- json=elastic_resp3.copy(), status=200)
+ assert b"80,578,584" in rv.data
+
+def test_stats_json(app, mocker):
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp1)),
+ (200, {}, json.dumps(elastic_resp2)),
+ (200, {}, json.dumps(elastic_resp3)),
+ ]
+
rv = app.get('/stats.json')
assert rv.status_code == 200
+ assert rv.json['papers']['in_kbart'] == 51594200
+ assert rv.json['release']['refs_total'] == 8031459
-@responses.activate
-def test_container_stats(app):
+def test_container_stats(app, mocker):
elastic_resp = {
'timed_out': False,
@@ -106,14 +120,71 @@ def test_container_stats(app):
'took': 50
}
- responses.add(responses.GET,
- 'http://localhost:9200/fatcat_release/_search?request_cache=true',
- json=elastic_resp, status=200)
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ ]
rv = app.get('/container/issnl/1234-5678/stats.json')
+ #print(rv.json)
+ assert rv.status_code == 200
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json')
+ assert rv.status_code == 200
+
+def test_container_coverage(app, mocker):
+
+ elastic_resp1 = {
+ 'timed_out': False,
+ 'aggregations': {
+ 'container_stats': {'buckets': {
+ 'is_preserved': {'doc_count': 461939},
+ 'in_kbart': {'doc_count': 461939},
+ 'in_web': {'doc_count': 2797}}}},
+ 'hits': {'total': 461939, 'hits': [], 'max_score': 0.0},
+ '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0},
+ 'took': 50
+ }
+
+ elastic_resp2 = {
+ 'took': 294,
+ 'timed_out': False,
+ '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
+ 'hits': {'total': 4327, 'max_score': 0.0, 'hits': []},
+ 'aggregations': {'year_in_ia': {
+ 'after_key': {'year': 2020.0, 'in_ia': True},
+ 'buckets': [
+ {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4},
+ {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68},
+ {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26},
+ {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428},
+ {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14},
+ {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487},
+ {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13},
+ {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345},
+ ],
+ }},
+ }
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp1)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage')
assert rv.status_code == 200
- # TODO: probe this response better
-# TODO: container stats
-# TODO: container ISSN-L query
-# TODO: release DOI query
-# TODO: release fulltext (filter) query
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp2)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json')
+ assert rv.status_code == 200
+
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp2)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg')
+ assert rv.status_code == 200