author     Bryan Newbold <bnewbold@robocracy.org>  2020-07-02 19:14:13 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2020-07-24 10:07:48 -0700
commit     94dc508bc54399027c3e2cff0f21e41250c81d89 (patch)
tree       20dc5eda3185c1425c657112d919f46c985c3816 /python
parent     a1f14f5b5ce087cb4681d46817da2be0777e4220 (diff)
finish backend refactoring of search code
Diffstat (limited to 'python')
-rw-r--r--  python/fatcat_web/search.py  | 241
-rw-r--r--  python/tests/web_search.py   |  79
2 files changed, 185 insertions, 135 deletions
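
Note: the refactor below replaces hand-built JSON query dicts sent via
requests with elasticsearch_dsl Search objects, and routes all execution
through a shared wrap_es_execution() helper defined elsewhere in
fatcat_web/search.py (not shown in this diff). A minimal sketch of what
such a wrapper plausibly looks like, assuming it converts elasticsearch-py
exceptions into flask HTTP errors:

    import sys

    import elasticsearch
    from flask import abort, flash

    def wrap_es_execution(search):
        """Execute a Search object, mapping ES errors onto HTTP errors."""
        try:
            resp = search.execute()
        except elasticsearch.exceptions.RequestError as e:
            # "user" errors: malformed query strings and the like
            print("elasticsearch 400: {}".format(e.info), file=sys.stderr)
            flash("Search query failed to parse")
            abort(e.status_code)
        except elasticsearch.exceptions.TransportError as e:
            # all other failures: cluster down, timeouts, etc.
            print("elasticsearch non-200: {}".format(e.info), file=sys.stderr)
            abort(503)
        return resp
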
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 5baa8497..ca270110 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -9,7 +9,6 @@ import datetime
from dataclasses import dataclass
from typing import List, Optional, Any
-import requests
from flask import abort, flash
import elasticsearch
from elasticsearch_dsl import Search, Q
@@ -218,7 +217,6 @@ def do_release_search(
for h in results:
# Ensure 'contrib_names' is a list, not a single string
- print(h, file=sys.stderr)
if type(h['contrib_names']) is not list:
h['contrib_names'] = [h['contrib_names'], ]
h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
@@ -233,6 +231,30 @@ def do_release_search(
results=results,
)
+def get_elastic_container_random_releases(ident, limit=5):
+ """
+ Returns a list of releases from the container.
+ """
+
+ assert limit > 0 and limit <= 100
+
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'bool',
+ must=[
+ Q('term', container_id=ident),
+ Q('range', release_year={ "lte": datetime.datetime.today().year }),
+ ]
+ )
+ search = search.sort('-in_web', '-release_date')
+ search = search.params(request_cache=True)
+ search = search[:int(limit)]
+
+ resp = wrap_es_execution(search)
+ results = results_to_dict(resp)
+
+ return results
+
def get_elastic_entity_stats():
"""
TODO: files, filesets, webcaptures (no schema yet)
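
Note: results_to_dict() is likewise assumed from elsewhere in the module. A
minimal sketch, presuming it flattens elasticsearch_dsl hits into plain
dicts and absorbs the surrogate-string cleanup that the old implementation
(removed further down in this diff) did inline:

    from typing import Any, List

    def results_to_dict(response: Any) -> List[dict]:
        """Flatten elasticsearch_dsl hits into plain dicts."""
        results = []
        for hit in response:
            h = hit.to_dict()
            # scrub surrogate strings that elasticsearch sometimes returns
            # from mangled pipelines ("Crimes against Unicode" workaround)
            for key in h:
                if isinstance(h[key], str):
                    h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
            results.append(h)
        return results
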
@@ -246,10 +268,8 @@ def get_elastic_entity_stats():
stats = {}
# release totals
- search = Search(
- using=app.es_client,
- index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
- .extra(request_cache=True)
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.params(request_cache=True)
search.aggs.bucket(
'release_ref_count',
'sum',
@@ -257,27 +277,25 @@ def get_elastic_entity_stats():
)
search = search[:0] # pylint: disable=unsubscriptable-object
- # NOTE: not catching exceptions
- resp = search.execute()
+ resp = wrap_es_execution(search)
+
stats['release'] = {
"total": int(resp.hits.total),
"refs_total": int(resp.aggregations.release_ref_count.value),
}
# paper counts
- search = Search(
- using=app.es_client,
- index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \
- .query(
- 'terms',
- release_type=[
- "article-journal",
- "paper-conference",
- # "chapter",
- # "thesis",
- ],
- ) \
- .extra(request_cache=True)
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.query(
+ 'terms',
+ release_type=[
+ "article-journal",
+ "paper-conference",
+ # "chapter",
+ # "thesis",
+ ],
+ )
+ search = search.params(request_cache=True)
search.aggs.bucket(
'paper_like',
'filters',
@@ -293,8 +311,7 @@ def get_elastic_entity_stats():
)
search = search[:0]
- # NOTE: not catching exceptions
- resp = search.execute()
+ resp = wrap_es_execution(search)
buckets = resp.aggregations.paper_like.buckets
stats['papers'] = {
'total': resp.hits.total,
@@ -305,10 +322,8 @@ def get_elastic_entity_stats():
}
# container counts
- search = Search(
- using=app.es_client,
- index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) \
- .extra(request_cache=True)
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
+ search = search.params(request_cache=True)
search.aggs.bucket(
'release_ref_count',
'sum',
@@ -316,8 +331,7 @@ def get_elastic_entity_stats():
)
search = search[:0] # pylint: disable=unsubscriptable-object
- # NOTE: not catching exceptions
- resp = search.execute()
+ resp = wrap_es_execution(search)
stats['container'] = {
"total": resp.hits.total,
}
@@ -335,30 +349,36 @@ def get_elastic_container_stats(ident, issnl=None):
preserved
"""
- query = {
- "size": 0,
- "query": {
- "term": { "container_id": ident }
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.params(request_cache=True)
+ search = search.query(
+ 'term',
+ container_id=ident,
+ )
+ search.aggs.bucket(
+ 'container_stats',
+ 'filters',
+ filters={
+ "in_web": {
+ "term": { "in_web": True },
+ },
+ "in_kbart": {
+ "term": { "in_kbart": True },
+ },
+ "is_preserved": {
+ "term": { "is_preserved": True },
+ },
},
- "aggs": { "container_stats": { "filters": { "filters": {
- "in_web": { "term": { "in_web": "true" } },
- "in_kbart": { "term": { "in_kbart": "true" } },
- "is_preserved": { "term": { "is_preserved": "true" } },
- }}}}
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- # TODO: abort()
- #print(resp.json())
- resp.raise_for_status()
- resp = resp.json()
- buckets = resp['aggregations']['container_stats']['buckets']
+ )
+ search = search[:0]
+
+ resp = wrap_es_execution(search)
+
+ buckets = resp.aggregations.container_stats.buckets
stats = {
'ident': ident,
'issnl': issnl,
- 'total': resp['hits']['total'],
+ 'total': resp.hits.total,
'in_web': buckets['in_web']['doc_count'],
'in_kbart': buckets['in_kbart']['doc_count'],
'is_preserved': buckets['is_preserved']['doc_count'],
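
Note: a quick (hypothetical, not part of this commit) way to confirm the DSL
port above is equivalent to the removed hand-written query is to serialize
the request body the client will actually send:

    import json

    # Search.to_dict() returns the request body as a plain dict; it should
    # line up with the removed literal: a term query on container_id plus
    # the "container_stats" filters aggregation.
    print(json.dumps(search.to_dict(), indent=2, sort_keys=True))
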
@@ -366,39 +386,6 @@ def get_elastic_container_stats(ident, issnl=None):
return stats
-def get_elastic_container_random_releases(ident, limit=5):
- """
- Returns a list of releases from the container.
- """
-
- assert limit > 0 and limit <= 100
-
- search = Search(using=app.es_client, index=app.conf.ELASTICSEARCH_RELEASE_INDEX) \
- .query('bool',
- must=[
- Q('term', container_id=ident),
- Q('range', release_year={ "lte": datetime.datetime.today().year }),
- ]
- ) \
- .sort('-in_web', '-release_date') \
- .extra(request_cache=True)
-
- search = search[:int(limit)]
-
- resp = search.execute()
-
- hits = [dict(h.source) for h in resp]
-
- for h in hits:
- # Handle surrogate strings that elasticsearch returns sometimes,
- # probably due to mangled data processing in some pipeline.
- # "Crimes against Unicode"; production workaround
- for key in h:
- if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-
- return hits
-
def get_elastic_container_histogram(ident):
"""
Fetches a stacked histogram
@@ -409,58 +396,46 @@ def get_elastic_container_histogram(ident):
(year, in_ia, count)
"""
- query = {
- "aggs": {
- "year_in_ia": {
- "composite": {
- "size": 1000,
- "sources": [
- {"year": {
- "histogram": {
- "field": "release_year",
- "interval": 1,
- }}},
- {"in_ia": {
- "terms": {
- "field": "in_ia",
- }}},
- ],
+ search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
+ search = search.params(request_cache='true')
+ search = search.query(
+ 'bool',
+ must=[
+ Q("range", release_year={
+ "gte": datetime.datetime.today().year - 499,
+ "lte": datetime.datetime.today().year,
+ }),
+ ],
+ filter=[
+ Q("bool", minimum_should_match=1, should=[
+ Q("match", container_id=ident),
+ ]),
+ ],
+ )
+ search.aggs.bucket(
+ 'year_in_ia',
+ 'composite',
+ size=1000,
+ sources=[
+ {"year": {
+ "histogram": {
+ "field": "release_year",
+ "interval": 1,
},
- },
- },
- "size": 0,
- "query": {
- "bool": {
- "must": [{
- "range": {
- "release_year": {
- "gte": datetime.datetime.today().year - 499,
- "lte": datetime.datetime.today().year,
- }
- }
- }],
- "filter": [{
- "bool": {
- "should": [{
- "match": {
- "container_id": ident
- }
- }],
- "minimum_should_match": 1,
- },
- }],
- }
- }
- }
- resp = requests.get(
- "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']),
- json=query,
- params=dict(request_cache="true"))
- resp.raise_for_status()
- # TODO: abort()
- resp = resp.json()
- #print(resp)
+ }},
+ {"in_ia": {
+ "terms": {
+ "field": "in_ia",
+ },
+ }},
+ ],
+ )
+ search = search[:0]
+
+ resp = wrap_es_execution(search)
+
+ buckets = resp.aggregations.year_in_ia.buckets
vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count'])
- for h in resp['aggregations']['year_in_ia']['buckets']]
+ for h in buckets]
vals = sorted(vals)
return vals
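
Note: get_elastic_container_histogram() returns sorted (year, in_ia, count)
tuples, matching the fixture in the test diff below. A hypothetical consumer
(the ident here is a placeholder):

    # vals is sorted, e.g.:
    #   [(2004, False, 4), (2004, True, 68), (2005, False, 26), ...]
    vals = get_elastic_container_histogram("aaaaaaaaaaaaaeiraaaaaaaaam")
    in_ia_by_year = {year: count for (year, in_ia, count) in vals if in_ia}
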
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 07985e04..460f5ee2 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -2,6 +2,7 @@
import json
import pytest
+from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram
from fatcat_openapi_client.rest import ApiException
from fixtures import *
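
Note: the tests below stub the HTTP transport directly. elasticsearch-py's
Urllib3HttpConnection.perform_request() returns (status_code, headers,
raw_body) tuples, so each queued side_effect entry answers exactly one ES
round-trip; that is why test_container_stats queues the canned response
twice, once per stats.json request. The generic pattern (resp1/resp2 are
placeholder dicts):

    es_raw = mocker.patch(
        'elasticsearch.connection.Urllib3HttpConnection.perform_request')
    es_raw.side_effect = [
        (200, {}, json.dumps(resp1)),  # consumed by the first ES query
        (200, {}, json.dumps(resp2)),  # consumed by the second ES query
    ]
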
@@ -38,6 +39,20 @@ def test_container_search(app, mocker):
assert b"European Instructional Course Lectures" in rv.data
assert b"British Editorial Society of Bone and Joint Surger" in rv.data
+def test_random_releases(app, mocker):
+
+ with open('tests/files/elastic_release_search.json') as f:
+ elastic_resp=json.loads(f.read())
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ ]
+
+ resp = get_elastic_container_random_releases("123")
+ assert len(resp) >= 1
+
+
elastic_resp1 = {
'timed_out': False,
'aggregations': {
@@ -92,7 +107,6 @@ def test_stats_json(app, mocker):
assert rv.json['papers']['in_kbart'] == 51594200
assert rv.json['release']['refs_total'] == 8031459
-@pytest.mark.skip
def test_container_stats(app, mocker):
elastic_resp = {
@@ -110,7 +124,68 @@ def test_container_stats(app, mocker):
es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
es_raw.side_effect = [
(200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
]
rv = app.get('/container/issnl/1234-5678/stats.json')
#print(rv.json)
- assert rv.status_code == 201
+ assert rv.status_code == 200
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json')
+ assert rv.status_code == 200
+
+def test_container_coverage(app, mocker):
+
+ elastic_resp1 = {
+ 'timed_out': False,
+ 'aggregations': {
+ 'container_stats': {'buckets': {
+ 'is_preserved': {'doc_count': 461939},
+ 'in_kbart': {'doc_count': 461939},
+ 'in_web': {'doc_count': 2797}}}},
+ 'hits': {'total': 461939, 'hits': [], 'max_score': 0.0},
+ '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0},
+ 'took': 50
+ }
+
+ elastic_resp2 = {
+ 'took': 294,
+ 'timed_out': False,
+ '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
+ 'hits': {'total': 4327, 'max_score': 0.0, 'hits': []},
+ 'aggregations': {'year_in_ia': {
+ 'after_key': {'year': 2020.0, 'in_ia': True},
+ 'buckets': [
+ {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4},
+ {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68},
+ {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26},
+ {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428},
+ {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14},
+ {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487},
+ {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13},
+ {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345},
+ ],
+ }},
+ }
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp1)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage')
+ assert rv.status_code == 200
+
+ es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp2)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json')
+ assert rv.status_code == 200
+
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp2)),
+ ]
+
+ rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg')
+ assert rv.status_code == 200
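
Note: the updated tests would presumably be run from the python/ directory
(the fatcat python code is managed with pipenv), e.g.:

    pipenv run pytest tests/web_search.py
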