From b9ba24553e2e1de3c3ac0faeba59231ec512fa67 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Jul 2020 20:27:45 -0700 Subject: refactor release and container search Based on fatcat-scholar refactoring. This doesn't include refactoring of stats, aggregates, or histograms yet, just the direct queries. Don't have any test coverage yet; intend to try elasticmock or figuring out how to ingest mock JSON results directly. --- python/tests/web_search.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'python/tests/web_search.py') diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 7647bcf5..b55b0fcf 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -4,6 +4,7 @@ import responses from fixtures import * +@pytest.mark.skip @responses.activate def test_release_search(app): @@ -18,6 +19,7 @@ def test_release_search(app): assert b"Showing" in rv.data assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data +@pytest.mark.skip @responses.activate def test_container_search(app): @@ -112,8 +114,3 @@ def test_container_stats(app): rv = app.get('/container/issnl/1234-5678/stats.json') assert rv.status_code == 200 # TODO: probe this response better - -# TODO: container stats -# TODO: container ISSN-L query -# TODO: release DOI query -# TODO: release fulltext (filter) query -- cgit v1.2.3 From a1f14f5b5ce087cb4681d46817da2be0777e4220 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 2 Jul 2020 12:12:11 -0700 Subject: update web_search tests to mock ES client Instead of using 'responses' mock of 'requests' library. Tried using 'elasticmock' helper but it didn't work. --- python/tests/fixtures.py | 4 ++- python/tests/web_search.py | 88 +++++++++++++++++++++++----------------------- 2 files changed, 47 insertions(+), 45 deletions(-) (limited to 'python/tests/web_search.py') diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py index 44c7be63..3263f243 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -1,9 +1,10 @@ import pytest from dotenv import load_dotenv +import elasticsearch + import fatcat_web import fatcat_openapi_client - from fatcat_openapi_client import * from fatcat_tools import authenticated_api @@ -13,6 +14,7 @@ def full_app(): fatcat_web.app.testing = True fatcat_web.app.debug = False fatcat_web.app.config['WTF_CSRF_ENABLED'] = False + fatcat_web.app.es_client = elasticsearch.Elasticsearch("mockbackend") return fatcat_web.app @pytest.fixture diff --git a/python/tests/web_search.py b/python/tests/web_search.py index b55b0fcf..07985e04 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -1,33 +1,36 @@ import json -import responses +import pytest +from fatcat_openapi_client.rest import ApiException from fixtures import * -@pytest.mark.skip -@responses.activate -def test_release_search(app): + +def test_release_search(app, mocker): with open('tests/files/elastic_release_search.json') as f: elastic_resp=json.loads(f.read()) - responses.add(responses.GET, 'http://localhost:9200/fatcat_release/_search', - json=elastic_resp, status=200) + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/release/search?q=blood') assert rv.status_code == 200 assert b"Showing" in rv.data assert b"Quantum Studies of Acetylene Adsorption on Ice Surface" in rv.data -@pytest.mark.skip -@responses.activate -def test_container_search(app): +def test_container_search(app, mocker): with open('tests/files/elastic_container_search.json') as f: elastic_resp=json.loads(f.read()) - responses.add(responses.GET, 'http://localhost:9200/fatcat_container/_search', - json=elastic_resp, status=200) + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/container/search?q=blood') assert rv.status_code == 200 @@ -62,39 +65,35 @@ elastic_resp3 = { 'took': 0 } -@responses.activate -def test_stats(app): - - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp1.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp2.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_container/_search?request_cache=true', - json=elastic_resp3.copy(), status=200) +def test_stats(app, mocker): + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + (200, {}, json.dumps(elastic_resp2)), + (200, {}, json.dumps(elastic_resp3)), + ] + rv = app.get('/stats') assert rv.status_code == 200 - # TODO: robe these responses better - -@responses.activate -def test_stats_json(app): - - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp1.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp2.copy(), status=200) - responses.add(responses.GET, - 'http://localhost:9200/fatcat_container/_search?request_cache=true', - json=elastic_resp3.copy(), status=200) + assert b"80,578,584" in rv.data + +def test_stats_json(app, mocker): + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + (200, {}, json.dumps(elastic_resp2)), + (200, {}, json.dumps(elastic_resp3)), + ] + rv = app.get('/stats.json') assert rv.status_code == 200 + assert rv.json['papers']['in_kbart'] == 51594200 + assert rv.json['release']['refs_total'] == 8031459 -@responses.activate -def test_container_stats(app): +@pytest.mark.skip +def test_container_stats(app, mocker): elastic_resp = { 'timed_out': False, @@ -108,9 +107,10 @@ def test_container_stats(app): 'took': 50 } - responses.add(responses.GET, - 'http://localhost:9200/fatcat_release/_search?request_cache=true', - json=elastic_resp, status=200) + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] rv = app.get('/container/issnl/1234-5678/stats.json') - assert rv.status_code == 200 - # TODO: probe this response better + #print(rv.json) + assert rv.status_code == 201 -- cgit v1.2.3 From 94dc508bc54399027c3e2cff0f21e41250c81d89 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 2 Jul 2020 19:14:13 -0700 Subject: finish backend refactoring of search code --- python/fatcat_web/search.py | 241 ++++++++++++++++++++------------------------ python/tests/web_search.py | 79 ++++++++++++++- 2 files changed, 185 insertions(+), 135 deletions(-) (limited to 'python/tests/web_search.py') diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 5baa8497..ca270110 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -9,7 +9,6 @@ import datetime from dataclasses import dataclass from typing import List, Optional, Any -import requests from flask import abort, flash import elasticsearch from elasticsearch_dsl import Search, Q @@ -218,7 +217,6 @@ def do_release_search( for h in results: # Ensure 'contrib_names' is a list, not a single string - print(h, file=sys.stderr) if type(h['contrib_names']) is not list: h['contrib_names'] = [h['contrib_names'], ] h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] @@ -233,6 +231,30 @@ def do_release_search( results=results, ) +def get_elastic_container_random_releases(ident, limit=5): + """ + Returns a list of releases from the container. + """ + + assert limit > 0 and limit <= 100 + + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'bool', + must=[ + Q('term', container_id=ident), + Q('range', release_year={ "lte": datetime.datetime.today().year }), + ] + ) + search = search.sort('-in_web', '-release_date') + search = search.params(request_cache=True) + search = search[:int(limit)] + + resp = wrap_es_execution(search) + results = results_to_dict(resp) + + return results + def get_elastic_entity_stats(): """ TODO: files, filesets, webcaptures (no schema yet) @@ -246,10 +268,8 @@ def get_elastic_entity_stats(): stats = {} # release totals - search = Search( - using=app.es_client, - index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \ - .extra(request_cache=True) + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache=True) search.aggs.bucket( 'release_ref_count', 'sum', @@ -257,27 +277,25 @@ def get_elastic_entity_stats(): ) search = search[:0] # pylint: disable=unsubscriptable-object - # NOTE: not catching exceptions - resp = search.execute() + resp = wrap_es_execution(search) + stats['release'] = { "total": int(resp.hits.total), "refs_total": int(resp.aggregations.release_ref_count.value), } # paper counts - search = Search( - using=app.es_client, - index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \ - .query( - 'terms', - release_type=[ - "article-journal", - "paper-conference", - # "chapter", - # "thesis", - ], - ) \ - .extra(request_cache=True) + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.query( + 'terms', + release_type=[ + "article-journal", + "paper-conference", + # "chapter", + # "thesis", + ], + ) + search = search.params(request_cache=True) search.aggs.bucket( 'paper_like', 'filters', @@ -293,8 +311,7 @@ def get_elastic_entity_stats(): ) search = search[:0] - # NOTE: not catching exceptions - resp = search.execute() + resp = wrap_es_execution(search) buckets = resp.aggregations.paper_like.buckets stats['papers'] = { 'total': resp.hits.total, @@ -305,10 +322,8 @@ def get_elastic_entity_stats(): } # container counts - search = Search( - using=app.es_client, - index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) \ - .extra(request_cache=True) + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) + search = search.params(request_cache=True) search.aggs.bucket( 'release_ref_count', 'sum', @@ -316,8 +331,7 @@ def get_elastic_entity_stats(): ) search = search[:0] # pylint: disable=unsubscriptable-object - # NOTE: not catching exceptions - resp = search.execute() + resp = wrap_es_execution(search) stats['container'] = { "total": resp.hits.total, } @@ -335,30 +349,36 @@ def get_elastic_container_stats(ident, issnl=None): preserved """ - query = { - "size": 0, - "query": { - "term": { "container_id": ident } + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache=True) + search = search.query( + 'term', + container_id=ident, + ) + search.aggs.bucket( + 'container_stats', + 'filters', + filters={ + "in_web": { + "term": { "in_web": True }, + }, + "in_kbart": { + "term": { "in_kbart": True }, + }, + "is_preserved": { + "term": { "is_preserved": True }, + }, }, - "aggs": { "container_stats": { "filters": { "filters": { - "in_web": { "term": { "in_web": "true" } }, - "in_kbart": { "term": { "in_kbart": "true" } }, - "is_preserved": { "term": { "is_preserved": "true" } }, - }}}} - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - # TODO: abort() - #print(resp.json()) - resp.raise_for_status() - resp = resp.json() - buckets = resp['aggregations']['container_stats']['buckets'] + ) + search = search[:0] + + resp = wrap_es_execution(search) + + buckets = resp.aggregations.container_stats.buckets stats = { 'ident': ident, 'issnl': issnl, - 'total': resp['hits']['total'], + 'total': resp.hits.total, 'in_web': buckets['in_web']['doc_count'], 'in_kbart': buckets['in_kbart']['doc_count'], 'is_preserved': buckets['is_preserved']['doc_count'], @@ -366,39 +386,6 @@ def get_elastic_container_stats(ident, issnl=None): return stats -def get_elastic_container_random_releases(ident, limit=5): - """ - Returns a list of releases from the container. - """ - - assert limit > 0 and limit <= 100 - - search = Search(using=app.es_client, index=app.conf.ELASTICSEARCH_RELEASE_INDEX) \ - .query('bool', - must=[ - Q('term', container_id=ident), - Q('range', release_year={ "lte": datetime.datetime.today().year }), - ] - ) \ - .sort('-in_web', '-release_date') \ - .extra(request_cache=True) - - search = search[:int(limit)] - - resp = search.execute() - - hits = [dict(h.source) for h in resp] - - for h in hits: - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. - # "Crimes against Unicode"; production workaround - for key in h: - if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - - return hits - def get_elastic_container_histogram(ident): """ Fetches a stacked histogram @@ -409,58 +396,46 @@ def get_elastic_container_histogram(ident): (year, in_ia, count) """ - query = { - "aggs": { - "year_in_ia": { - "composite": { - "size": 1000, - "sources": [ - {"year": { - "histogram": { - "field": "release_year", - "interval": 1, - }}}, - {"in_ia": { - "terms": { - "field": "in_ia", - }}}, - ], + search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + search = search.params(request_cache='true') + search = search.query( + 'bool', + must=[ + Q("range", release_year={ + "gte": datetime.datetime.today().year - 499, + "lte": datetime.datetime.today().year, + }), + ], + filter=[ + Q("bool", minimum_should_match=1, should=[ + Q("match", container_id=ident), + ]), + ], + ) + search.aggs.bucket( + 'year_in_ia', + 'composite', + size=1000, + sources=[ + {"year": { + "histogram": { + "field": "release_year", + "interval": 1, }, - }, - }, - "size": 0, - "query": { - "bool": { - "must": [{ - "range": { - "release_year": { - "gte": datetime.datetime.today().year - 499, - "lte": datetime.datetime.today().year, - } - } - }], - "filter": [{ - "bool": { - "should": [{ - "match": { - "container_id": ident - } - }], - "minimum_should_match": 1, - }, - }], - } - } - } - resp = requests.get( - "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), - json=query, - params=dict(request_cache="true")) - resp.raise_for_status() - # TODO: abort() - resp = resp.json() - #print(resp) + }}, + {"in_ia": { + "terms": { + "field": "in_ia", + }, + }}, + ], + ) + search = search[:0] + + resp = wrap_es_execution(search) + + buckets = resp.aggregations.year_in_ia.buckets vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count']) - for h in resp['aggregations']['year_in_ia']['buckets']] + for h in buckets] vals = sorted(vals) return vals diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 07985e04..460f5ee2 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -2,6 +2,7 @@ import json import pytest +from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram from fatcat_openapi_client.rest import ApiException from fixtures import * @@ -38,6 +39,20 @@ def test_container_search(app, mocker): assert b"European Instructional Course Lectures" in rv.data assert b"British Editorial Society of Bone and Joint Surger" in rv.data +def test_random_releases(app, mocker): + + with open('tests/files/elastic_release_search.json') as f: + elastic_resp=json.loads(f.read()) + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + ] + + resp = get_elastic_container_random_releases("123") + assert len(resp) >= 1 + + elastic_resp1 = { 'timed_out': False, 'aggregations': { @@ -92,7 +107,6 @@ def test_stats_json(app, mocker): assert rv.json['papers']['in_kbart'] == 51594200 assert rv.json['release']['refs_total'] == 8031459 -@pytest.mark.skip def test_container_stats(app, mocker): elastic_resp = { @@ -110,7 +124,68 @@ def test_container_stats(app, mocker): es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') es_raw.side_effect = [ (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), ] rv = app.get('/container/issnl/1234-5678/stats.json') #print(rv.json) - assert rv.status_code == 201 + assert rv.status_code == 200 + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json') + assert rv.status_code == 200 + +def test_container_coverage(app, mocker): + + elastic_resp1 = { + 'timed_out': False, + 'aggregations': { + 'container_stats': {'buckets': { + 'is_preserved': {'doc_count': 461939}, + 'in_kbart': {'doc_count': 461939}, + 'in_web': {'doc_count': 2797}}}}, + 'hits': {'total': 461939, 'hits': [], 'max_score': 0.0}, + '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, + 'took': 50 + } + + elastic_resp2 = { + 'took': 294, + 'timed_out': False, + '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, + 'hits': {'total': 4327, 'max_score': 0.0, 'hits': []}, + 'aggregations': {'year_in_ia': { + 'after_key': {'year': 2020.0, 'in_ia': True}, + 'buckets': [ + {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4}, + {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68}, + {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26}, + {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428}, + {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14}, + {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487}, + {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13}, + {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345}, + ], + }}, + } + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp1)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage') + assert rv.status_code == 200 + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp2)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json') + assert rv.status_code == 200 + + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp2)), + ] + + rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg') + assert rv.status_code == 200 -- cgit v1.2.3 From d798ee172294de09ab1621530df4e3498a17640e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 23 Jul 2020 15:02:27 -0700 Subject: small lint fixes --- python/fatcat_web/search.py | 3 +-- python/tests/web_search.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'python/tests/web_search.py') diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index ca270110..1165a004 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -9,7 +9,6 @@ import datetime from dataclasses import dataclass from typing import List, Optional, Any -from flask import abort, flash import elasticsearch from elasticsearch_dsl import Search, Q import elasticsearch_dsl.response @@ -43,7 +42,7 @@ class ReleaseQuery: offset = max(0, int(offset)) if offset.isnumeric() else 0 return ReleaseQuery( - q=query_str, + q=query_str, offset=offset, fulltext_only=bool(args.get('fulltext_only')), container_id=container_id, diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 460f5ee2..55e90d56 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -27,7 +27,6 @@ def test_container_search(app, mocker): with open('tests/files/elastic_container_search.json') as f: elastic_resp=json.loads(f.read()) - es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') es_raw.side_effect = [ (200, {}, json.dumps(elastic_resp)), -- cgit v1.2.3