diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-02 19:14:13 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-07-24 10:07:48 -0700 | 
| commit | 94dc508bc54399027c3e2cff0f21e41250c81d89 (patch) | |
| tree | 20dc5eda3185c1425c657112d919f46c985c3816 /python | |
| parent | a1f14f5b5ce087cb4681d46817da2be0777e4220 (diff) | |
| download | fatcat-94dc508bc54399027c3e2cff0f21e41250c81d89.tar.gz fatcat-94dc508bc54399027c3e2cff0f21e41250c81d89.zip | |
finish backend refactoring of search code
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_web/search.py | 241 | ||||
| -rw-r--r-- | python/tests/web_search.py | 79 | 
2 files changed, 185 insertions, 135 deletions
| diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 5baa8497..ca270110 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -9,7 +9,6 @@ import datetime  from dataclasses import dataclass  from typing import List, Optional, Any -import requests  from flask import abort, flash  import elasticsearch  from elasticsearch_dsl import Search, Q @@ -218,7 +217,6 @@ def do_release_search(      for h in results:          # Ensure 'contrib_names' is a list, not a single string -        print(h, file=sys.stderr)          if type(h['contrib_names']) is not list:              h['contrib_names'] = [h['contrib_names'], ]          h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] @@ -233,6 +231,30 @@ def do_release_search(          results=results,      ) +def get_elastic_container_random_releases(ident, limit=5): +    """ +    Returns a list of releases from the container. +    """ + +    assert limit > 0 and limit <= 100 + +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'bool', +        must=[ +            Q('term', container_id=ident), +            Q('range', release_year={ "lte": datetime.datetime.today().year }), +        ] +    ) +    search = search.sort('-in_web', '-release_date') +    search = search.params(request_cache=True) +    search = search[:int(limit)] + +    resp = wrap_es_execution(search) +    results = results_to_dict(resp) + +    return results +  def get_elastic_entity_stats():      """      TODO: files, filesets, webcaptures (no schema yet) @@ -246,10 +268,8 @@ def get_elastic_entity_stats():      stats = {}      # release totals -    search = Search( -        using=app.es_client, -        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \ -        .extra(request_cache=True) +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.params(request_cache=True)      search.aggs.bucket(          'release_ref_count',          'sum', @@ -257,27 +277,25 @@ def get_elastic_entity_stats():      )      search = search[:0]  # pylint: disable=unsubscriptable-object -    # NOTE: not catching exceptions -    resp = search.execute() +    resp = wrap_es_execution(search) +      stats['release'] = {          "total": int(resp.hits.total),          "refs_total": int(resp.aggregations.release_ref_count.value),      }      # paper counts -    search = Search( -        using=app.es_client, -        index=app.config['ELASTICSEARCH_RELEASE_INDEX']) \ -        .query( -            'terms', -            release_type=[ -                "article-journal", -                "paper-conference", -                # "chapter", -                # "thesis", -            ], -        ) \ -        .extra(request_cache=True) +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.query( +        'terms', +        release_type=[ +            "article-journal", +            "paper-conference", +            # "chapter", +            # "thesis", +        ], +    ) +    search = search.params(request_cache=True)      search.aggs.bucket(          'paper_like',          'filters', @@ -293,8 +311,7 @@ def get_elastic_entity_stats():      )      search = search[:0] -    # NOTE: not catching exceptions -    resp = search.execute() +    resp = wrap_es_execution(search)      buckets = resp.aggregations.paper_like.buckets      stats['papers'] = {          'total': resp.hits.total, @@ -305,10 +322,8 @@ def get_elastic_entity_stats():      }      # container counts -    search = Search( -        using=app.es_client, -        index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) \ -        .extra(request_cache=True) +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_CONTAINER_INDEX']) +    search = search.params(request_cache=True)      search.aggs.bucket(          'release_ref_count',          'sum', @@ -316,8 +331,7 @@ def get_elastic_entity_stats():      )      search = search[:0]  # pylint: disable=unsubscriptable-object -    # NOTE: not catching exceptions -    resp = search.execute() +    resp = wrap_es_execution(search)      stats['container'] = {          "total": resp.hits.total,      } @@ -335,30 +349,36 @@ def get_elastic_container_stats(ident, issnl=None):          preserved      """ -    query = { -        "size": 0, -        "query": { -            "term": { "container_id": ident } +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.params(request_cache=True) +    search = search.query( +        'term', +        container_id=ident, +    ) +    search.aggs.bucket( +        'container_stats', +        'filters', +        filters={ +            "in_web": { +                "term": { "in_web": True }, +            }, +            "in_kbart": { +                "term": { "in_kbart": True }, +            }, +            "is_preserved": { +                "term": { "is_preserved": True }, +            },          }, -        "aggs": { "container_stats": { "filters": { "filters": { -                "in_web": { "term": { "in_web": "true" } }, -                "in_kbart": { "term": { "in_kbart": "true" } }, -                "is_preserved": { "term": { "is_preserved": "true" } }, -        }}}} -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    # TODO: abort() -    #print(resp.json()) -    resp.raise_for_status() -    resp = resp.json() -    buckets = resp['aggregations']['container_stats']['buckets'] +    ) +    search = search[:0] + +    resp = wrap_es_execution(search) + +    buckets = resp.aggregations.container_stats.buckets      stats = {          'ident': ident,          'issnl': issnl, -        'total': resp['hits']['total'], +        'total': resp.hits.total,          'in_web': buckets['in_web']['doc_count'],          'in_kbart': buckets['in_kbart']['doc_count'],          'is_preserved': buckets['is_preserved']['doc_count'], @@ -366,39 +386,6 @@ def get_elastic_container_stats(ident, issnl=None):      return stats -def get_elastic_container_random_releases(ident, limit=5): -    """ -    Returns a list of releases from the container. -    """ - -    assert limit > 0 and limit <= 100 - -    search = Search(using=app.es_client, index=app.conf.ELASTICSEARCH_RELEASE_INDEX) \ -        .query('bool', -            must=[ -                Q('term', container_id=ident), -                Q('range', release_year={ "lte": datetime.datetime.today().year }), -            ] -        ) \ -        .sort('-in_web', '-release_date') \ -        .extra(request_cache=True) - -    search = search[:int(limit)] - -    resp = search.execute() - -    hits = [dict(h.source) for h in resp] - -    for h in hits: -        # Handle surrogate strings that elasticsearch returns sometimes, -        # probably due to mangled data processing in some pipeline. -        # "Crimes against Unicode"; production workaround -        for key in h: -            if type(h[key]) is str: -                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - -    return hits -  def get_elastic_container_histogram(ident):      """      Fetches a stacked histogram @@ -409,58 +396,46 @@ def get_elastic_container_histogram(ident):          (year, in_ia, count)      """ -    query = { -        "aggs": { -            "year_in_ia": { -                "composite": { -                    "size": 1000, -                    "sources": [ -                        {"year": { -                            "histogram": { -                                "field": "release_year", -                                "interval": 1, -                        }}}, -                        {"in_ia": { -                            "terms": { -                                "field": "in_ia", -                        }}}, -                    ], +    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) +    search = search.params(request_cache='true') +    search = search.query( +        'bool', +        must=[ +            Q("range", release_year={ +                "gte": datetime.datetime.today().year - 499, +                "lte": datetime.datetime.today().year, +            }), +        ], +        filter=[ +            Q("bool", minimum_should_match=1, should=[ +                Q("match", container_id=ident), +            ]), +        ], +    ) +    search.aggs.bucket( +        'year_in_ia', +        'composite', +        size=1000, +        sources=[ +            {"year": { +                "histogram": { +                    "field": "release_year", +                    "interval": 1,                  }, -            }, -        }, -        "size": 0, -        "query": { -            "bool": { -                "must": [{ -                    "range": { -                        "release_year": { -                            "gte": datetime.datetime.today().year - 499, -                            "lte": datetime.datetime.today().year, -                        } -                    } -                }], -                "filter": [{ -                    "bool": { -                        "should": [{ -                            "match": { -                                "container_id": ident -                            } -                        }], -                        "minimum_should_match": 1, -                    }, -                }], -            } -        } -    } -    resp = requests.get( -        "{}/fatcat_release/_search".format(app.config['ELASTICSEARCH_BACKEND']), -        json=query, -        params=dict(request_cache="true")) -    resp.raise_for_status() -    # TODO: abort() -    resp = resp.json() -    #print(resp) +            }}, +            {"in_ia": { +                "terms": { +                    "field": "in_ia", +                }, +            }}, +        ], +    ) +    search = search[:0] + +    resp = wrap_es_execution(search) + +    buckets = resp.aggregations.year_in_ia.buckets      vals = [(h['key']['year'], h['key']['in_ia'], h['doc_count']) -            for h in resp['aggregations']['year_in_ia']['buckets']] +            for h in buckets]      vals = sorted(vals)      return vals diff --git a/python/tests/web_search.py b/python/tests/web_search.py index 07985e04..460f5ee2 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -2,6 +2,7 @@  import json  import pytest +from fatcat_web.search import get_elastic_container_random_releases, get_elastic_container_histogram  from fatcat_openapi_client.rest import ApiException  from fixtures import * @@ -38,6 +39,20 @@ def test_container_search(app, mocker):      assert b"European Instructional Course Lectures" in rv.data      assert b"British Editorial Society of Bone and Joint Surger" in rv.data +def test_random_releases(app, mocker): + +    with open('tests/files/elastic_release_search.json') as f: +        elastic_resp=json.loads(f.read()) + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp)), +    ] + +    resp = get_elastic_container_random_releases("123") +    assert len(resp) >= 1 + +  elastic_resp1 = {      'timed_out': False,      'aggregations': { @@ -92,7 +107,6 @@ def test_stats_json(app, mocker):      assert rv.json['papers']['in_kbart'] == 51594200      assert rv.json['release']['refs_total'] == 8031459 -@pytest.mark.skip  def test_container_stats(app, mocker):      elastic_resp = { @@ -110,7 +124,68 @@ def test_container_stats(app, mocker):      es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')      es_raw.side_effect = [          (200, {}, json.dumps(elastic_resp)), +        (200, {}, json.dumps(elastic_resp)),      ]      rv = app.get('/container/issnl/1234-5678/stats.json')      #print(rv.json) -    assert rv.status_code == 201 +    assert rv.status_code == 200 + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json') +    assert rv.status_code == 200 + +def test_container_coverage(app, mocker): + +    elastic_resp1 = { +        'timed_out': False, +        'aggregations': { +            'container_stats': {'buckets': { +              'is_preserved': {'doc_count': 461939}, +              'in_kbart': {'doc_count': 461939}, +              'in_web': {'doc_count': 2797}}}}, +        'hits': {'total': 461939, 'hits': [], 'max_score': 0.0}, +        '_shards': {'successful': 5, 'total': 5, 'skipped': 0, 'failed': 0}, +        'took': 50 +    } + +    elastic_resp2 = { +        'took': 294, +        'timed_out': False, +        '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, +        'hits': {'total': 4327, 'max_score': 0.0, 'hits': []}, +        'aggregations': {'year_in_ia': { +            'after_key': {'year': 2020.0, 'in_ia': True}, +            'buckets': [ +                {'key': {'year': 2004.0, 'in_ia': False}, 'doc_count': 4}, +                {'key': {'year': 2004.0, 'in_ia': True}, 'doc_count': 68}, +                {'key': {'year': 2005.0, 'in_ia': False}, 'doc_count': 26}, +                {'key': {'year': 2005.0, 'in_ia': True}, 'doc_count': 428}, +                {'key': {'year': 2006.0, 'in_ia': False}, 'doc_count': 14}, +                {'key': {'year': 2006.0, 'in_ia': True}, 'doc_count': 487}, +                {'key': {'year': 2007.0, 'in_ia': False}, 'doc_count': 13}, +                {'key': {'year': 2007.0, 'in_ia': True}, 'doc_count': 345}, +            ], +        }}, +    } + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp1)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/coverage') +    assert rv.status_code == 200 + +    es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp2)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.json') +    assert rv.status_code == 200 + +    es_raw.side_effect = [ +        (200, {}, json.dumps(elastic_resp2)), +    ] + +    rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/ia_coverage_years.svg') +    assert rv.status_code == 200 | 
