diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-20 19:58:29 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-02-20 19:58:29 -0800 | 
| commit | 46e67e77cf7709dae7375a22a6fed1ac730355bf (patch) | |
| tree | ea8ae53d57d3dc2e9cb2c6c0efa5359ef9f8d773 /python | |
| parent | 7c04b83a6612b7a0c87afe4a1ed4bbb65568fea1 (diff) | |
| download | fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.tar.gz fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.zip | |
refactor entity searches
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_web/routes.py | 16 | ||||
| -rw-r--r-- | python/fatcat_web/search.py | 116 | ||||
| -rw-r--r-- | python/fatcat_web/templates/container_search.html | 4 | ||||
| -rw-r--r-- | python/fatcat_web/templates/release_search.html | 4 | 
4 files changed, 62 insertions, 78 deletions
| diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 7f10ee2b..115c1981 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -353,18 +353,12 @@ def changelog_entry_view(index):  @app.route('/release/search', methods=['GET', 'POST'])  def release_search(): -    limit = 20      query = request.args.get('q')      fulltext_only = bool(request.args.get('fulltext_only')) -    # Convert raw DOIs to DOI queries -    if query is not None: -        if len(query.split()) == 1 and query.startswith("10.") and query.count("/") >= 1: -            query = 'doi:"{}"'.format(query) -      if 'q' in request.args.keys():          # always do files for HTML -        found = do_release_search(query, limit=limit, fulltext_only=fulltext_only) +        found = do_release_search(query, fulltext_only=fulltext_only)          return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only)      else:          return render_template('release_search.html', query=query, fulltext_only=fulltext_only) @@ -372,17 +366,11 @@ def release_search():  @app.route('/container/search', methods=['GET', 'POST'])  def container_search(): -    limit = 20      query = request.args.get('q') -    # Convert raw ISSN-L to ISSN-L query -    if query is not None: -        if len(query.split()) == 1 and len(query) == 9 and isdigit(query[0:4]) and query[4] == '-': -            query = 'issnl:"{}"'.format(query) -      if 'q' in request.args.keys():          # always do files for HTML -        found = do_container_search(query, limit=limit) +        found = do_container_search(query)          return render_template('container_search.html', found=found, query=query)      else:          return render_template('container_search.html', query=query) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index a301fcb5..d18416d6 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -10,33 +10,18 @@ the formal API)  TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded  """ -def do_release_search(q, limit=50, fulltext_only=True): -    #print("Search hit: " + q) +def do_search(index, request, limit=30): +      if limit > 100:          # Sanity check          limit = 100 -    if fulltext_only: -        q += " in_web:true" - -    search_request = { -        "query": { -            "query_string": { -            "query": q, -            "default_operator": "AND", -            "analyze_wildcard": True, -            "lenient": True, -            "fields": ["title^5", "contrib_names^2", "container_title"] -            }, -        }, -        "size": int(limit), -    } - -    #print(search_request) +    request["size"] = int(limit) +    #print(request)      resp = requests.get("%s/%s/_search" % -            (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_RELEASE_INDEX']), -        json=search_request) +            (app.config['ELASTICSEARCH_BACKEND'], index), +        json=request)      if resp.status_code == 400:          print("elasticsearch 400: " + str(resp.content)) @@ -48,73 +33,76 @@ def do_release_search(q, limit=50, fulltext_only=True):          abort(resp.status_code)      content = resp.json() -    #print(content)      results = [h['_source'] for h in content['hits']['hits']]      for h in results: -        # Ensure 'contrib_names' is a list, not a single string -        if type(h['contrib_names']) is not list: -            h['contrib_names'] = [h['contrib_names'], ]          # Handle surrogate strings that elasticsearch returns sometimes,          # probably due to mangled data processing in some pipeline.          # "Crimes against Unicode"; production workaround          for key in h:              if type(h[key]) is str:                  h[key] = h[key].encode('utf8', 'ignore').decode('utf8') -        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] -    found = content['hits']['total'] -    return {"query": { "q": q }, -            "count_returned": len(results), -            "count_found": found, +    return {"count_returned": len(results), +            "count_found": content['hits']['total'],              "results": results } -def do_container_search(q, limit=50): + +def do_release_search(q, limit=30, fulltext_only=True):      #print("Search hit: " + q)      if limit > 100:          # Sanity check          limit = 100 +    # Convert raw DOIs to DOI queries +    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: +        q = 'doi:"{}"'.format(q) + + +    if fulltext_only: +        q += " in_web:true" +      search_request = {          "query": {              "query_string": { -            "query": q, -            "default_operator": "AND", -            "analyze_wildcard": True, -            "lenient": True, -            "fields": ["name^5", "publisher"] +                "query": q, +                "default_operator": "AND", +                "analyze_wildcard": True, +                "lenient": True, +                "fields": ["title^5", "contrib_names^2", "container_title"],              },          }, -        "size": int(limit),      } -    #print(search_request) -    resp = requests.get("%s/%s/_search" % -            (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_CONTAINER_INDEX']), -        json=search_request) +    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request) +    for h in resp['results']: +        # Ensure 'contrib_names' is a list, not a single string +        if type(h['contrib_names']) is not list: +            h['contrib_names'] = [h['contrib_names'], ] +        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] +    resp["query"] = { "q": q } +    return resp -    if resp.status_code == 400: -        print("elasticsearch 400: " + str(resp.content)) -        flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content)) -        abort(resp.status_code) -    elif resp.status_code != 200: -        print("elasticsearch non-200 status code: " + str(resp.status_code)) -        print(resp.content) -        abort(resp.status_code) -    content = resp.json() -    #print(content) -    results = [h['_source'] for h in content['hits']['hits']] -    for h in results: -        # Handle surrogate strings that elasticsearch returns sometimes, -        # probably due to mangled data processing in some pipeline. -        # "Crimes against Unicode"; production workaround -        for key in h: -            if type(h[key]) is str: -                h[key] = h[key].encode('utf8', 'ignore').decode('utf8') +def do_container_search(q, limit=30): + +    # Convert raw ISSN-L to ISSN-L query +    if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-': +        q = 'issnl:"{}"'.format(q) + +    search_request = { +        "query": { +            "query_string": { +                "query": q, +                "default_operator": "AND", +                "analyze_wildcard": True, +                "lenient": True, +                "fields": ["name^5", "publisher"], +            }, +        }, +    } + +    resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit) +    resp["query"] = { "q": q } +    return resp -    found = content['hits']['total'] -    return {"query": { "q": q }, -            "count_returned": len(results), -            "count_found": found, -            "results": results } diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html index 9fc7aa98..925fbc64 100644 --- a/python/fatcat_web/templates/container_search.html +++ b/python/fatcat_web/templates/container_search.html @@ -33,6 +33,10 @@    {% endif %}  </div>  {% endfor %} +{% if found.results|length > 8 %} +  <br> +  <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i> +{% endif %}  {% else %}  <div class="featurette-inner text-center" style="padding-top: 15%;">    <h3>No results found!</h3> diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html index 961bf54f..a7dc3c80 100644 --- a/python/fatcat_web/templates/release_search.html +++ b/python/fatcat_web/templates/release_search.html @@ -46,6 +46,10 @@    {% endif %}  </div>  {% endfor %} +{% if found.results|length > 8 %} +  <br> +  <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i> +{% endif %}  {% else %}  <div class="featurette-inner text-center" style="padding-top: 15%;">    <h3>No results found!</h3> | 
