refactor entity searches

author: Bryan Newbold <bnewbold@robocracy.org> 2019-02-20 19:58:29 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-02-20 19:58:29 -0800
commit: 46e67e77cf7709dae7375a22a6fed1ac730355bf (patch)
tree: ea8ae53d57d3dc2e9cb2c6c0efa5359ef9f8d773 /python
parent: 7c04b83a6612b7a0c87afe4a1ed4bbb65568fea1 (diff)
download: fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.tar.gz fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.zip
4 files changed, 62 insertions, 78 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 7f10ee2b..115c1981 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -353,18 +353,12 @@ def changelog_entry_view(index):
 @app.route('/release/search', methods=['GET', 'POST'])
 def release_search():
 
-    limit = 20
     query = request.args.get('q')
     fulltext_only = bool(request.args.get('fulltext_only'))
 
-    # Convert raw DOIs to DOI queries
-    if query is not None:
-        if len(query.split()) == 1 and query.startswith("10.") and query.count("/") >= 1:
-            query = 'doi:"{}"'.format(query)
-
     if 'q' in request.args.keys():
         # always do files for HTML
-        found = do_release_search(query, limit=limit, fulltext_only=fulltext_only)
+        found = do_release_search(query, fulltext_only=fulltext_only)
         return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only)
     else:
         return render_template('release_search.html', query=query, fulltext_only=fulltext_only)
@@ -372,17 +366,11 @@ def release_search():
 @app.route('/container/search', methods=['GET', 'POST'])
 def container_search():
 
-    limit = 20
     query = request.args.get('q')
 
-    # Convert raw ISSN-L to ISSN-L query
-    if query is not None:
-        if len(query.split()) == 1 and len(query) == 9 and isdigit(query[0:4]) and query[4] == '-':
-            query = 'issnl:"{}"'.format(query)
-
     if 'q' in request.args.keys():
         # always do files for HTML
-        found = do_container_search(query, limit=limit)
+        found = do_container_search(query)
         return render_template('container_search.html', found=found, query=query)
     else:
         return render_template('container_search.html', query=query)
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index a301fcb5..d18416d6 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -10,33 +10,18 @@ the formal API)
 TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded
 """
 
-def do_release_search(q, limit=50, fulltext_only=True):
 
-    #print("Search hit: " + q)
+def do_search(index, request, limit=30):
+
     if limit > 100:
         # Sanity check
         limit = 100
 
-    if fulltext_only:
-        q += " in_web:true"
-
-    search_request = {
-        "query": {
-            "query_string": {
-            "query": q,
-            "default_operator": "AND",
-            "analyze_wildcard": True,
-            "lenient": True,
-            "fields": ["title^5", "contrib_names^2", "container_title"]
-            },
-        },
-        "size": int(limit),
-    }
-
-    #print(search_request)
+    request["size"] = int(limit)
+    #print(request)
     resp = requests.get("%s/%s/_search" %
-            (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_RELEASE_INDEX']),
-        json=search_request)
+            (app.config['ELASTICSEARCH_BACKEND'], index),
+        json=request)
 
     if resp.status_code == 400:
         print("elasticsearch 400: " + str(resp.content))
@@ -48,73 +33,76 @@ def do_release_search(q, limit=50, fulltext_only=True):
         abort(resp.status_code)
 
     content = resp.json()
-    #print(content)
     results = [h['_source'] for h in content['hits']['hits']]
     for h in results:
-        # Ensure 'contrib_names' is a list, not a single string
-        if type(h['contrib_names']) is not list:
-            h['contrib_names'] = [h['contrib_names'], ]
         # Handle surrogate strings that elasticsearch returns sometimes,
         # probably due to mangled data processing in some pipeline.
         # "Crimes against Unicode"; production workaround
         for key in h:
             if type(h[key]) is str:
                 h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
 
-    found = content['hits']['total']
-    return {"query": { "q": q },
-            "count_returned": len(results),
-            "count_found": found,
+    return {"count_returned": len(results),
+            "count_found": content['hits']['total'],
             "results": results }
 
-def do_container_search(q, limit=50):
+
+def do_release_search(q, limit=30, fulltext_only=True):
 
     #print("Search hit: " + q)
     if limit > 100:
         # Sanity check
         limit = 100
 
+    # Convert raw DOIs to DOI queries
+    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
+        q = 'doi:"{}"'.format(q)
+
+    # fulltext_only narrows the query by appending an in_web:true term
+    if fulltext_only:
+        q += " in_web:true"
+
     search_request = {
         "query": {
             "query_string": {
-            "query": q,
-            "default_operator": "AND",
-            "analyze_wildcard": True,
-            "lenient": True,
-            "fields": ["name^5", "publisher"]
+                "query": q,
+                "default_operator": "AND",
+                "analyze_wildcard": True,
+                "lenient": True,
+                "fields": ["title^5", "contrib_names^2", "container_title"],
             },
         },
-        "size": int(limit),
     }
 
-    #print(search_request)
-    resp = requests.get("%s/%s/_search" %
-            (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_CONTAINER_INDEX']),
-        json=search_request)
+    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request, limit=limit)
+    for h in resp['results']:
+        # Ensure 'contrib_names' is a list, not a single string
+        if type(h['contrib_names']) is not list:
+            h['contrib_names'] = [h['contrib_names'], ]
+        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+    resp["query"] = { "q": q }
+    return resp
 
-    if resp.status_code == 400:
-        print("elasticsearch 400: " + str(resp.content))
-        flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
-        abort(resp.status_code)
-    elif resp.status_code != 200:
-        print("elasticsearch non-200 status code: " + str(resp.status_code))
-        print(resp.content)
-        abort(resp.status_code)
 
-    content = resp.json()
-    #print(content)
-    results = [h['_source'] for h in content['hits']['hits']]
-    for h in results:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+def do_container_search(q, limit=30):
+    """Query the container index for q and return do_search()'s dict plus a "query" entry."""
+    # Convert raw ISSN-L to ISSN-L query (isdigit is a str method, not a builtin)
+    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
+        q = 'issnl:"{}"'.format(q)
+
+    search_request = {
+        "query": {
+            "query_string": {
+                "query": q,
+                "default_operator": "AND",
+                "analyze_wildcard": True,
+                "lenient": True,
+                "fields": ["name^5", "publisher"],
+            },
+        },
+    }
+
+    resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit)
+    resp["query"] = { "q": q }
+    return resp
 
-    found = content['hits']['total']
-    return {"query": { "q": q },
-            "count_returned": len(results),
-            "count_found": found,
-            "results": results }
diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html
index 9fc7aa98..925fbc64 100644
--- a/python/fatcat_web/templates/container_search.html
+++ b/python/fatcat_web/templates/container_search.html
@@ -33,6 +33,10 @@
   {% endif %}
 </div>
 {% endfor %}
+{% if found.results|length > 8 %}
+  <br>
+  <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i>
+{% endif %}
 {% else %}
 <div class="featurette-inner text-center" style="padding-top: 15%;">
   <h3>No results found!</h3>
diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html
index 961bf54f..a7dc3c80 100644
--- a/python/fatcat_web/templates/release_search.html
+++ b/python/fatcat_web/templates/release_search.html
@@ -46,6 +46,10 @@
   {% endif %}
 </div>
 {% endfor %}
+{% if found.results|length > 8 %}
+  <br>
+  <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i>
+{% endif %}
 {% else %}
 <div class="featurette-inner text-center" style="padding-top: 15%;">
   <h3>No results found!</h3>
author	Bryan Newbold <bnewbold@robocracy.org>	2019-02-20 19:58:29 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-02-20 19:58:29 -0800
commit	46e67e77cf7709dae7375a22a6fed1ac730355bf (patch)
tree	ea8ae53d57d3dc2e9cb2c6c0efa5359ef9f8d773 /python
parent	7c04b83a6612b7a0c87afe4a1ed4bbb65568fea1 (diff)
download	fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.tar.gz fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.zip