about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-02-20 19:58:29 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-02-20 19:58:29 -0800
commit 46e67e77cf7709dae7375a22a6fed1ac730355bf (patch)
tree ea8ae53d57d3dc2e9cb2c6c0efa5359ef9f8d773 /python
parent 7c04b83a6612b7a0c87afe4a1ed4bbb65568fea1 (diff)
download fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.tar.gz
fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.zip
refactor entity searches
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_web/routes.py 16
-rw-r--r-- python/fatcat_web/search.py 116
-rw-r--r-- python/fatcat_web/templates/container_search.html 4
-rw-r--r-- python/fatcat_web/templates/release_search.html 4
4 files changed, 62 insertions, 78 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 7f10ee2b..115c1981 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -353,18 +353,12 @@ def changelog_entry_view(index):
@app.route('/release/search', methods=['GET', 'POST'])
def release_search():
- limit = 20
query = request.args.get('q')
fulltext_only = bool(request.args.get('fulltext_only'))
- # Convert raw DOIs to DOI queries
- if query is not None:
- if len(query.split()) == 1 and query.startswith("10.") and query.count("/") >= 1:
- query = 'doi:"{}"'.format(query)
-
if 'q' in request.args.keys():
# always do files for HTML
- found = do_release_search(query, limit=limit, fulltext_only=fulltext_only)
+ found = do_release_search(query, fulltext_only=fulltext_only)
return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only)
else:
return render_template('release_search.html', query=query, fulltext_only=fulltext_only)
@@ -372,17 +366,11 @@ def release_search():
@app.route('/container/search', methods=['GET', 'POST'])
def container_search():
- limit = 20
query = request.args.get('q')
- # Convert raw ISSN-L to ISSN-L query
- if query is not None:
- if len(query.split()) == 1 and len(query) == 9 and isdigit(query[0:4]) and query[4] == '-':
- query = 'issnl:"{}"'.format(query)
-
if 'q' in request.args.keys():
# always do files for HTML
- found = do_container_search(query, limit=limit)
+ found = do_container_search(query)
return render_template('container_search.html', found=found, query=query)
else:
return render_template('container_search.html', query=query)
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index a301fcb5..d18416d6 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -10,33 +10,18 @@ the formal API)
TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded
"""
-def do_release_search(q, limit=50, fulltext_only=True):
- #print("Search hit: " + q)
+def do_search(index, request, limit=30):
+
if limit > 100:
# Sanity check
limit = 100
- if fulltext_only:
- q += " in_web:true"
-
- search_request = {
- "query": {
- "query_string": {
- "query": q,
- "default_operator": "AND",
- "analyze_wildcard": True,
- "lenient": True,
- "fields": ["title^5", "contrib_names^2", "container_title"]
- },
- },
- "size": int(limit),
- }
-
- #print(search_request)
+ request["size"] = int(limit)
+ #print(request)
resp = requests.get("%s/%s/_search" %
- (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_RELEASE_INDEX']),
- json=search_request)
+ (app.config['ELASTICSEARCH_BACKEND'], index),
+ json=request)
if resp.status_code == 400:
print("elasticsearch 400: " + str(resp.content))
@@ -48,73 +33,76 @@ def do_release_search(q, limit=50, fulltext_only=True):
abort(resp.status_code)
content = resp.json()
- #print(content)
results = [h['_source'] for h in content['hits']['hits']]
for h in results:
- # Ensure 'contrib_names' is a list, not a single string
- if type(h['contrib_names']) is not list:
- h['contrib_names'] = [h['contrib_names'], ]
# Handle surrogate strings that elasticsearch returns sometimes,
# probably due to mangled data processing in some pipeline.
# "Crimes against Unicode"; production workaround
for key in h:
if type(h[key]) is str:
h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
- h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
- found = content['hits']['total']
- return {"query": { "q": q },
- "count_returned": len(results),
- "count_found": found,
+ return {"count_returned": len(results),
+ "count_found": content['hits']['total'],
"results": results }
-def do_container_search(q, limit=50):
+
+def do_release_search(q, limit=30, fulltext_only=True):
#print("Search hit: " + q)
if limit > 100:
# Sanity check
limit = 100
+ # Convert raw DOIs to DOI queries
+ if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
+ q = 'doi:"{}"'.format(q)
+
+
+ if fulltext_only:
+ q += " in_web:true"
+
search_request = {
"query": {
"query_string": {
- "query": q,
- "default_operator": "AND",
- "analyze_wildcard": True,
- "lenient": True,
- "fields": ["name^5", "publisher"]
+ "query": q,
+ "default_operator": "AND",
+ "analyze_wildcard": True,
+ "lenient": True,
+ "fields": ["title^5", "contrib_names^2", "container_title"],
},
},
- "size": int(limit),
}
- #print(search_request)
- resp = requests.get("%s/%s/_search" %
- (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_CONTAINER_INDEX']),
- json=search_request)
+ resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request)
+ for h in resp['results']:
+ # Ensure 'contrib_names' is a list, not a single string
+ if type(h['contrib_names']) is not list:
+ h['contrib_names'] = [h['contrib_names'], ]
+ h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+ resp["query"] = { "q": q }
+ return resp
- if resp.status_code == 400:
- print("elasticsearch 400: " + str(resp.content))
- flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
- abort(resp.status_code)
- elif resp.status_code != 200:
- print("elasticsearch non-200 status code: " + str(resp.status_code))
- print(resp.content)
- abort(resp.status_code)
- content = resp.json()
- #print(content)
- results = [h['_source'] for h in content['hits']['hits']]
- for h in results:
- # Handle surrogate strings that elasticsearch returns sometimes,
- # probably due to mangled data processing in some pipeline.
- # "Crimes against Unicode"; production workaround
- for key in h:
- if type(h[key]) is str:
- h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+def do_container_search(q, limit=30):
+
+ # Convert raw ISSN-L to ISSN-L query
+ if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-':
+ q = 'issnl:"{}"'.format(q)
+
+ search_request = {
+ "query": {
+ "query_string": {
+ "query": q,
+ "default_operator": "AND",
+ "analyze_wildcard": True,
+ "lenient": True,
+ "fields": ["name^5", "publisher"],
+ },
+ },
+ }
+
+ resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit)
+ resp["query"] = { "q": q }
+ return resp
- found = content['hits']['total']
- return {"query": { "q": q },
- "count_returned": len(results),
- "count_found": found,
- "results": results }
diff --git a/python/fatcat_web/templates/container_search.html b/python/fatcat_web/templates/container_search.html
index 9fc7aa98..925fbc64 100644
--- a/python/fatcat_web/templates/container_search.html
+++ b/python/fatcat_web/templates/container_search.html
@@ -33,6 +33,10 @@
{% endif %}
</div>
{% endfor %}
+{% if found.results|length > 8 %}
+ <br>
+ <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i>
+{% endif %}
{% else %}
<div class="featurette-inner text-center" style="padding-top: 15%;">
<h3>No results found!</h3>
diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html
index 961bf54f..a7dc3c80 100644
--- a/python/fatcat_web/templates/release_search.html
+++ b/python/fatcat_web/templates/release_search.html
@@ -46,6 +46,10 @@
{% endif %}
</div>
{% endfor %}
+{% if found.results|length > 8 %}
+ <br>
+ <i>Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: <code>{{ found.query.q }}</code></i>
+{% endif %}
{% else %}
<div class="featurette-inner text-center" style="padding-top: 15%;">
<h3>No results found!</h3>