import requests
from flask import abort
from fatcat import app


def do_search(q, limit=50, fulltext_only=True):
    #print("Search hit: " + q)
    if limit > 100:
        # Sanity check
        limit = 100

    if fulltext_only:
        q += " file_in_ia:true"

    search_request = {
        "query": {
            "query_string": {
                "query": q,
                "analyzer": "textIcuSearch",
                "default_operator": "AND",
                "analyze_wildcard": True,
                "lenient": True,
                "fields": ["title^5", "contrib_names^2", "container_title"],
            },
        },
        "size": int(limit),
    }

    #print(search_request)
    resp = requests.get("%s/%s/_search" %
            (app.config['ELASTIC_BACKEND'], app.config['ELASTIC_INDEX']),
        json=search_request)

    if resp.status_code != 200:
        print("elasticsearch non-200 status code: " + str(resp.status_code))
        print(resp.content)
        abort(resp.status_code)

    content = resp.json()
    #print(content)
    results = [h['_source'] for h in content['hits']['hits']]
    for h in results:
        # Ensure 'contrib_names' is a list, not a single string
        if type(h['contrib_names']) is not list:
            h['contrib_names'] = [h['contrib_names'], ]
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key in h:
            if type(h[key]) is str:
                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8')
                              for name in h['contrib_names']]

    found = content['hits']['total']
    return {"query": {"q": q},
            "count_returned": len(results),
            "count_found": found,
            "results": results}
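
# A hypothetical caller for do_search(), sketched for illustration only:
# the route path, query parameter name, and template name are assumptions
# and are not taken from the listing above.
from flask import request, render_template

@app.route('/search')
def search():
    q = request.args.get('q', '')
    if not q:
        return render_template('search.html', found=None)
    # do_search() aborts on non-200 elasticsearch responses, so anything
    # returned here is safe to hand to the template.
    found = do_search(q, limit=25, fulltext_only=True)
    return render_template('search.html', found=found)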