1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
"""
Helpers to make elasticsearch queries.
"""
import json
import datetime
import elasticsearch
from elasticsearch_dsl import Search, Q
from dynaconf import settings
def _scrub_surrogates(value):
    """Return *value* with characters that cannot be encoded as UTF-8 removed.

    Strings are round-tripped through UTF-8 with encoding errors ignored,
    which drops lone surrogates. Lists are scrubbed item by item. Any other
    value is returned unchanged.
    """
    if isinstance(value, str):
        return value.encode('utf8', 'ignore').decode('utf8')
    if isinstance(value, list):
        return [_scrub_surrogates(item) for item in value]
    return value


def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
    """Run a paginated search and return the hits as plain python dicts.

    Args:
        search: query object that supports slicing and ``.execute()`` (the
            caller in this module passes an ``elasticsearch_dsl`` ``Search``).
        limit: maximum number of hits to return; coerced to int and clamped
            to the range 0..100.
        offset: index of the first hit to return; coerced to int and clamped
            to the range 0..``deep_page_limit``.
        deep_page_limit: largest offset that will be requested.

    Returns:
        A dict with the keys ``count_returned``, ``count_found``,
        ``results`` (list of dicts, each with an added ``_highlights`` list),
        ``offset``, ``limit``, ``deep_page_limit`` and ``query_time_ms``.

    Raises:
        ValueError: the backend rejected the request (``RequestError``).
        RuntimeError: any other ``TransportError`` from the backend.
    """
    # Sanity checks. Coerce first so that string inputs (e.g. raw request
    # parameters) can be compared against the numeric bounds.
    limit = max(0, min(int(limit), 100))
    # Upper bound avoids the deep paging problem.
    offset = max(0, min(int(offset), deep_page_limit))

    search = search[offset:offset + limit]

    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e:
        # this is a "user" error (e.g. a query that failed to parse)
        print("elasticsearch 400: " + str(e.info))
        # XXX: abort(e.status_code)
        raise ValueError(
            "elasticsearch rejected the search request: {}".format(e.info)
        ) from e
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        print("elasticsearch non-200 status code: {}".format(e.info))
        # XXX: abort(e.status_code)
        raise RuntimeError(
            "elasticsearch search failed: {}".format(e.info)
        ) from e

    # convert from objects to python dicts
    results = []
    for hit in resp:
        # Shallow copy so the response object's own data is not modified.
        record = dict(hit._d_)
        fragments = []
        highlight = getattr(hit.meta, 'highlight', None)
        if highlight is not None:
            # Flatten the per-field highlight fragments into a single list.
            for field_fragments in highlight._d_.values():
                fragments.extend(field_fragments)
        record['_highlights'] = fragments
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        results.append(
            {key: _scrub_surrogates(val) for key, val in record.items()}
        )

    # The total is a plain number in older responses and an object with a
    # ``value`` attribute in newer ones; accept either form.
    total = resp.hits.total
    count_found = int(getattr(total, 'value', total))

    return {
        "count_returned": len(results),
        "count_found": count_found,
        "results": results,
        "offset": offset,
        "limit": limit,
        "deep_page_limit": deep_page_limit,
        "query_time_ms": int(resp.took),
    }
def _fulltext_filters(filter_time, filter_type):
    """Translate symbolic filter names into ``(filter kind, clause)`` pairs.

    Each pair is meant to be applied as ``search.filter(kind, **clause)``.
    Raises ValueError for an unrecognized ``filter_type`` or ``filter_time``.
    """
    filters = []

    # type filters
    if filter_type == "papers":
        filters.append(("terms", {"release_type": [
            "article-journal", "paper-conference", "chapter",
        ]}))
    elif filter_type == "reports":
        filters.append(("terms", {"release_type": ["report", "standard"]}))
    elif filter_type == "datasets":
        filters.append(("terms", {"release_type": ["dataset", "software"]}))
    elif filter_type == "everything" or filter_type is None:
        pass
    else:
        # XXX: abort(400)
        raise ValueError("unknown filter_type: {!r}".format(filter_type))

    # time filters
    today = datetime.date.today()
    if filter_time == "past_week":
        week_ago_date = str(today - datetime.timedelta(days=7))
        filters.append(("range", {"release_date": dict(gte=week_ago_date)}))
    elif filter_time == "this_year":
        filters.append(("term", {"release_year": today.year}))
    elif filter_time == "since_2000":
        filters.append(("range", {"release_year": dict(gte=2000)}))
    elif filter_time == "before_1925":
        filters.append(("range", {"release_year": dict(lte=1924)}))
    elif filter_time == "all_time" or filter_time is None:
        pass
    else:
        # XXX: abort(400)
        raise ValueError("unknown filter_time: {!r}".format(filter_time))

    return filters


def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
    """Run a fulltext query against the configured fulltext index.

    Args:
        q: query string in ``query_string`` syntax. A single token that
            starts with ``10.`` and contains a ``/`` is treated as a raw DOI
            and rewritten to a ``doi:"..."`` query.
        limit: maximum number of hits, forwarded to
            ``generic_search_execute``.
        offset: index of the first hit, forwarded to
            ``generic_search_execute``.
        filter_time: one of ``"past_week"``, ``"this_year"``,
            ``"since_2000"``, ``"before_1925"``, ``"all_time"`` or None.
        filter_type: one of ``"papers"``, ``"reports"``, ``"datasets"``,
            ``"everything"`` or None.

    Returns:
        The dict produced by ``generic_search_execute``, with every result's
        ``contrib_names`` normalized to a list and a ``query`` entry added
        holding the (possibly rewritten) query string.

    Raises:
        ValueError: ``filter_time`` or ``filter_type`` is not a known value.
    """
    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    # Validate the filters before building a client, so that bad input fails
    # fast without touching the backend configuration.
    filters = _fulltext_filters(filter_time, filter_type)

    es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)
    search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)

    for kind, clause in filters:
        search = search.filter(kind, **clause)

    search = search.query(
        'query_string',
        query=q,
        default_operator="AND",
        analyze_wildcard=True,
        lenient=True,
        fields=[
            "everything",
            "abstract",
            "fulltext.body",
            "fulltext.annex",
        ],
    )
    search = search.highlight(
        "abstract",
        "fulltext.body",
        "fulltext.annex",
        number_of_fragments=3,
        fragment_size=150,
    )

    # Forward ``limit`` as well as ``offset``; otherwise the caller's page
    # size would be silently replaced by the helper's default.
    resp = generic_search_execute(search, limit=limit, offset=offset)

    for h in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string; a missing
        # or null field becomes an empty list.
        names = h.get('contrib_names')
        if names is None:
            names = []
        elif not isinstance(names, list):
            names = [names]
        # Drop characters that cannot be encoded as UTF-8 (lone surrogates).
        h['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8')
            if isinstance(name, str) else name
            for name in names
        ]

    resp["query"] = {"q": q}
    return resp
|