1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
"""
Helpers to make elasticsearch queries.
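TODO: switch to using elasticsearch-dsl library instead of requests+json;
there is already a WIP branch for this in the fatcat repo.
NOTE(review): this module already imports and uses elasticsearch_dsl, so the
TODO above may be stale -- confirm and remove.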
"""
import json
import datetime
from flask import abort
from fatcat_covid19.webface import app
import elasticsearch
from elasticsearch_dsl import Search, Q
def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
    """
    Execute a paginated search and return the hits as plain python dicts.

    Args:
        search: query object that supports slicing and ``.execute()``; the
            callers in this module build it with ``elasticsearch_dsl.Search``.
        limit: maximum number of hits to request; clamped to ``0..100``.
        offset: index of the first hit; clamped to ``0..deep_page_limit``.
        deep_page_limit: largest offset that will be sent to elasticsearch.

    Returns:
        dict with keys ``count_returned``, ``count_found``, ``results``
        (list of hit dicts, each with an added ``_highlights`` list),
        ``offset``, ``limit``, ``deep_page_limit`` and ``query_time_ms``.
        ``offset`` and ``limit`` are the clamped values actually used.

    Raises:
        Does not return on elasticsearch errors: ``flask.abort`` is called
        with the status code carried by the exception (503 when that status
        code is not an integer).
    """
    # Sanity checks
    limit = int(limit)
    offset = int(offset)
    if limit > 100:
        limit = 100
    if limit < 0:
        # a negative limit would produce a negative slice stop below
        limit = 0
    if offset < 0:
        offset = 0
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    start = int(offset)
    search = search[start:start + limit]

    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e:
        # this is a "user" error (eg, query string failed to parse)
        print("elasticsearch 400: " + str(e.info))
        abort(e.status_code)
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        print("elasticsearch non-200 status code: {}".format(e.info))
        # `flash` is not imported in this module (and the handler above has
        # its flash() call disabled), so hand the message to abort() as the
        # error description instead of calling an undefined name.
        status = e.status_code
        if not isinstance(status, int):
            # NOTE(review): some transport failures (eg, connection errors)
            # appear to carry a non-integer status code, which abort() can
            # not map to an HTTP error; report "service unavailable".
            status = 503
        abort(status, "Elasticsearch error: {}".format(e.error))

    # convert from objects to python dicts
    results = []
    for hit in resp:
        record = hit._d_
        # flatten the per-field highlight fragments into a single list
        record['_highlights'] = []
        if hasattr(hit.meta, 'highlight'):
            for fragments in hit.meta.highlight._d_.values():
                record['_highlights'] += fragments
        results.append(record)

    for record in results:
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key, value in record.items():
            if isinstance(value, str):
                record[key] = value.encode('utf8', 'ignore').decode('utf8')
        # highlight fragments are excerpts of the same fields, so they get
        # the same treatment
        record['_highlights'] = [
            frag.encode('utf8', 'ignore').decode('utf8') if isinstance(frag, str) else frag
            for frag in record['_highlights']
        ]

    return {
        "count_returned": len(results),
        "count_found": int(resp.hits.total),
        "results": results,
        "offset": offset,
        "limit": limit,
        "deep_page_limit": deep_page_limit,
        "query_time_ms": int(resp.took),
    }
def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
    """
    Run a fulltext query against the configured fulltext index.

    Args:
        q: raw query string. A single token that starts with ``10.`` and
            contains a ``/`` is rewritten to a quoted ``doi:"..."`` query.
        limit: maximum number of hits, forwarded to
            ``generic_search_execute``.
        offset: index of the first hit, forwarded to
            ``generic_search_execute``.
        filter_time: one of ``"past_week"``, ``"this_year"``,
            ``"since_2000"``, ``"before_1925"``, ``"all_time"`` or ``None``.
        filter_type: one of ``"papers"``, ``"reports"``, ``"datasets"``,
            ``"everything"`` or ``None``.

    Returns:
        The dict produced by ``generic_search_execute`` with an added
        ``"query"`` entry (``{"q": q}``); every result has ``contrib_names``
        normalized to a list of strings.

    Raises:
        Calls ``flask.abort(400)`` for an unrecognized ``filter_time`` or
        ``filter_type``.
    """
    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_FULLTEXT_INDEX'])

    # type filters
    if filter_type == "papers":
        search = search.filter("terms", release_type=["article-journal", "paper-conference", "chapter"])
    elif filter_type == "reports":
        search = search.filter("terms", release_type=["report", "standard"])
    elif filter_type == "datasets":
        search = search.filter("terms", release_type=["dataset", "software"])
    elif filter_type == "everything" or filter_type is None:
        pass
    else:
        abort(400)

    # time filters
    if filter_time == "past_week":
        week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
        search = search.filter("range", release_date=dict(gte=week_ago_date))
    elif filter_time == "this_year":
        search = search.filter("term", release_year=datetime.date.today().year)
    elif filter_time == "since_2000":
        search = search.filter("range", release_year=dict(gte=2000))
    elif filter_time == "before_1925":
        search = search.filter("range", release_year=dict(lte=1924))
    elif filter_time == "all_time" or filter_time is None:
        pass
    else:
        abort(400)

    search = search.query(
        'query_string',
        query=q,
        default_operator="AND",
        analyze_wildcard=True,
        lenient=True,
        fields=[
            "everything",
            "abstract",
            "fulltext.body",
            "fulltext.annex",
        ],
    )
    search = search.highlight(
        "abstract",
        "fulltext.body",
        "fulltext.annex",
        number_of_fragments=3,
        fragment_size=150,
    )

    # forward `limit` too; it used to be accepted here but silently ignored
    resp = generic_search_execute(search, limit=limit, offset=offset)

    for h in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string; a missing
        # or null field becomes an empty list instead of raising
        names = h.get('contrib_names')
        if names is None:
            names = []
        elif not isinstance(names, list):
            names = [names]
        # drop stray surrogates, same workaround as for top-level strings
        h['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8') for name in names
        ]

    resp["query"] = {"q": q}
    return resp
|