1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
"""
This file contains helpers for pre-parsing and transforming search query
strings. See the "basic query parsing" proposal doc for original motivation and
design details.
"""
import re
import shlex
def _clean_token(raw: str) -> str:
raw = raw.strip()
if not raw:
return f'"{raw}"'
if len(raw.split()) > 1:
# has whitespace, will get quoted
return raw
if '"' in raw:
# is quoted already
return raw
if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
return f'"{raw}"'
if raw.startswith("[") and raw.endswith("]"):
return f'"{raw}"'
if raw.startswith("{") and raw.endswith("}"):
return f'"{raw}"'
return raw
def pre_parse_query(raw: str) -> str:
    r"""
    This method does some pre-parsing of raw query strings to prepare them for
    passing on as an Elasticsearch "query string" query (which is really just
    the Lucene query language).

    Per Elasticsearch docs, the reserved characters are:

        + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /

    For example, it tries to handle trailing colons (could be interpreted as a
    field filter) and slashes in words, by wrapping the affected
    whitespace-separated tokens in double quotes.

    This is a best-effort transform: the query is returned completely
    unchanged if it contains a quoted fuzzy/proximity match (`"~`), or if it
    can not be tokenized at all (eg, an unterminated quote).
    """
    # if there is a fuzzy match, skip parse attempt
    # TODO: can we configure shlex to handle this?
    if '"~' in raw:
        return raw
    # non-POSIX mode keeps the quote characters as part of each token
    lex = shlex.shlex(raw, posix=False)
    # '#' should be treated as regular query text, not start a comment
    lex.commenters = ""
    # only split tokens on whitespace, not on punctuation
    lex.whitespace_split = True
    try:
        tokens = list(lex)
    except ValueError:
        # shlex gives up on an unterminated quote (eg, "rock 'n roll"); pass
        # the query through untouched instead of crashing on user input
        return raw
    return " ".join(_clean_token(token) for token in tokens)
def test_pre_parse_query() -> None:
    """
    Run pre_parse_query() over a table of (raw query, expected output) pairs.
    """
    cases = [
        # plain words and well-formed field filters pass through
        ("blah blah blah", "blah blah blah"),
        ("is_oa:", '"is_oa:"'),
        ("is_oa: ", '"is_oa:"'),
        ("is_oa:1", "is_oa:1"),
        ("is_oa:*", "is_oa:*"),
        ("<xml>", "<xml>"),
        (r"""some $\LaTeX$""", r"some $\LaTeX$"),
        # slashes get quoted, unless already inside a quoted phrase
        ("N/A", '"N/A"'),
        ("a/B thing", '"a/B" thing'),
        ('"a/B thing"', '"a/B thing"'),
        ("Krämer", "Krämer"),
        ('"10.1093/qjmed/os-14.56.398"', '"10.1093/qjmed/os-14.56.398"'),
        (
            "this (is my) paper: here are the results",
            'this (is my) "paper:" here are the results',
        ),
        # regular query syntax is left alone
        (
            '"hello world" computing type:book',
            '"hello world" computing type:book',
        ),
        (
            '"hello world" computing type:"chapter thing"',
            '"hello world" computing type:"chapter thing"',
        ),
        ('"foo bar"~4', '"foo bar"~4'),
        (
            "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)",
            "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)",
        ),
        (
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
        ),
        (
            "status:[400 TO 499] AND (extension:php OR extension:html)",
            "status:[400 TO 499] AND (extension:php OR extension:html)",
        ),
        # brackets, slashes, and trailing punctuation
        ("[embargoed]", '"[embargoed]"'),
        ("something 10.1002/eco.2061", 'something "10.1002/eco.2061"'),
        ("different wet/dry ratios", 'different "wet/dry" ratios'),
        ("kimchy!", '"kimchy!"'),
        ("kimchy?", '"kimchy?"'),
        ("Saul B/ Cohen", 'Saul "B/" Cohen'),
        ("Nobel / Nino", 'Nobel "/" Nino'),
    ]
    for query, expected in cases:
        assert pre_parse_query(query) == expected
def sniff_citation_query(raw: str) -> bool:
    """
    This function tries to categorize raw citation strings.

    It doesn't handle lookups detection (yet? refactor?)

    Returns True if the query looks like a bibliographic citation, and False
    if it looks like a regular query (eg, a title or keywords). A query is
    only considered a citation if it passes a few exclusion checks and then
    contains at least four distinct "kinds" of characters (digits, ASCII
    capitals, quotes, periods, commas, semicolons, parentheses).
    """
    # if short, not citation
    if len(raw) < 12 or len(raw.split()) < 6:
        return False
    # if single quoted string, not a citation
    if raw.count('"') == 2 and raw.startswith('"') and raw.endswith('"'):
        return False
    # if there is a filter query, boost, or fuzzy match, not a citation
    if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw):
        return False
    # numbers, years, page numbers, capitalization, quoted strings all increase
    # confidence that this is a citation, not just a title
    punctuation_kinds = {
        '"': "quote",
        "'": "quote",
        ".": "period",
        ",": "comma",
        ";": "semicolon",
        "(": "parens",
        ")": "parens",
    }
    kinds_seen = set()
    for c in raw:
        if c.isdigit():
            kinds_seen.add("digit")
        elif "A" <= c <= "Z":
            # only ASCII capitals are counted
            kinds_seen.add("capitalized")
        elif c in punctuation_kinds:
            kinds_seen.add(punctuation_kinds[c])
    return len(kinds_seen) >= 4
def test_sniff_citation_query() -> None:
    """
    Check sniff_citation_query() against example queries which should, and
    should not, be detected as citation strings.
    """
    not_citations = [
        "short",
        "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)",
        "DR. SCHAUDINN'S WORK ON BLOOD PARASITES",
        '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"',
    ]
    for raw in not_citations:
        assert sniff_citation_query(raw) is False

    citations = [
        '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444',
        'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.',
        "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252.",
        'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.',
        "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1.",
    ]
    for raw in citations:
        assert sniff_citation_query(raw) is True
|