summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/query_parse.py
blob: 8c49925ebfa5a390b3eeafe78f58463285bcdf5c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
This file contains helpers for pre-parsing and transforming search query
strings. See the "basic query parsing" proposal doc for original motivation and
design details.
"""

import re
import shlex


def _clean_token(raw: str) -> str:
    raw = raw.strip()
    if not raw:
        return f'"{raw}"'
    if len(raw.split()) > 1:
        # has whitespace, will get quoted
        return raw
    if '"' in raw:
        # is quoted already
        return raw
    if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
        return f'"{raw}"'
    if raw.startswith("[") and raw.endswith("]"):
        return f'"{raw}"'
    if raw.startswith("{") and raw.endswith("}"):
        return f'"{raw}"'
    return raw


def pre_parse_query(raw: str) -> str:
    r"""
    Pre-processes a raw query string before it is handed on as an
    elasticsearch "query string" query (which is really just the lucene query
    language).

    Per Elasticsearch docs, the reserved characters are:

        + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /

    The string is split into whitespace-delimited tokens (quoted phrases stay
    together), each token is passed through `_clean_token`, and the results
    are re-joined with single spaces. For example, a word with a trailing
    colon (which could be interpreted as a field filter) or a word containing
    a slash comes back wrapped in double quotes.

    Queries containing a fuzzy/proximity phrase marker (`"~`) are returned
    unmodified.

    Tokenizing is done with `shlex`, and any ValueError it raises (eg, for a
    token that opens with a quote character which is never closed) is not
    caught here.
    """
    # if there is a fuzzy match, skip parse attempt
    # TODO: can we configure shlex to handle this?
    if '"~' in raw:
        return raw

    lexer = shlex.shlex(raw, posix=False)
    # split on whitespace only, and do not treat any character as starting a
    # comment
    lexer.whitespace_split = True
    lexer.commenters = ""
    return " ".join(_clean_token(token) for token in lexer)


def test_pre_parse_query() -> None:
    """
    Checks `pre_parse_query` against a table of (raw query, expected output)
    pairs.
    """
    cases = [
        # plain words and field filters
        ("blah blah blah", "blah blah blah"),
        ("is_oa:", '"is_oa:"'),
        ("is_oa: ", '"is_oa:"'),
        ("is_oa:1", "is_oa:1"),
        ("is_oa:*", "is_oa:*"),
        ("<xml>", "<xml>"),
        (r"""some $\LaTeX$""", r"some $\LaTeX$"),
        # slashes
        ("N/A", '"N/A"'),
        ("a/B thing", '"a/B" thing'),
        ('"a/B thing"', '"a/B thing"'),
        ("Krämer", "Krämer"),
        ('"10.1093/qjmed/os-14.56.398"', '"10.1093/qjmed/os-14.56.398"'),
        # trailing colon in the middle of a sentence
        (
            "this (is my) paper: here are the results",
            'this (is my) "paper:" here are the results',
        ),
        # quoted phrases combined with filters
        (
            '"hello world" computing type:book',
            '"hello world" computing type:book',
        ),
        (
            '"hello world" computing type:"chapter thing"',
            '"hello world" computing type:"chapter thing"',
        ),
        # fuzzy match, boosts, boolean operators, and ranges
        ('"foo bar"~4', '"foo bar"~4'),
        (
            "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)",
            "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)",
        ),
        (
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
        ),
        (
            "status:[400 TO 499] AND (extension:php OR extension:html)",
            "status:[400 TO 499] AND (extension:php OR extension:html)",
        ),
        # bracketed words, identifiers, and trailing punctuation
        ("[embargoed]", '"[embargoed]"'),
        ("something 10.1002/eco.2061", 'something "10.1002/eco.2061"'),
        ("different wet/dry ratios", 'different "wet/dry" ratios'),
        ("kimchy!", '"kimchy!"'),
        ("kimchy?", '"kimchy?"'),
        ("Saul B/ Cohen", 'Saul "B/" Cohen'),
        ("Nobel / Nino", 'Nobel "/" Nino'),
    ]
    for query, expected in cases:
        assert pre_parse_query(query) == expected


def sniff_citation_query(raw: str) -> bool:
    """
    Guesses whether a raw query string is a pasted citation (True) or a
    regular search query (False).

    The query is rejected as a citation if it is short (under 12 characters
    or under 6 whitespace-separated words), if it is a single double-quoted
    string, or if it looks like it uses query syntax (field filter, boost, or
    fuzzy match). Otherwise it counts as a citation once at least four
    distinct kinds of "citation-like" characters have been seen: digits,
    ASCII capital letters, quotes, periods, commas, semicolons, parentheses.

    Identifier lookup detection is not handled here.
    """
    # if short, not citation
    too_short = len(raw) < 12 or len(raw.split()) < 6
    if too_short:
        return False

    # if single quoted string, not a citation
    if raw.startswith('"') and raw.endswith('"') and raw.count('"') == 2:
        return False

    # if there is a filter query, boost, or fuzzy match, not a citation
    if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw) is not None:
        return False

    # numbers, years, page numbers, capitalization, quoted strings all increase
    # confidence that this is a citation, not just a title
    punctuation_kinds = {
        '"': "quote",
        "'": "quote",
        ".": "period",
        ",": "comma",
        ";": "semicolon",
        "(": "parens",
        ")": "parens",
    }
    seen_kinds = set()
    for char in raw:
        if char.isdigit():
            seen_kinds.add("digit")
        elif "A" <= char <= "Z":
            seen_kinds.add("capitalized")
        else:
            kind = punctuation_kinds.get(char)
            if kind is not None:
                seen_kinds.add(kind)

        # stop scanning as soon as there is enough evidence
        if len(seen_kinds) >= 4:
            return True

    return False


def test_sniff_citation_query() -> None:
    """
    Checks `sniff_citation_query` against a table of (raw query, expected
    verdict) pairs.
    """
    cases = [
        # too short, query syntax, or just a (possibly quoted) title
        ("short", False),
        ("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)", False),
        ("DR. SCHAUDINN'S WORK ON BLOOD PARASITES", False),
        ('"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"', False),
        # full citation strings in a handful of formats
        (
            '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444',
            True,
        ),
        (
            'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.',
            True,
        ),
        (
            "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252.",
            True,
        ),
        (
            'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.',
            True,
        ),
        (
            "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1.",
            True,
        ),
    ]
    for query, expected in cases:
        assert sniff_citation_query(query) is expected