aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/query_parse.py
blob: 489a4f47e175ad5af551ae9a72ecd17d489ef9c9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

"""
This file contains helpers for pre-parsing and transforming search query
strings. See the "basic query parsing" proposal doc for original motivation and
design details.
"""

import shlex


def _clean_token(raw: str) -> str:
    """
    Clean up a single query token so that it can be included in a lucene
    query string.

    Surrounding whitespace is stripped first. Then:

    - an empty token becomes an empty quoted string (`""`)
    - a token with internal whitespace is returned as-is
    - a token which already contains a double quote (eg, a quoted phrase like
      `"N/A"`, or a field filter like `url:"a/b"`) is returned as-is; wrapping
      it in another pair of quotes would produce unbalanced quoting
    - a token containing a slash, or ending with a colon, is wrapped in
      double quotes
    - anything else is returned as-is
    """
    raw = raw.strip()
    if not raw:
        return '""'
    if '"' in raw or len(raw.split()) > 1:
        # already carries its own quoting, or spans several words: adding
        # quotes here could only break it
        return raw
    if "/" in raw or raw.endswith(":"):
        return '"{}"'.format(raw)
    return raw


def pre_parse_query(raw: str) -> str:
    """
    This method does some pre-parsing of raw query strings to prepare them for
    passing on as an elasticsearch query string query (which is really just
    the lucene query language).

    The query is split into whitespace-separated tokens (with quoted phrases
    kept together), each token is passed through `_clean_token()`, and the
    results are re-joined with single spaces.

    For example, it tries to handle trailing colons (could be interpreted as a
    field filter) and slashes in words.

    Queries containing a fuzzy/proximity phrase match (eg, `"foo bar"~4`) are
    returned unchanged. If the query can not be tokenized (eg, because of an
    unbalanced quote), it is split on plain whitespace instead of raising.
    """
    # if there is a fuzzy match, skip parse attempt
    # TODO: can we configure shlex to handle this?
    if '"~' in raw:
        return raw
    # non-POSIX mode keeps the quote characters as part of each token
    lex = shlex.shlex(raw, posix=False)
    # no comment characters: '#' is ordinary query text
    lex.commenters = ""
    lex.whitespace_split = True
    try:
        raw_tokens = list(lex)
    except ValueError:
        # shlex raises ValueError when a quote is never closed; this is user
        # input, so degrade to naive whitespace splitting
        raw_tokens = raw.split()
    return " ".join(_clean_token(token) for token in raw_tokens)


def test_pre_parse_query() -> None:
    """
    Run pre_parse_query() over a table of raw queries and check each result
    against the expected pre-parsed string.
    """
    # (raw query, expected output) pairs, checked in order
    cases = [
        ("blah blah blah", "blah blah blah"),
        ("is_oa:", '"is_oa:"'),
        ("is_oa: ", '"is_oa:"'),
        ("is_oa:1", "is_oa:1"),
        ("is_oa:*", "is_oa:*"),
        ("<xml>", "<xml>"),
        (r"""some $\LaTeX$""", r"some $\LaTeX$"),
        ("N/A", '"N/A"'),
        ("a/B thing", '"a/B" thing'),
        ('"a/B thing"', '"a/B thing"'),
        ("Krämer", "Krämer"),
        (
            "this (is my) paper: here are the results",
            'this (is my) "paper:" here are the results',
        ),
        (
            '"hello world" computing type:book',
            '"hello world" computing type:book',
        ),
        (
            '"hello world" computing type:"chapter thing"',
            '"hello world" computing type:"chapter thing"',
        ),
        ('"foo bar"~4', '"foo bar"~4'),
        (
            '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)',
            '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)',
        ),
        (
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
        ),
        (
            'status:[400 TO 499] AND (extension:php OR extension:html)',
            'status:[400 TO 499] AND (extension:php OR extension:html)',
        ),
    ]
    for query, expected in cases:
        assert pre_parse_query(query) == expected