aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-01-18 19:52:55 -0800
committerBryan Newbold <bnewbold@archive.org>2021-01-19 19:49:04 -0800
commit433a7b30131cf88f474723edebea8f383fd0f4f0 (patch)
tree686d27b80a2b62e70b32d71169cce737eb6bb5b4
parent0adb490ae2ba8f961bac559a981f89d6d264af60 (diff)
downloadfatcat-scholar-433a7b30131cf88f474723edebea8f383fd0f4f0.tar.gz
fatcat-scholar-433a7b30131cf88f474723edebea8f383fd0f4f0.zip
initial crude query parsing routines
-rw-r--r--fatcat_scholar/query_parse.py71
1 files changed, 71 insertions, 0 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
new file mode 100644
index 0000000..489a4f4
--- /dev/null
+++ b/fatcat_scholar/query_parse.py
@@ -0,0 +1,71 @@
+
+"""
+This file contains helpers for pre-parsing and transforming search query
+strings. See the "basic query parsing" proposal doc for original motivation and
+design details.
+"""
+
+import shlex
+
+
+def _clean_token(raw: str) -> str:
+ raw = raw.strip()
+ if not raw:
+ return '"{}"'.format(raw)
+ if len(raw.split()) > 1:
+ # has whitespace, will get quoted
+ return raw
+ if "/" in raw or raw.endswith(":"):
+ return '"{}"'.format(raw)
+ return raw
+
+
def pre_parse_query(raw: str) -> str:
    """
    This method does some pre-parsing of raw query strings to prepare them for
    passing on as an elasticsearch query string query (which is really just
    the lucene query language).

    The query is split into whitespace-separated tokens (a quoted phrase at
    the start of a token is kept together), each token is passed through
    `_clean_token()`, and the results are re-joined with single spaces.

    For example, it tries to handle trailing colons (could be interpreted as a
    field filter) and slashes in words.

    Queries containing a fuzzy/proximity phrase match (`"..."~N`) are returned
    unchanged, as are queries which can not be tokenized (eg, because of
    unbalanced quotes).
    """
    # if there is a fuzzy match, skip parse attempt
    # TODO: can we configure shlex to handle this?
    if '"~' in raw:
        return raw
    lex = shlex.shlex(raw, posix=False)
    # '#' is ordinary text in a search query, not the start of a comment
    lex.commenters = ""
    lex.whitespace_split = True
    try:
        raw_tokens = list(lex)
    except ValueError:
        # shlex raises ValueError ("No closing quotation") on unbalanced
        # quotes; pass the query through untouched instead of crashing
        return raw
    return " ".join(_clean_token(tok) for tok in raw_tokens)
+
+
def test_pre_parse_query() -> None:
    """
    Run `pre_parse_query()` over a table of (raw query, expected output) pairs
    covering plain words, trailing colons, slashes, quoted phrases, and lucene
    syntax which is expected to come back unchanged.
    """
    cases = [
        ("blah blah blah", "blah blah blah"),
        ("is_oa:", '"is_oa:"'),
        ("is_oa: ", '"is_oa:"'),
        ("is_oa:1", "is_oa:1"),
        ("is_oa:*", "is_oa:*"),
        ("<xml>", "<xml>"),
        (r"""some $\LaTeX$""", r"some $\LaTeX$"),
        ("N/A", '"N/A"'),
        ("a/B thing", '"a/B" thing'),
        ('"a/B thing"', '"a/B thing"'),
        ("Krämer", "Krämer"),
        (
            "this (is my) paper: here are the results",
            'this (is my) "paper:" here are the results',
        ),
        (
            '"hello world" computing type:book',
            '"hello world" computing type:book',
        ),
        (
            '"hello world" computing type:"chapter thing"',
            '"hello world" computing type:"chapter thing"',
        ),
        ('"foo bar"~4', '"foo bar"~4'),
        (
            '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)',
            '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)',
        ),
        (
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
            '(title:"foo bar" AND body:"quick fox") OR title:fox',
        ),
        (
            'status:[400 TO 499] AND (extension:php OR extension:html)',
            'status:[400 TO 499] AND (extension:php OR extension:html)',
        ),
    ]
    for query, expected in cases:
        assert pre_parse_query(query) == expected