aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-01-18 23:49:14 -0800
committerBryan Newbold <bnewbold@archive.org>2021-01-19 19:49:04 -0800
commitb938dc6ba8c354d483d96988dc057565db91ca38 (patch)
treef6fbcc921f8a195d7981236e5b7446edefd21a2c
parent4249da27c244406291133453bf209057e29aacef (diff)
downloadfatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.tar.gz
fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.zip
query parsing: sniff for citations; more corner cases
-rw-r--r--fatcat_scholar/query_parse.py113
1 files changed, 106 insertions, 7 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 489a4f4..fd42c6e 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -1,10 +1,10 @@
-
"""
This file contains helpers for pre-parsing and transforming search query
strings. See the "basic query parsing" proposal doc for original motivation and
design details.
"""
+import re
import shlex
@@ -15,17 +15,25 @@ def _clean_token(raw: str) -> str:
if len(raw.split()) > 1:
# has whitespace, will get quoted
return raw
- if "/" in raw or raw.endswith(":"):
+ if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
+ return '"{}"'.format(raw)
+ if raw.startswith("[") and raw.endswith("]"):
+ return '"{}"'.format(raw)
+ if raw.startswith("{") and raw.endswith("}"):
return '"{}"'.format(raw)
return raw
def pre_parse_query(raw: str) -> str:
- """
+ r"""
This method does some pre-parsing of raw query strings to prepare them for
passing on as an Elasticsearch query string query (which is really just the
Lucene query language).
+ Per Elasticsearch docs, the reserved characters are:
+
+ + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /
+
For example, it tries to handle trailing colons (could be interpreted
as a field filter) and slashes in words.
"""
@@ -34,7 +42,7 @@ def pre_parse_query(raw: str) -> str:
if '"~' in raw:
return raw
lex = shlex.shlex(raw, posix=False)
- lex.commenters = ''
+ lex.commenters = ""
lex.whitespace_split = True
tokens = list(map(_clean_token, list(lex)))
print(list(tokens))
@@ -66,6 +74,97 @@ def test_pre_parse_query() -> None:
== '"hello world" computing type:"chapter thing"'
)
assert pre_parse_query('"foo bar"~4') == '"foo bar"~4'
- assert pre_parse_query('(title:foo OR title:bar)^1.5 (body:foo OR body:bar)') == '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)'
- assert pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') == '(title:"foo bar" AND body:"quick fox") OR title:fox'
- assert pre_parse_query('status:[400 TO 499] AND (extension:php OR extension:html)') == 'status:[400 TO 499] AND (extension:php OR extension:html)'
+ assert (
+ pre_parse_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
+ == "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)"
+ )
+ assert (
+ pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox')
+ == '(title:"foo bar" AND body:"quick fox") OR title:fox'
+ )
+ assert (
+ pre_parse_query("status:[400 TO 499] AND (extension:php OR extension:html)")
+ == "status:[400 TO 499] AND (extension:php OR extension:html)"
+ )
+ assert pre_parse_query("[embargoed]") == '"[embargoed]"'
+ assert (
+ pre_parse_query("something 10.1002/eco.2061") == 'something "10.1002/eco.2061"'
+ )
+ assert pre_parse_query("different wet/dry ratios") == 'different "wet/dry" ratios'
+ assert pre_parse_query("kimchy!") == '"kimchy!"'
+ assert pre_parse_query("kimchy?") == '"kimchy?"'
+ assert pre_parse_query("Saul B/ Cohen") == 'Saul "B/" Cohen'
+ assert pre_parse_query("Nobel / Nino") == 'Nobel "/" Nino'
+
+
+def sniff_citation_query(raw: str) -> bool:
+ """
+ This function tries to detect whether a raw query string is a citation.
+
+ It doesn't handle lookup detection (yet? refactor?)
+ """
+ # if short, not citation
+ if len(raw) < 12 or len(raw.split()) < 6:
+ return False
+
+ # if there is a filter query, boost, or fuzzy match, not a citation
+ if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw):
+ return False
+
+ # numbers, years, page numbers, capitalization, quoted strings all increase
+ # confidence that this is a citation, not just a title
+ char_types = dict()
+ for c in raw:
+ if c.isdigit():
+ char_types["digit"] = True
+ elif c >= "A" and c <= "Z":
+ char_types["capitalized"] = True
+ elif c == '"' or c == "'":
+ char_types["quote"] = True
+ elif c == ".":
+ char_types["period"] = True
+ elif c == ",":
+ char_types["comma"] = True
+
+ if len(char_types) > 2:
+ return True
+
+ return False
+
+
+def test_sniff_citation_query() -> None:
+ assert sniff_citation_query("short") is False
+ assert (
+ sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
+ is False
+ )
+ assert (
+ sniff_citation_query(
+ '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444'
+ )
+ is True
+ )
+ assert (
+ sniff_citation_query(
+ 'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.'
+ )
+ is True
+ )
+ assert (
+ sniff_citation_query(
+ "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252."
+ )
+ is True
+ )
+ assert (
+ sniff_citation_query(
+ 'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.'
+ )
+ is True
+ )
+ assert (
+ sniff_citation_query(
+ "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1."
+ )
+ is True
+ )