query parsing: sniff for citations; more corner cases

author: Bryan Newbold <bnewbold@archive.org> 2021-01-18 23:49:14 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-01-19 19:49:04 -0800
commit: b938dc6ba8c354d483d96988dc057565db91ca38 (patch)
tree: f6fbcc921f8a195d7981236e5b7446edefd21a2c
parent: 4249da27c244406291133453bf209057e29aacef (diff)
download: fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.tar.gz
fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.zip
1 files changed, 106 insertions, 7 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 489a4f4..fd42c6e 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -1,10 +1,10 @@
-
 """
 This file contains helpers for pre-parsing and transforming search query
 strings. See the "basic query parsing" proposal doc for original motivation and
 design details.
 """
 
+import re
 import shlex
 
 
@@ -15,17 +15,25 @@ def _clean_token(raw: str) -> str:
     if len(raw.split()) > 1:
         # has whitespace, will get quoted
         return raw
-    if "/" in raw or raw.endswith(":"):
+    if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
+        return '"{}"'.format(raw)
+    if raw.startswith("[") and raw.endswith("]"):
+        return '"{}"'.format(raw)
+    if raw.startswith("{") and raw.endswith("}"):
         return '"{}"'.format(raw)
     return raw
 
 
 def pre_parse_query(raw: str) -> str:
-    """
+    r"""
     This method does some pre-parsing of raw query strings to prepare them for
     passing on as a elasticsearch query string query (which is really just the
     lucene query language).
 
+    Per Elasticsearch docs, the reserved characters are:
+
+        + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ /
+
     For exaple, it tries to handle trailing semi-colons (could be interpreted
     as a field filter) and slashes in words.
     """
@@ -34,7 +42,7 @@ def pre_parse_query(raw: str) -> str:
     if '"~' in raw:
         return raw
     lex = shlex.shlex(raw, posix=False)
-    lex.commenters = ''
+    lex.commenters = ""
     lex.whitespace_split = True
     tokens = list(map(_clean_token, list(lex)))
     print(list(tokens))
@@ -66,6 +74,97 @@ def test_pre_parse_query() -> None:
         == '"hello world" computing type:"chapter thing"'
     )
     assert pre_parse_query('"foo bar"~4') == '"foo bar"~4'
-    assert pre_parse_query('(title:foo OR title:bar)^1.5 (body:foo OR body:bar)') == '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)'
-    assert pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') == '(title:"foo bar" AND body:"quick fox") OR title:fox'
-    assert pre_parse_query('status:[400 TO 499] AND (extension:php OR extension:html)') == 'status:[400 TO 499] AND (extension:php OR extension:html)'
+    assert (
+        pre_parse_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
+        == "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)"
+    )
+    assert (
+        pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox')
+        == '(title:"foo bar" AND body:"quick fox") OR title:fox'
+    )
+    assert (
+        pre_parse_query("status:[400 TO 499] AND (extension:php OR extension:html)")
+        == "status:[400 TO 499] AND (extension:php OR extension:html)"
+    )
+    assert pre_parse_query("[embargoed]") == '"[embargoed]"'
+    assert (
+        pre_parse_query("something 10.1002/eco.2061") == 'something "10.1002/eco.2061"'
+    )
+    assert pre_parse_query("different wet/dry ratios") == 'different "wet/dry" ratios'
+    assert pre_parse_query("kimchy!") == '"kimchy!"'
+    assert pre_parse_query("kimchy?") == '"kimchy?"'
+    assert pre_parse_query("Saul B/ Cohen") == 'Saul "B/" Cohen'
+    assert pre_parse_query("Nobel / Nino") == 'Nobel "/" Nino'
+
+
+def sniff_citation_query(raw: str) -> bool:
+    """
+    Heuristically guesses whether a raw query string is a pasted citation.
+    Returns True for longer, non-filter queries mixing 3+ character classes.
+    It doesn't handle lookup detection (yet? refactor?)
+    """
+    # if short (under 12 characters or under 6 whitespace-separated words), not citation
+    if len(raw) < 12 or len(raw.split()) < 6:
+        return False
+
+    # if there is a filter query (x:y), boost (^N), or fuzzy match (~N), not a citation
+    if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw):
+        return False
+
+    # numbers, years, page numbers, capitalization, quoted strings all increase
+    # confidence that this is a citation, not just a title
+    char_types = dict()  # used as a set: keys are the character classes seen so far
+    for c in raw:
+        if c.isdigit():
+            char_types["digit"] = True
+        elif c >= "A" and c <= "Z":  # ASCII capitals only
+            char_types["capitalized"] = True
+        elif c == '"' or c == "'":
+            char_types["quote"] = True
+        elif c == ".":
+            char_types["period"] = True
+        elif c == ",":
+            char_types["comma"] = True
+
+        if len(char_types) > 2:  # three distinct classes seen: stop early, call it a citation
+            return True
+
+    return False
+
+
+def test_sniff_citation_query() -> None:  # exercises sniff_citation_query on non-citations, then citations
+    assert sniff_citation_query("short") is False  # under the length threshold
+    assert (  # long enough, but uses filter/boost query syntax
+        sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
+        is False
+    )
+    assert (  # quoted title, then journal, year, and page range
+        sniff_citation_query(
+            '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444'
+        )
+        is True
+    )
+    assert (  # author, quoted title, journal, volume.issue (year): pages
+        sniff_citation_query(
+            'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.'
+        )
+        is True
+    )
+    assert (  # same reference as the previous case, in an unquoted style
+        sniff_citation_query(
+            "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252."
+        )
+        is True
+    )
+    assert (  # another quoted-title citation ending in a single page number
+        sniff_citation_query(
+            'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.'
+        )
+        is True
+    )
+    assert (  # unquoted citation with a "?" inside the title
+        sniff_citation_query(
+            "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1."
+        )
+        is True
+    )
author	Bryan Newbold <bnewbold@archive.org>	2021-01-18 23:49:14 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-01-19 19:49:04 -0800
commit	b938dc6ba8c354d483d96988dc057565db91ca38 (patch)
tree	f6fbcc921f8a195d7981236e5b7446edefd21a2c
parent	4249da27c244406291133453bf209057e29aacef (diff)
download	fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.tar.gz fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.zip