diff options
| -rw-r--r-- | fatcat_scholar/query_parse.py | 113 | 
1 files changed, 106 insertions, 7 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py index 489a4f4..fd42c6e 100644 --- a/fatcat_scholar/query_parse.py +++ b/fatcat_scholar/query_parse.py @@ -1,10 +1,10 @@ -  """  This file contains helpers for pre-parsing and transforming search query  strings. See the "basic query parsing" proposal doc for original motivation and  design details.  """ +import re  import shlex @@ -15,17 +15,25 @@ def _clean_token(raw: str) -> str:      if len(raw.split()) > 1:          # has whitespace, will get quoted          return raw -    if "/" in raw or raw.endswith(":"): +    if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"): +        return '"{}"'.format(raw) +    if raw.startswith("[") and raw.endswith("]"): +        return '"{}"'.format(raw) +    if raw.startswith("{") and raw.endswith("}"):          return '"{}"'.format(raw)      return raw  def pre_parse_query(raw: str) -> str: -    """ +    r"""      This method does some pre-parsing of raw query strings to prepare them for      passing on as a elasticsearch query string query (which is really just the      lucene query language). +    Per Elasticsearch docs, the reserved characters are: + +        + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ / +      For exaple, it tries to handle trailing semi-colons (could be interpreted      as a field filter) and slashes in words.      """ @@ -34,7 +42,7 @@ def pre_parse_query(raw: str) -> str:      if '"~' in raw:          return raw      lex = shlex.shlex(raw, posix=False) -    lex.commenters = '' +    lex.commenters = ""      lex.whitespace_split = True      tokens = list(map(_clean_token, list(lex)))      print(list(tokens)) @@ -66,6 +74,97 @@ def test_pre_parse_query() -> None:          == '"hello world" computing type:"chapter thing"'      )      assert pre_parse_query('"foo bar"~4') == '"foo bar"~4' -    assert pre_parse_query('(title:foo OR title:bar)^1.5 (body:foo OR body:bar)') == '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)' -    assert pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') == '(title:"foo bar" AND body:"quick fox") OR title:fox' -    assert pre_parse_query('status:[400 TO 499] AND (extension:php OR extension:html)') == 'status:[400 TO 499] AND (extension:php OR extension:html)' +    assert ( +        pre_parse_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)") +        == "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)" +    ) +    assert ( +        pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') +        == '(title:"foo bar" AND body:"quick fox") OR title:fox' +    ) +    assert ( +        pre_parse_query("status:[400 TO 499] AND (extension:php OR extension:html)") +        == "status:[400 TO 499] AND (extension:php OR extension:html)" +    ) +    assert pre_parse_query("[embargoed]") == '"[embargoed]"' +    assert ( +        pre_parse_query("something 10.1002/eco.2061") == 'something "10.1002/eco.2061"' +    ) +    assert pre_parse_query("different wet/dry ratios") == 'different "wet/dry" ratios' +    assert pre_parse_query("kimchy!") == '"kimchy!"' +    assert pre_parse_query("kimchy?") == '"kimchy?"' +    assert pre_parse_query("Saul B/ Cohen") == 'Saul "B/" Cohen' +    assert pre_parse_query("Nobel / Nino") == 'Nobel "/" Nino' + + +def sniff_citation_query(raw: str) -> bool: +    """ +    This function tries to categorize raw citation strings. + +    It doesn't handle lookups detection (yet? refactor?) +    """ +    # if short, not citation +    if len(raw) < 12 or len(raw.split()) < 6: +        return False + +    # if there is a filter query, boost, or fuzzy match, not a citation +    if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw): +        return False + +    # numbers, years, page numbers, capitalization, quoted strings all increase +    # confidence that this is a citation, not just a title +    char_types = dict() +    for c in raw: +        if c.isdigit(): +            char_types["digit"] = True +        elif c >= "A" and c <= "Z": +            char_types["capitalized"] = True +        elif c == '"' or c == "'": +            char_types["quote"] = True +        elif c == ".": +            char_types["period"] = True +        elif c == ",": +            char_types["comma"] = True + +        if len(char_types) > 2: +            return True + +    return False + + +def test_sniff_citation_query() -> None: +    assert sniff_citation_query("short") is False +    assert ( +        sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)") +        is False +    ) +    assert ( +        sniff_citation_query( +            '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444' +        ) +        is True +    ) +    assert ( +        sniff_citation_query( +            'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.' +        ) +        is True +    ) +    assert ( +        sniff_citation_query( +            "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252." +        ) +        is True +    ) +    assert ( +        sniff_citation_query( +            'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.' +        ) +        is True +    ) +    assert ( +        sniff_citation_query( +            "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1." +        ) +        is True +    )  | 
