diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-18 23:49:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 19:49:04 -0800 |
commit | b938dc6ba8c354d483d96988dc057565db91ca38 (patch) | |
tree | f6fbcc921f8a195d7981236e5b7446edefd21a2c | |
parent | 4249da27c244406291133453bf209057e29aacef (diff) | |
download | fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.tar.gz fatcat-scholar-b938dc6ba8c354d483d96988dc057565db91ca38.zip |
query parsing: sniff for citations; more corner cases
-rw-r--r-- | fatcat_scholar/query_parse.py | 113 |
1 file changed, 106 insertions, 7 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py index 489a4f4..fd42c6e 100644 --- a/fatcat_scholar/query_parse.py +++ b/fatcat_scholar/query_parse.py @@ -1,10 +1,10 @@ - """ This file contains helpers for pre-parsing and transforming search query strings. See the "basic query parsing" proposal doc for original motivation and design details. """ +import re import shlex @@ -15,17 +15,25 @@ def _clean_token(raw: str) -> str: if len(raw.split()) > 1: # has whitespace, will get quoted return raw - if "/" in raw or raw.endswith(":"): + if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"): + return '"{}"'.format(raw) + if raw.startswith("[") and raw.endswith("]"): + return '"{}"'.format(raw) + if raw.startswith("{") and raw.endswith("}"): return '"{}"'.format(raw) return raw def pre_parse_query(raw: str) -> str: - """ + r""" This method does some pre-parsing of raw query strings to prepare them for passing on as a elasticsearch query string query (which is really just the lucene query language). + Per Elasticsearch docs, the reserved characters are: + + + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ / + For exaple, it tries to handle trailing semi-colons (could be interpreted as a field filter) and slashes in words. 
""" @@ -34,7 +42,7 @@ def pre_parse_query(raw: str) -> str: if '"~' in raw: return raw lex = shlex.shlex(raw, posix=False) - lex.commenters = '' + lex.commenters = "" lex.whitespace_split = True tokens = list(map(_clean_token, list(lex))) print(list(tokens)) @@ -66,6 +74,97 @@ def test_pre_parse_query() -> None: == '"hello world" computing type:"chapter thing"' ) assert pre_parse_query('"foo bar"~4') == '"foo bar"~4' - assert pre_parse_query('(title:foo OR title:bar)^1.5 (body:foo OR body:bar)') == '(title:foo OR title:bar)^1.5 (body:foo OR body:bar)' - assert pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') == '(title:"foo bar" AND body:"quick fox") OR title:fox' - assert pre_parse_query('status:[400 TO 499] AND (extension:php OR extension:html)') == 'status:[400 TO 499] AND (extension:php OR extension:html)' + assert ( + pre_parse_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)") + == "(title:foo OR title:bar)^1.5 (body:foo OR body:bar)" + ) + assert ( + pre_parse_query('(title:"foo bar" AND body:"quick fox") OR title:fox') + == '(title:"foo bar" AND body:"quick fox") OR title:fox' + ) + assert ( + pre_parse_query("status:[400 TO 499] AND (extension:php OR extension:html)") + == "status:[400 TO 499] AND (extension:php OR extension:html)" + ) + assert pre_parse_query("[embargoed]") == '"[embargoed]"' + assert ( + pre_parse_query("something 10.1002/eco.2061") == 'something "10.1002/eco.2061"' + ) + assert pre_parse_query("different wet/dry ratios") == 'different "wet/dry" ratios' + assert pre_parse_query("kimchy!") == '"kimchy!"' + assert pre_parse_query("kimchy?") == '"kimchy?"' + assert pre_parse_query("Saul B/ Cohen") == 'Saul "B/" Cohen' + assert pre_parse_query("Nobel / Nino") == 'Nobel "/" Nino' + + +def sniff_citation_query(raw: str) -> bool: + """ + This function tries to categorize raw citation strings. + + It doesn't handle lookups detection (yet? refactor?) 
+ """ + # if short, not citation + if len(raw) < 12 or len(raw.split()) < 6: + return False + + # if there is a filter query, boost, or fuzzy match, not a citation + if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw): + return False + + # numbers, years, page numbers, capitalization, quoted strings all increase + # confidence that this is a citation, not just a title + char_types = dict() + for c in raw: + if c.isdigit(): + char_types["digit"] = True + elif c >= "A" and c <= "Z": + char_types["capitalized"] = True + elif c == '"' or c == "'": + char_types["quote"] = True + elif c == ".": + char_types["period"] = True + elif c == ",": + char_types["comma"] = True + + if len(char_types) > 2: + return True + + return False + + +def test_sniff_citation_query() -> None: + assert sniff_citation_query("short") is False + assert ( + sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)") + is False + ) + assert ( + sniff_citation_query( + '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444' + ) + is True + ) + assert ( + sniff_citation_query( + 'Peskin, Charles S. "Numerical analysis of blood flow in the heart." Journal of computational physics 25.3 (1977): 220-252.' + ) + is True + ) + assert ( + sniff_citation_query( + "Peskin, C.S., 1977. Numerical analysis of blood flow in the heart. Journal of computational physics, 25(3), pp.220-252." + ) + is True + ) + assert ( + sniff_citation_query( + 'Page, Don N. "Information in black hole radiation." Physical review letters 71.23 (1993): 3743.' + ) + is True + ) + assert ( + sniff_citation_query( + "Hawking SW. Black hole explosions?. Nature. 1974 Mar;248(5443):30-1." + ) + is True + ) |