diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 01:06:37 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 19:49:04 -0800 |
commit | aff6473144542e24c0aaa66c514c210eb83bf8a8 (patch) | |
tree | 0131cdb13a2bd0e5c9edc8f7359233195a8b0432 /fatcat_scholar/query_parse.py | |
parent | 89e9149b27263d6128be449c7b12b025cc031292 (diff) | |
download | fatcat-scholar-aff6473144542e24c0aaa66c514c210eb83bf8a8.tar.gz fatcat-scholar-aff6473144542e24c0aaa66c514c210eb83bf8a8.zip |
parse: tweak citation sniff routine
Diffstat (limited to 'fatcat_scholar/query_parse.py')
-rw-r--r-- | fatcat_scholar/query_parse.py | 14 |
1 file changed, 11 insertions, 3 deletions
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 0aae066..0f5086b 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -114,6 +114,10 @@ def sniff_citation_query(raw: str) -> bool:
     if len(raw) < 12 or len(raw.split()) < 6:
         return False
 
+    # if single quoted string, not a citation
+    if raw.count('"') == 2 and raw.startswith('"') and raw.endswith('"'):
+        return False
+
     # if there is a filter query, boost, or fuzzy match, not a citation
     if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw):
         return False
@@ -132,8 +136,12 @@ def sniff_citation_query(raw: str) -> bool:
             char_types["period"] = True
         elif c == ",":
             char_types["comma"] = True
+        elif c == ";":
+            char_types["semicolon"] = True
+        elif c == "(" or c == ")":
+            char_types["parens"] = True
 
-    if len(char_types) > 2:
+    if len(char_types) >= 4:
         return True
 
     return False
@@ -145,8 +153,8 @@ def test_sniff_citation_query() -> None:
         sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
         is False
     )
-    assert sniff_citation_query("DR. SCHAUDINN'S WORK ON BLOOD PARASITES") is True
-    assert sniff_citation_query('"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"') is True
+    assert sniff_citation_query("DR. SCHAUDINN'S WORK ON BLOOD PARASITES") is False
+    assert sniff_citation_query('"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"') is False
     assert (
         sniff_citation_query(
             '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444'