diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 01:06:37 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 19:49:04 -0800 | 
| commit | aff6473144542e24c0aaa66c514c210eb83bf8a8 (patch) | |
| tree | 0131cdb13a2bd0e5c9edc8f7359233195a8b0432 | |
| parent | 89e9149b27263d6128be449c7b12b025cc031292 (diff) | |
| download | fatcat-scholar-aff6473144542e24c0aaa66c514c210eb83bf8a8.tar.gz fatcat-scholar-aff6473144542e24c0aaa66c514c210eb83bf8a8.zip | |
parse: tweak citation sniff routine
| -rw-r--r-- | fatcat_scholar/query_parse.py | 14 | 
1 file changed, 11 insertions, 3 deletions
```diff
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 0aae066..0f5086b 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -114,6 +114,10 @@ def sniff_citation_query(raw: str) -> bool:
     if len(raw) < 12 or len(raw.split()) < 6:
         return False
 
+    # if single quoted string, not a citation
+    if raw.count('"') == 2 and raw.startswith('"') and raw.endswith('"'):
+        return False
+
     # if there is a filter query, boost, or fuzzy match, not a citation
     if re.search(r'([a-zA-Z]:[^\s])|(["\\)][\^~]\d)', raw):
         return False
@@ -132,8 +136,12 @@ def sniff_citation_query(raw: str) -> bool:
             char_types["period"] = True
         elif c == ",":
             char_types["comma"] = True
+        elif c == ";":
+            char_types["semicolon"] = True
+        elif c == "(" or c == ")":
+            char_types["parens"] = True
 
-        if len(char_types) > 2:
+        if len(char_types) >= 4:
             return True
 
     return False
@@ -145,8 +153,8 @@ def test_sniff_citation_query() -> None:
         sniff_citation_query("(title:foo OR title:bar)^1.5 (body:foo OR body:bar)")
         is False
     )
-    assert sniff_citation_query("DR. SCHAUDINN'S WORK ON BLOOD PARASITES") is True
-    assert sniff_citation_query('"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"') is True
+    assert sniff_citation_query("DR. SCHAUDINN'S WORK ON BLOOD PARASITES") is False
+    assert sniff_citation_query('"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES"') is False
     assert (
         sniff_citation_query(
             '"DR. SCHAUDINN\'S WORK ON BLOOD PARASITES." BMJ (Clinical Research Edition) (1905): 442-444'
```
