diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-17 23:14:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-17 23:17:06 -0700 |
commit | 360726ab527ca736f3ff8359e8c6101926017e3e (patch) | |
tree | 6526d561a0c5fdad90ed311b69b809b0cc2fc57f | |
parent | a7c76d1a835ab525be2f59dbd0d7ee487c0bd33c (diff) | |
download | fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.tar.gz fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.zip |
search: handle direct DOI and PMCID queries
If query is a single token which looks like a valid PMCID or DOI, with
no surrounding quotes, then expand scope and filter to that single
external identifier.
-rw-r--r-- | fatcat_scholar/identifiers.py | 90 | ||||
-rw-r--r-- | fatcat_scholar/search.py | 25 |
2 files changed, 106 insertions, 9 deletions
```diff
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
new file mode 100644
index 0000000..2ea09c7
--- /dev/null
+++ b/fatcat_scholar/identifiers.py
@@ -0,0 +1,90 @@
+import re
+from typing import Optional
+
+DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
+
+
+def clean_doi(raw: Optional[str]) -> Optional[str]:
+    """
+    Removes any:
+    - padding whitespace
+    - 'doi:' prefix
+    - URL prefix
+
+    Does not try to un-URL-encode
+
+    Returns None if not a valid DOI
+    """
+    if not raw:
+        return None
+    raw = raw.strip()
+    if "\u2013" in raw:
+        # Do not attempt to normalize "en dash" and since FC does not allow
+        # unicode in DOI, treat this as invalid.
+        return None
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("doi:"):
+        raw = raw[4:]
+    if raw.startswith("http://"):
+        raw = raw[7:]
+    if raw.startswith("https://"):
+        raw = raw[8:]
+    if raw.startswith("doi.org/"):
+        raw = raw[8:]
+    if raw.startswith("dx.doi.org/"):
+        raw = raw[11:]
+    if raw[7:9] == "//":
+        raw = raw[:8] + raw[9:]
+
+    # fatcatd uses same REGEX, but Rust regex rejects these characters, while
+    # python doesn't. DOIs are syntaxtually valid, but very likely to be typos;
+    # for now filter them out.
+    for c in ("¬",):
+        if c in raw:
+            return None
+
+    if not raw.startswith("10."):
+        return None
+    if not DOI_REGEX.fullmatch(raw):
+        return None
+    # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
+    # for now block specific characters so we can get PubMed importer running
+    # again.
+    if "ä" in raw:
+        return None
+    return raw
+
+
+def test_clean_doi() -> None:
+    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
+    assert clean_doi("10.17167/mksz.2017.2.129–155") == None
+    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/ asdf ") == None
+    assert clean_doi("10.4149/gpb¬_2017042") == None  # "logical negation" character
+    assert (
+        clean_doi("10.6002/ect.2020.häyry") == None
+    )  # this example via pubmed (pmid:32519616)
+
+
+def clean_pmcid(raw: Optional[str]) -> Optional[str]:
+    if not raw:
+        return None
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+        return raw
+    return None
+
+
+def test_clean_pmcid() -> None:
+    assert clean_pmcid("10.1234/asdf ") == None
+    assert clean_pmcid("") == None
+    assert clean_pmcid(None) == None
+    assert clean_pmcid("PMC123") == "PMC123"
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 0b6798d..d29a720 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -16,6 +16,8 @@ from pydantic import BaseModel
 
 # pytype: enable=import-error
 
+from fatcat_scholar.identifiers import *
+
 # i18n note: the use of gettext below doesn't actually do the translation here,
 # it just ensures that the strings are caught by babel for translation later
 
@@ -97,15 +99,20 @@ def do_fulltext_search(
 
     search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
 
-    # Convert raw DOIs to DOI queries
-    if (
-        query.q
-        and len(query.q.split()) == 1
-        and query.q.startswith("10.")
-        and query.q.count("/") >= 1
-    ):
-        search = search.filter("terms", doi=query.q)
-        query.q = "*"
+    # Try handling raw identifier queries
+    if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
+        doi = clean_doi(query.q)
+        if doi:
+            query.q = f'doi:"{doi}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
+        pmcid = clean_pmcid(query.q)
+        if pmcid:
+            query.q = f'pmcid:"{pmcid}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
 
     # type filters
     if query.filter_type == "papers" or query.filter_type is None:
```