summary refs log tree commit diff stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-17 23:14:22 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-17 23:17:06 -0700
commit 360726ab527ca736f3ff8359e8c6101926017e3e (patch)
tree 6526d561a0c5fdad90ed311b69b809b0cc2fc57f /fatcat_scholar
parent a7c76d1a835ab525be2f59dbd0d7ee487c0bd33c (diff)
download fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.tar.gz
fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.zip
search: handle direct DOI and PMCID queries
If query is a single token which looks like a valid PMCID or DOI, with no surrounding quotes, then expand scope and filter to that single external identifier.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/identifiers.py 90
-rw-r--r-- fatcat_scholar/search.py 25
2 files changed, 106 insertions, 9 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
new file mode 100644
index 0000000..2ea09c7
--- /dev/null
+++ b/fatcat_scholar/identifiers.py
@@ -0,0 +1,90 @@
+import re
+from typing import Optional
+
+DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
+
+
+def clean_doi(raw: Optional[str]) -> Optional[str]:
+ """
+ Removes any:
+ - padding whitespace
+ - 'doi:' prefix
+ - URL prefix
+
+ Does not try to un-URL-encode
+
+ Returns None if not a valid DOI
+ """
+ if not raw:
+ return None
+ raw = raw.strip()
+ if "\u2013" in raw:
+ # Do not attempt to normalize "en dash" and since FC does not allow
+ # unicode in DOI, treat this as invalid.
+ return None
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("doi:"):
+ raw = raw[4:]
+ if raw.startswith("http://"):
+ raw = raw[7:]
+ if raw.startswith("https://"):
+ raw = raw[8:]
+ if raw.startswith("doi.org/"):
+ raw = raw[8:]
+ if raw.startswith("dx.doi.org/"):
+ raw = raw[11:]
+ if raw[7:9] == "//":
+ raw = raw[:8] + raw[9:]
+
+ # fatcatd uses same REGEX, but Rust regex rejects these characters, while
+ # python doesn't. DOIs are syntactically valid, but very likely to be typos;
+ # for now filter them out.
+ for c in ("¬",):
+ if c in raw:
+ return None
+
+ if not raw.startswith("10."):
+ return None
+ if not DOI_REGEX.fullmatch(raw):
+ return None
+ # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
+ # for now block specific characters so we can get PubMed importer running
+ # again.
+ if "ä" in raw:
+ return None
+ return raw
+
+
+def test_clean_doi() -> None:
+ assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+ assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+ assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
+ assert clean_doi("10.17167/mksz.2017.2.129–155") == None
+ assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/ asdf ") == None
+ assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
+ assert (
+ clean_doi("10.6002/ect.2020.häyry") == None
+ ) # this example via pubmed (pmid:32519616)
+
+
+def clean_pmcid(raw: Optional[str]) -> Optional[str]:
+ if not raw:
+ return None
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+ return raw
+ return None
+
+
+def test_clean_pmcid() -> None:
+ assert clean_pmcid("10.1234/asdf ") == None
+ assert clean_pmcid("") == None
+ assert clean_pmcid(None) == None
+ assert clean_pmcid("PMC123") == "PMC123"
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 0b6798d..d29a720 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -16,6 +16,8 @@ from pydantic import BaseModel
# pytype: enable=import-error
+from fatcat_scholar.identifiers import *
+
# i18n note: the use of gettext below doesn't actually do the translation here,
# it just ensures that the strings are caught by babel for translation later
@@ -97,15 +99,20 @@ def do_fulltext_search(
search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
- # Convert raw DOIs to DOI queries
- if (
- query.q
- and len(query.q.split()) == 1
- and query.q.startswith("10.")
- and query.q.count("/") >= 1
- ):
- search = search.filter("terms", doi=query.q)
- query.q = "*"
+ # Try handling raw identifier queries
+ if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
+ doi = clean_doi(query.q)
+ if doi:
+ query.q = f'doi:"{doi}"'
+ query.filter_type = "everything"
+ query.filter_availability = "everything"
+ query.filter_time = "all_time"
+ pmcid = clean_pmcid(query.q)
+ if pmcid:
+ query.q = f'pmcid:"{pmcid}"'
+ query.filter_type = "everything"
+ query.filter_availability = "everything"
+ query.filter_time = "all_time"
# type filters
if query.filter_type == "papers" or query.filter_type is None: