search: handle direct DOI and PMCID queries

If the query is a single token that looks like a valid PMCID or DOI, with no surrounding quotes, then expand the search scope (reset the type, availability, and time filters to their widest settings) and filter to that single external identifier.
author: Bryan Newbold <bnewbold@archive.org> 2020-09-17 23:14:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-17 23:17:06 -0700
commit: 360726ab527ca736f3ff8359e8c6101926017e3e (patch)
tree: 6526d561a0c5fdad90ed311b69b809b0cc2fc57f /fatcat_scholar
parent: a7c76d1a835ab525be2f59dbd0d7ee487c0bd33c (diff)
download: fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.tar.gz
fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.zip
2 files changed, 106 insertions, 9 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
new file mode 100644
index 0000000..2ea09c7
--- /dev/null
+++ b/fatcat_scholar/identifiers.py
@@ -0,0 +1,90 @@
+import re
+from typing import Optional
+
+DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
+
+
+def clean_doi(raw: Optional[str]) -> Optional[str]:
+    """
+    Removes any:
+    - padding whitespace
+    - 'doi:' prefix
+    - URL prefix
+
+    Does not try to un-URL-encode
+
+    Returns None if not a valid DOI
+    """
+    if not raw:
+        return None
+    raw = raw.strip()
+    if "\u2013" in raw:
+        # Do not attempt to normalize the en dash (U+2013): FC does not allow
+        # unicode in DOIs, so treat any DOI containing one as invalid.
+        return None
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("doi:"):
+        raw = raw[4:]
+    if raw.startswith("http://"):
+        raw = raw[7:]
+    if raw.startswith("https://"):
+        raw = raw[8:]
+    if raw.startswith("doi.org/"):
+        raw = raw[8:]
+    if raw.startswith("dx.doi.org/"):
+        raw = raw[11:]
+    if raw[7:9] == "//":
+        raw = raw[:8] + raw[9:]
+
+    # fatcatd uses the same regex, but the Rust regex engine rejects these
+    # characters while Python's does not. Such DOIs are syntactically valid,
+    # but very likely to be typos; for now, filter them out.
+    for c in ("¬",):
+        if c in raw:
+            return None
+
+    if not raw.startswith("10."):
+        return None
+    if not DOI_REGEX.fullmatch(raw):
+        return None
+    # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
+    # for now block specific characters so we can get PubMed importer running
+    # again.
+    if "ä" in raw:
+        return None
+    return raw
+
+
+def test_clean_doi() -> None:
+    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
+    assert clean_doi("10.17167/mksz.2017.2.129–155") == None
+    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/ asdf ") == None
+    assert clean_doi("10.4149/gpb¬_2017042") == None  # "logical negation" character
+    assert (
+        clean_doi("10.6002/ect.2020.häyry") == None
+    )  # this example via pubmed (pmid:32519616)
+
+
+def clean_pmcid(raw: Optional[str]) -> Optional[str]:
+    if not raw:
+        return None
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+        return raw
+    return None
+
+
+def test_clean_pmcid() -> None:
+    assert clean_pmcid("10.1234/asdf ") == None
+    assert clean_pmcid("") == None
+    assert clean_pmcid(None) == None
+    assert clean_pmcid("PMC123") == "PMC123"
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 0b6798d..d29a720 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -16,6 +16,8 @@ from pydantic import BaseModel
 
 # pytype: enable=import-error
 
+from fatcat_scholar.identifiers import *
+
 # i18n note: the use of gettext below doesn't actually do the translation here,
 # it just ensures that the strings are caught by babel for translation later
 
@@ -97,15 +99,20 @@ def do_fulltext_search(
 
     search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
 
-    # Convert raw DOIs to DOI queries
-    if (
-        query.q
-        and len(query.q.split()) == 1
-        and query.q.startswith("10.")
-        and query.q.count("/") >= 1
-    ):
-        search = search.filter("terms", doi=query.q)
-        query.q = "*"
+    # Try handling raw identifier queries
+    if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
+        doi = clean_doi(query.q)
+        if doi:
+            query.q = f'doi:"{doi}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
+        pmcid = clean_pmcid(query.q)
+        if pmcid:
+            query.q = f'pmcid:"{pmcid}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
 
     # type filters
     if query.filter_type == "papers" or query.filter_type is None:
author	Bryan Newbold <bnewbold@archive.org>	2020-09-17 23:14:22 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-09-17 23:17:06 -0700
commit	360726ab527ca736f3ff8359e8c6101926017e3e (patch)
tree	6526d561a0c5fdad90ed311b69b809b0cc2fc57f /fatcat_scholar
parent	a7c76d1a835ab525be2f59dbd0d7ee487c0bd33c (diff)
download	fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.tar.gz fatcat-scholar-360726ab527ca736f3ff8359e8c6101926017e3e.zip