summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-01-19 00:55:44 -0800
committerBryan Newbold <bnewbold@archive.org>2021-01-19 19:49:04 -0800
commit89e9149b27263d6128be449c7b12b025cc031292 (patch)
treee3233ac5fc462881353be62c45217e9a4d974d4f
parent59cc64a24e4331899e6b952cc7a8dedc1ec13547 (diff)
downloadfatcat-scholar-89e9149b27263d6128be449c7b12b025cc031292.tar.gz
fatcat-scholar-89e9149b27263d6128be449c7b12b025cc031292.zip
add citation query feature (disabled by default)
This is operationally complex (queries hit 3x backend services!), so it is not enabled by default. It will need more testing, and possibly circuit-breaking — though haproxy should provide some of that automatically at this point.
-rw-r--r--fatcat_scholar/search.py83
-rw-r--r--fatcat_scholar/web.py6
-rw-r--r--settings.toml3
3 files changed, 75 insertions, 17 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 9bc5699..cbfc465 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -2,6 +2,7 @@
Helpers to make elasticsearch queries.
"""
+import copy
import logging
import datetime
from gettext import gettext
@@ -10,6 +11,7 @@ from typing import List, Optional, Any
import elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.response import Response
+import fatcat_openapi_client
# pytype: disable=import-error
from pydantic import BaseModel
@@ -19,6 +21,8 @@ from pydantic import BaseModel
from fatcat_scholar.config import settings
from fatcat_scholar.identifiers import *
from fatcat_scholar.schema import ScholarDoc
+from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
+from fatcat_scholar.query_citation import try_fuzzy_match
# i18n note: the use of gettext below doesn't actually do the translation here,
# it just ensures that the strings are caught by babel for translation later
@@ -81,6 +85,7 @@ class FulltextQuery(BaseModel):
class FulltextHits(BaseModel):
+ query_type: str
count_returned: int
count_found: int
offset: int
@@ -206,26 +211,75 @@ def apply_filters(search: Search, query: FulltextQuery) -> Search:
return search
-def do_fulltext_search(
- query: FulltextQuery, deep_page_limit: int = 2000
-) -> FulltextHits:
+def process_query(query: FulltextQuery) -> FulltextHits:
- search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
+ if not query.q:
+ return do_fulltext_search(query)
- # Try handling raw identifier queries
- if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
+ # try handling raw identifier queries
+ if len(query.q.strip().split()) == 1 and not '"' in query.q:
doi = clean_doi(query.q)
if doi:
- query.q = f'doi:"{doi}"'
- query.filter_type = "everything"
- query.filter_availability = "everything"
- query.filter_time = "all_time"
+ return do_lookup_query(f'doi:"{doi}"')
pmcid = clean_pmcid(query.q)
if pmcid:
- query.q = f'pmcid:"{pmcid}"'
- query.filter_type = "everything"
- query.filter_availability = "everything"
- query.filter_time = "all_time"
+ return do_lookup_query(f'pmcid:"{pmcid}"')
+
+ # if this is a citation string, do a fuzzy lookup
+ if settings.ENABLE_CITATION_QUERY and sniff_citation_query(query.q):
+ api_conf = fatcat_openapi_client.Configuration()
+ api_conf.host = settings.FATCAT_API_HOST
+ api_client = fatcat_openapi_client.DefaultApi(
+ fatcat_openapi_client.ApiClient(api_conf)
+ )
+ fatcat_es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
+ key: Optional[str] = None
+ try:
+ key = try_fuzzy_match(
+ query.q,
+ grobid_host=settings.GROBID_HOST,
+ es_client=fatcat_es_client,
+ fatcat_api_client=api_client,
+ )
+ except elasticsearch.exceptions.RequestError as e:
+ logging.warn(f"citation fuzzy failure: {e}")
+ pass
+ except Exception as e:
+ # TODO: sentry log?
+ logging.warn(f"citation fuzzy failure: {e}")
+ raise e
+ if key:
+ result = do_lookup_query(f"key:{key}")
+ if result:
+ result.query_type = "citation"
+ return result
+
+ # fall through to regular query, with pre-parsing
+ query = copy.copy(query)
+ if query.q:
+ query.q = pre_parse_query(query.q)
+
+ return do_fulltext_search(query)
+
+
+def do_lookup_query(lookup: str) -> FulltextHits:
+ logging.info(f"lookup query: {lookup}")
+ query = FulltextQuery(
+ q=lookup,
+ filter_type="everything",
+ filter_availability="everything",
+ filter_time="all_time",
+ )
+ result = do_fulltext_search(query)
+ result.query_type = "lookup"
+ return result
+
+
+def do_fulltext_search(
+ query: FulltextQuery, deep_page_limit: int = 2000
+) -> FulltextHits:
+
+ search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
if query.collapse_key:
search = search.filter("term", collapse_key=query.collapse_key)
@@ -349,6 +403,7 @@ def do_fulltext_search(
count_found = count_returned
return FulltextHits(
+ query_type="fulltext",
count_returned=count_returned,
count_found=count_found,
offset=offset,
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index a54b4e4..133ddce 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -19,7 +19,7 @@ from starlette.exceptions import HTTPException as StarletteHTTPException
from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.hacks import Jinja2Templates, parse_accept_lang
-from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits
+from fatcat_scholar.search import process_query, FulltextQuery, FulltextHits
from fatcat_scholar.schema import ScholarDoc
@@ -97,7 +97,7 @@ class HitsModel(BaseModel):
async def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
hits: Optional[FulltextHits] = None
try:
- hits = do_fulltext_search(query)
+ hits = process_query(query)
except ValueError as e:
sentry_sdk.set_level("warning")
sentry_sdk.capture_exception(e)
@@ -213,7 +213,7 @@ async def web_search(
status_code: int = 200
if query.q is not None:
try:
- hits = do_fulltext_search(query)
+ hits = process_query(query)
except ValueError as e:
sentry_sdk.set_level("warning")
sentry_sdk.capture_exception(e)
diff --git a/settings.toml b/settings.toml
index 34f29ce..6cd8823 100644
--- a/settings.toml
+++ b/settings.toml
@@ -10,6 +10,7 @@ ELASTICSEARCH_FULLTEXT_INDEX = "scholar_fulltext"
THUMBNAIL_URL_PREFIX = "https://blobs.fatcat.wiki/thumbnail/pdf/"
SANDCRAWLER_DB_API = "http://aitio.us.archive.org:3030"
SANDCRAWLER_S3_API = "wbgrp-svc169.us.archive.org:8333"
+GROBID_HOST = "https://grobid.qa.fatcat.wiki"
KAFKA_BROKERS = []
FATCAT_API_HOST = "https://api.fatcat.wiki/v0"
INDEX_WORKER_BATCH_SIZE = 50
@@ -19,6 +20,7 @@ GOATCOUNTER_ENDPOINT = "https://goatcounter.scholar.fatcat.wiki/count"
GOATCOUNTER_SCRIPT_URL = "https://goatcounter.scholar.fatcat.wiki/count.js"
ONION_DOMAIN = "scholar-qa.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion"
ENABLE_PROMETHEUS = false
+ENABLE_CITATION_QUERY = false
[test]
SCHOLAR_ENV = "test"
@@ -28,6 +30,7 @@ ELASTICSEARCH_FRONTEND = "http://disabled-during-tests-bogus.xyz:3333"
ELASTICSEARCH_BACKEND = "http://disabled-during-tests-bogus.xyz:3333"
SANDCRAWLER_DB_API = "http://disabled-during-tests-bogus.xyz:3333"
SANDCRAWLER_S3_API = "disabled-during-tests-bogus.xyz:8333"
+GROBID_HOST = "http://disabled-during-tests-bogus.xyz:3333"
SCHOLAR_ISSUEDB_PATH = ":memory:"
[development]