From aef7b788326313a44e47549af98fc93690b34661 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 May 2020 16:02:41 -0700 Subject: skeleton of basic search, using covid19 index --- fatcat_scholar/search.py | 139 ++++++++++++++++++ fatcat_scholar/static/ia-favicon.ico | Bin 0 -> 4286 bytes fatcat_scholar/static/ia-logo.svg | 13 ++ fatcat_scholar/static/ia-wordmark.svg | 1 + fatcat_scholar/templates/base.html | 212 ++++++++++++++++++++++++++++ fatcat_scholar/templates/home.html | 104 ++++++++++++-- fatcat_scholar/templates/search.html | 74 +++++++++- fatcat_scholar/templates/search_macros.html | 203 ++++++++++++++++++++++++++ fatcat_scholar/web.py | 23 ++- 9 files changed, 751 insertions(+), 18 deletions(-) create mode 100644 fatcat_scholar/search.py create mode 100644 fatcat_scholar/static/ia-favicon.ico create mode 100644 fatcat_scholar/static/ia-logo.svg create mode 100644 fatcat_scholar/static/ia-wordmark.svg create mode 100644 fatcat_scholar/templates/base.html create mode 100644 fatcat_scholar/templates/search_macros.html (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py new file mode 100644 index 0000000..2373245 --- /dev/null +++ b/fatcat_scholar/search.py @@ -0,0 +1,139 @@ + +""" +Helpers to make elasticsearch queries. +""" + +import json +import datetime + +import elasticsearch +from elasticsearch_dsl import Search, Q +from dynaconf import settings + + +def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000): + + # Sanity checks + if limit > 100: + limit = 100 + if offset < 0: + offset = 0 + if offset > deep_page_limit: + # Avoid deep paging problem. + offset = deep_page_limit + + search = search[int(offset):int(offset)+int(limit)] + + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e: + # this is a "user" error + print("elasticsearch 400: " + str(e.info)) + #flash("Search query failed to parse; you might need to use quotes.

{}: {}".format(e.error, e.info['error']['root_cause'][0]['reason'])) + # XXX: abort(e.status_code) + raise Exception() + except elasticsearch.exceptions.TransportError as e: + # all other errors + print("elasticsearch non-200 status code: {}".format(e.info)) + # XXX: abort(e.status_code) + raise Exception() + + # convert from objects to python dicts + results = [] + for h in resp: + r = h._d_ + #print(json.dumps(h.meta._d_, indent=2)) + r['_highlights'] = [] + if 'highlight' in dir(h.meta): + highlights = h.meta.highlight._d_ + for k in highlights: + r['_highlights'] += highlights[k] + results.append(r) + + for h in results: + # Handle surrogate strings that elasticsearch returns sometimes, + # probably due to mangled data processing in some pipeline. + # "Crimes against Unicode"; production workaround + for key in h: + if type(h[key]) is str: + h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + + return { + "count_returned": len(results), + "count_found": int(resp.hits.total), + "results": results, + "offset": offset, + "limit": limit, + "deep_page_limit": deep_page_limit, + "query_time_ms": int(resp.took), + } + +def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None): + + # Convert raw DOIs to DOI queries + if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: + q = 'doi:"{}"'.format(q) + + es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND) + search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) + + # type filters + if filter_type == "papers": + search = search.filter("terms", release_type=[ "article-journal", "paper-conference", "chapter", ]) + elif filter_type == "reports": + search = search.filter("terms", release_type=[ "report", "standard", ]) + elif filter_type == "datasets": + search = search.filter("terms", release_type=[ "dataset", "software", ]) + elif filter_type == "everything" or filter_type == None: + pass + else: + # XXX: abort(400) + raise Exception() + 
+ # time filters + if filter_time == "past_week": + week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7)) + search = search.filter("range", release_date=dict(gte=week_ago_date)) + elif filter_time == "this_year": + search = search.filter("term", release_year=datetime.date.today().year) + elif filter_time == "since_2000": + search = search.filter("range", release_year=dict(gte=2000)) + elif filter_time == "before_1925": + search = search.filter("range", release_year=dict(lte=1924)) + elif filter_time == "all_time" or filter_time == None: + pass + else: + # XXX: abort(400) + raise Exception() + + search = search.query( + 'query_string', + query=q, + default_operator="AND", + analyze_wildcard=True, + lenient=True, + fields=[ + "everything", + "abstract", + "fulltext.body", + "fulltext.annex", + ], + ) + search = search.highlight( + "abstract", + "fulltext.body", + "fulltext.annex", + number_of_fragments=3, + fragment_size=150, + ) + + resp = generic_search_execute(search, offset=offset) + + for h in resp['results']: + # Ensure 'contrib_names' is a list, not a single string + if type(h['contrib_names']) is not list: + h['contrib_names'] = [h['contrib_names'], ] + h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] + + resp["query"] = { "q": q } + return resp diff --git a/fatcat_scholar/static/ia-favicon.ico b/fatcat_scholar/static/ia-favicon.ico new file mode 100644 index 0000000..363e30d Binary files /dev/null and b/fatcat_scholar/static/ia-favicon.ico differ diff --git a/fatcat_scholar/static/ia-logo.svg b/fatcat_scholar/static/ia-logo.svg new file mode 100644 index 0000000..2360a9f --- /dev/null +++ b/fatcat_scholar/static/ia-logo.svg @@ -0,0 +1,13 @@ + diff --git a/fatcat_scholar/static/ia-wordmark.svg b/fatcat_scholar/static/ia-wordmark.svg new file mode 100644 index 0000000..ae09db7 --- /dev/null +++ b/fatcat_scholar/static/ia-wordmark.svg @@ -0,0 +1 @@ + diff --git 
a/fatcat_scholar/templates/base.html b/fatcat_scholar/templates/base.html new file mode 100644 index 0000000..d47003b --- /dev/null +++ b/fatcat_scholar/templates/base.html @@ -0,0 +1,212 @@ + + + + + + + + + {%- block title -%}scholar.archive.org{%- endblock %} + + + {% block extra_head %}{% endblock %} + + + +

+ +{% block fullmain %} +
+
+ +
+
+
+{% block fullbody %} +
+ {% block body %}Nothing to see here.{% endblock %} +
+{% endblock %} +
+{% endblock %} + +{# + +#} + +
+
+ + + + +{% block postscript %}{% endblock %} + + diff --git a/fatcat_scholar/templates/home.html b/fatcat_scholar/templates/home.html index e6e09b5..bb387f7 100644 --- a/fatcat_scholar/templates/home.html +++ b/fatcat_scholar/templates/home.html @@ -1,12 +1,92 @@ - - - hello - - -

The Start

- -

{% trans %}This is a longer paragraph, all of which should be translated.{% endtrans %} - -

and {{ _("this is a quick") }} thing to translate. - - +{% extends "base.html" %} + +{% block fullmain %} + +

+
+
+
+
+

Search Inside Millions of Research Papers

+
+ +
+
+
+

This fulltext search index includes over 25 million research articles and other documents preserved in the Internet Archive. +

The collection spans from digitized copies of eighteenth-century journals through the latest Open Access conference proceedings and pre-prints crawled from the World Wide Web. +

+
+
+
+ +
+
+ +
+ + + +
+ Children are not little adults: blood transfusion in children with burn injury + Tina L. Palmieri +
+ 2017 + Burns & Trauma +
+
+
+ +
+ + + +
+ Epidemic Influenza in and around the City of Calcutta + Koilas Chandra Bose +
+ 1920 + The Indian Medical Gazette +
+
+
+ +
+ + + +
+ Repertoire of Intensive Care Unit Pneumonia Microbiota + Sabri Bousbia, et al +
+ 2012 + PLoS ONE +
+
+
+ +
+ + + +
+ BioTorrents: A File Sharing Service for Scientific Data + Morgan Langille, et al +
+ 2012 + PLoS ONE +
+
+
+
+
+ +{% endblock %} diff --git a/fatcat_scholar/templates/search.html b/fatcat_scholar/templates/search.html index 783cb47..fcb19c3 100644 --- a/fatcat_scholar/templates/search.html +++ b/fatcat_scholar/templates/search.html @@ -1 +1,73 @@ -

Search template will go here

+{% import "search_macros.html" as search_macros %} +{% extends "base.html" %} + +{% block fullbody %} +
+
+ + {% if found %} +
+
+ {{ "{:,}".format(found.count_found) }} +
+
+ Hits +
+
+ {% else %} + Maybe some filters, facets, counts over here? + {% endif %} + +
+
+ Release Date + + +
+ Resource Type + + +
+ Availability + + +
+ Sort Order + +
+ +
+
+ {% if found %} + {% if found.results %} + {% for paper in found.results %} + {{ search_macros.fulltext_search_result_row(paper) }} + {% endfor %} + {% endif %} + {% else %} + Some other message here when there is no search? Like a bunch of examples? + Or does that ever happen... we can just run query on "*". + {% endif %} +
+
+{% endblock %} diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html new file mode 100644 index 0000000..b600cb4 --- /dev/null +++ b/fatcat_scholar/templates/search_macros.html @@ -0,0 +1,203 @@ + +{% macro fulltext_search_result_row(paper) -%} +{% set lang_code = "en" %} +
+
+ {# ### TITLE ROW #} +

+ + {# "best URL" calculation #} + {% if paper.pmcid %} + + {% if lang_code != 'en' and lang_code == paper.lang and paper.original_title %} + {# show original title first instead of title if UI is in that language #} + {{ paper.original_title[:512] }} + {% if paper.original_title|length > 512 %}...{% endif %} + {% elif paper.title %} + {{ paper.title[:512] }} + {% if paper.title|length > 512 %}...{% endif %} + {% else %} + [blank] + {% endif %} + + + {# release type suffix #} + {% if paper.release_type in ("article-journal", "paper-conference") %} + {# pass #} + {% elif paper.release_type in ("book", "chapter", "dataset") %} + [{{ _(paper.release_type) }}] + {% elif not paper.release_type %} + [media?] + {% else %} + [{{ _(paper.release_type) }}] + {% endif %} + + {# show inverse of title/original_title above #} + {% if lang_code != 'en' and lang_code == paper.lang and paper.title and paper.title != paper.original_title %} +
+ + {{ paper.title[:512] }} {% if paper.title|length > 512 %}...{% endif %} + + {% elif paper.original_title and paper.title != paper.original_title %} +
+ + {{ paper.original_title[:512] }} {% if paper.original_title|length > 512 %}...{% endif %} + + {% endif %} + +

+ + {# ### AUTHOR ROW #} + {% if paper.contrib_names %} +
+ + {{ ", ".join(paper.contrib_names[:12]) }} + {% if paper.contrib_names|length > 12 %}(+{{ paper.contrib_names|length - 12 }} others){% endif %} + +
+ {% endif %} + + + {# ### JOURNAL ROW #} + {% if paper.release_year %} + {{ paper.release_year }} + {% endif %} + {% if paper.release_year and paper.container_name %} + | + {% endif %} + {% if paper.container_name %} + {% if paper.container_id %} + {{ paper.container_name }} + {% else %} + {{ paper.container_name }} + {% endif %} + {% if paper.container_is_oa %}{% endif %} + {% endif %} + {% if paper.withdrawn_status %} + [{{ paper.withdrawn_status }}] + {% endif %} + {% if paper.release_stage == "accepted" %} + [{{ paper.release_stage }}] + {% elif paper.release_stage and paper.release_stage != "published" %} + [{{ paper.release_stage }}] + {% elif not paper.release_stage %} + [unpublished?] + {% endif %} + + {# ### ABSTRACT / QUERY HIGHLIGHT #} + {% if paper._highlights %} +
+ {% for highlight in paper._highlights %} + {{ highlight|safe }} ... + {% endfor %} +
+ {% elif paper.abstract %} +
+ {% if paper.abstract[0]|length > 500 %} + {{ paper.abstract[0][:500] }}... + {% else %} + {{ paper.abstract[0][:500] }}... + {% endif %} +
+ {% else %} +
+ {% endif %} + + {# ### IDENTIFIERS #} + {% if paper.doi %} + doi:{{ paper.doi }}   + {% endif %} + {% if paper.pmid %} + pmid:{{ paper.pmid }}   + {% endif %} + {% if paper.pmcid %} + pmcid:{{ paper.pmcid }}   + {% endif %} + {% if paper.arxiv_id %} + arXiv:{{ paper.arxiv_id }}   + {% endif %} + {% if paper.fatcat_ident %} + fatcat:{{ paper.fatcat_ident}}   + {% endif %} + + {# ### SOURCE TAGS #} +
+ {% if paper.cord19_uid or 'cord19' in paper.source_tags %} + + CORD-19 + + {% endif %} + {% if 'fatcat' in paper.source_tags %} + + fatcat + + {% endif %} + {% if 'who' in paper.source_tags %} + + WHO + + {% endif %} + {% if 'wanfang' in paper.source_tags %} + + Wanfang + + {% endif %} + {% if 'cnki' in paper.source_tags %} + + CNKI + + {% endif %} + {# olive, brown, grey, pink, red, etc #} +
+ +
+
+ {% if paper.fulltext.thumbnail_url %} + {# #} + +
+ + + + {# +
+
+ PDF +
+
+ #} +
+ + +
+ {% else %} + {# No Fulltext #} + {% endif %} +{# should we include these little links? + + {% if paper.fulltext.pdf_url %} + mirror + {% endif %} + {% if paper.fulltext.grobid_xml_url %} + xml + {% endif %} + +#} +
+
+{% endmacro %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 8725ce0..e1ef478 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -9,13 +9,17 @@ from enum import Enum import babel.support from fastapi import FastAPI, APIRouter, Request, Depends, Header from fastapi.staticfiles import StaticFiles -from fatcat_scholar.hacks import Jinja2Templates from fastapi.responses import HTMLResponse from pydantic import BaseModel +from dynaconf import settings + +from fatcat_scholar.hacks import Jinja2Templates +from fatcat_scholar.search import do_fulltext_search + +print(settings.as_dict()) -I18N_LANG_DEFAULT = "en" I18N_LANG_TRANSLATIONS = ["de", "zh"] -I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [I18N_LANG_DEFAULT,] +I18N_LANG_OPTIONS = I18N_LANG_TRANSLATIONS + [settings.I18N_LANG_DEFAULT,] class SearchParams(BaseModel): q: str = "" @@ -31,7 +35,7 @@ class LangPrefix: def __init__(self, request: Request): self.prefix : str = "" - self.code : str = I18N_LANG_DEFAULT + self.code : str = settings.I18N_LANG_DEFAULT for lang_option in I18N_LANG_OPTIONS: if request.url.path.startswith(f"/{lang_option}/"): self.prefix = f"/{lang_option}" @@ -101,6 +105,7 @@ def load_i18n_templates(): locale_ngettext(translations), newstyle=True, ) + templates.env.globals['settings'] = settings d[lang_opt] = templates return d @@ -112,11 +117,19 @@ async def web_home(request: Request, lang: LangPrefix = Depends(LangPrefix), con return await home() return i18n_templates[lang.code].TemplateResponse("home.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) +@web.get("/about", include_in_schema=False) +async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)): + return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix}) + @web.get("/search", include_in_schema=False) async def web_search(request: Request, query: SearchParams = Depends(SearchParams), 
lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)): if content.mimetype == "application/json": return await search(query) - return i18n_templates[lang.code].TemplateResponse("search.html", {"request": request}) + found = None + if query.q: + found = do_fulltext_search(query.q) + return i18n_templates[lang.code].TemplateResponse("search.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix, "found": found}) + app = FastAPI( title="Fatcat Scholar", -- cgit v1.2.3