From 986ce7a38029f7fb20a51271f67d943678e17386 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 21:29:56 -0700 Subject: first iteration of web interface Copied and tweaked from fatcat:python/fatcat_web LICENSE file for this repo is a TODO and will need to match that of fatcat. --- fatcat_covid19/babel.cfg | 3 + fatcat_covid19/search.py | 82 ++++++++++++++++++ fatcat_covid19/static/ia_logo.png | Bin 0 -> 8867 bytes fatcat_covid19/static/ia_logo_text.png | Bin 0 -> 7463 bytes fatcat_covid19/static/robots.txt | 1 + fatcat_covid19/templates/400.html | 13 +++ fatcat_covid19/templates/404.html | 9 ++ fatcat_covid19/templates/500.html | 13 +++ fatcat_covid19/templates/about_de.html | 13 +++ fatcat_covid19/templates/about_en.html | 13 +++ fatcat_covid19/templates/base.html | 106 +++++++++++++++++++++++ fatcat_covid19/templates/entity_macros.html | 117 +++++++++++++++++++++++++ fatcat_covid19/templates/fulltext_search.html | 72 ++++++++++++++++ fatcat_covid19/templates/home.html | 94 ++++++++++++++++++++ fatcat_covid19/templates/sources.html | 119 ++++++++++++++++++++++++++ fatcat_covid19/webface.py | 112 ++++++++++++++++++++++++ 16 files changed, 767 insertions(+) create mode 100644 fatcat_covid19/babel.cfg create mode 100644 fatcat_covid19/search.py create mode 100644 fatcat_covid19/static/ia_logo.png create mode 100644 fatcat_covid19/static/ia_logo_text.png create mode 100644 fatcat_covid19/static/robots.txt create mode 100644 fatcat_covid19/templates/400.html create mode 100644 fatcat_covid19/templates/404.html create mode 100644 fatcat_covid19/templates/500.html create mode 100644 fatcat_covid19/templates/about_de.html create mode 100644 fatcat_covid19/templates/about_en.html create mode 100644 fatcat_covid19/templates/base.html create mode 100644 fatcat_covid19/templates/entity_macros.html create mode 100644 fatcat_covid19/templates/fulltext_search.html create mode 100644 fatcat_covid19/templates/home.html create mode 100644 
fatcat_covid19/templates/sources.html create mode 100644 fatcat_covid19/webface.py diff --git a/fatcat_covid19/babel.cfg b/fatcat_covid19/babel.cfg new file mode 100644 index 0000000..0a5feb3 --- /dev/null +++ b/fatcat_covid19/babel.cfg @@ -0,0 +1,3 @@ +[python 1="**.py" language=":"][/python] +[jinja2: **/templates/**.htm] +extensions=jinja2.ext.autoescape,jinja2.ext.with_ diff --git a/fatcat_covid19/search.py b/fatcat_covid19/search.py new file mode 100644 index 0000000..e939502 --- /dev/null +++ b/fatcat_covid19/search.py @@ -0,0 +1,82 @@ + +import datetime +import requests +from flask import abort, flash +from fatcat_covid19.webface import app + +def do_search(index, request, limit=30, offset=0, deep_page_limit=2000): + + # Sanity checks + if limit > 100: + limit = 100 + if offset < 0: + offset = 0 + if offset > deep_page_limit: + # Avoid deep paging problem. + offset = deep_page_limit + + request["size"] = int(limit) + request["from"] = int(offset) + # print(request) + resp = requests.get("%s/%s/_search" % + (app.config['ELASTICSEARCH_BACKEND'], index), + json=request) + + if resp.status_code == 400: + print("elasticsearch 400: " + str(resp.content)) + flash("Search query failed to parse; you might need to use quotes.

{}".format(resp.content)) + abort(resp.status_code) + elif resp.status_code != 200: + print("elasticsearch non-200 status code: " + str(resp.status_code)) + print(resp.content) + abort(resp.status_code) + + content = resp.json() + results = [h['_source'] for h in content['hits']['hits']] + for h in results: + # Handle surrogate strings that elasticsearch returns sometimes, + # probably due to mangled data processing in some pipeline. + # "Crimes against Unicode"; production workaround + for key in h: + if type(h[key]) is str: + h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + + return {"count_returned": len(results), + "count_found": content['hits']['total'], + "results": results, + "offset": offset, + "deep_page_limit": deep_page_limit} + +def do_fulltext_search(q, limit=30, offset=0): + + #print("Search hit: " + q) + if limit > 100: + # Sanity check + limit = 100 + + # Convert raw DOIs to DOI queries + if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1: + q = 'doi:"{}"'.format(q) + + + search_request = { + "query": { + "query_string": { + "query": q, + "default_operator": "AND", + "analyze_wildcard": True, + "lenient": True, + "fields": ["everything"], + }, + }, + } + + resp = do_search(app.config['ELASTICSEARCH_FULLTEXT_INDEX'], search_request, offset=offset) + for h in resp['results']: + # Ensure 'contrib_names' is a list, not a single string + if type(h['contrib_names']) is not list: + h['contrib_names'] = [h['contrib_names'], ] + h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] + resp["query"] = { "q": q } + resp["limit"] = limit + return resp diff --git a/fatcat_covid19/static/ia_logo.png b/fatcat_covid19/static/ia_logo.png new file mode 100644 index 0000000..97cc445 Binary files /dev/null and b/fatcat_covid19/static/ia_logo.png differ diff --git a/fatcat_covid19/static/ia_logo_text.png b/fatcat_covid19/static/ia_logo_text.png new file mode 100644 index 0000000..ddfc773 Binary files 
/dev/null and b/fatcat_covid19/static/ia_logo_text.png differ diff --git a/fatcat_covid19/static/robots.txt b/fatcat_covid19/static/robots.txt new file mode 100644 index 0000000..a168f11 --- /dev/null +++ b/fatcat_covid19/static/robots.txt @@ -0,0 +1 @@ +# Hello friends! diff --git a/fatcat_covid19/templates/400.html b/fatcat_covid19/templates/400.html new file mode 100644 index 0000000..f2659ca --- /dev/null +++ b/fatcat_covid19/templates/400.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} +{% block body %} + +

+
400
+
Bad Request
+ +

We weren't able to handle the request, either due to incorrect or unexpected +input. Usually more context should be available; if you hit this page it means +you've discovered a new corner case! +

+ +{% endblock %} diff --git a/fatcat_covid19/templates/404.html b/fatcat_covid19/templates/404.html new file mode 100644 index 0000000..653b8ee --- /dev/null +++ b/fatcat_covid19/templates/404.html @@ -0,0 +1,9 @@ +{% extends "base.html" %} +{% block body %} + +
+
404
+
Not Found
+
+ +{% endblock %} diff --git a/fatcat_covid19/templates/500.html b/fatcat_covid19/templates/500.html new file mode 100644 index 0000000..a99232c --- /dev/null +++ b/fatcat_covid19/templates/500.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} +{% block body %} + +
+
500
+
Internal Error
+ +

Hrm, something unexpected went wrong. You may have found a bug! This request +should be logged and reported automatically; you could re-try or contact us for +more info. +

+ +{% endblock %} diff --git a/fatcat_covid19/templates/about_de.html b/fatcat_covid19/templates/about_de.html new file mode 100644 index 0000000..2dd2b5e --- /dev/null +++ b/fatcat_covid19/templates/about_de.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# #} + +

+ +TODO + +{% endblock %} diff --git a/fatcat_covid19/templates/about_en.html b/fatcat_covid19/templates/about_en.html new file mode 100644 index 0000000..2dd2b5e --- /dev/null +++ b/fatcat_covid19/templates/about_en.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# #} + +

+ +TODO + +{% endblock %} diff --git a/fatcat_covid19/templates/base.html b/fatcat_covid19/templates/base.html new file mode 100644 index 0000000..0ca8471 --- /dev/null +++ b/fatcat_covid19/templates/base.html @@ -0,0 +1,106 @@ + + + + + + + + COVID-19 Research Search + + + + {% block extra_head %}{% endblock %} + + + + + +{% block fullmain %} + +
+{% with messages = get_flashed_messages() %} + {% if messages %} +
+ {# Needs more javascript: #} +
Flash Message!
+
    + {% for message in messages %} +
  • {{ message|safe }} + {% endfor %} +
+
+ {% endif %} +{% endwith %} +{% block fullbody %} +
+ {% block body %}Nothing to see here.{% endblock %} +
+{% endblock %} +
+{% endblock %} + + + + + + +{% block postscript %}{% endblock %} + + + diff --git a/fatcat_covid19/templates/entity_macros.html b/fatcat_covid19/templates/entity_macros.html new file mode 100644 index 0000000..9cded8a --- /dev/null +++ b/fatcat_covid19/templates/entity_macros.html @@ -0,0 +1,117 @@ + +{% macro fulltext_search_result_row(paper) -%} +
+

+ + {% if paper.title %} + {{ paper.title[:512] }} + {% if paper.title|length > 512 %}...{% endif %} + {% else %} + [blank] + {% endif %} + +

+ {% if paper.best_pdf_url %} +
+   fulltext +
+ {% endif %} + {# +
{{ ", ".join(paper.contrib_names[:12]) }} + {% if paper.contrib_names|length > 12 %}(+{{ paper.contrib_names|length - 12 }} others){% endif %} +
+ #} + {% if paper.contrib_names %} +
+ + {{ ", ".join(paper.contrib_names[:12]) }} + {% if paper.contrib_names|length > 12 %}(+{{ paper.contrib_names|length - 12 }} others){% endif %} + +
+ {% endif %} + {% if paper.release_year %} + {{ paper.release_year }} + {% endif %} + {% if paper.release_type %} + {% if paper.release_type in ("article-journal", "paper-conference") %} + {{ paper.release_type }} + {% elif paper.release_type in ("book") %} + {{ paper.release_type }} + {% else %} + {{ paper.release_type }} + {% endif %} + {% endif %} + {% if paper.withdrawn_status %} + {{ paper.withdrawn_status }} + {% endif %} + {% if paper.release_stage and paper.release_stage != "published" %} + {{ paper.release_stage }} + {% elif not paper.release_stage %} + unknown + {% endif %} + {% if paper.container_name %} + {% if paper.container_id %} + {{ paper.container_name }} + {% else %} + {{ paper.container_name }} + {% endif %} + {% if paper.container_is_oa %}{% endif %} + {% endif %} + {% if paper.doi or paper.pmid or paper.arxiv_id or paper.jstor_id %} +
+ {% endif %} + {% if paper.doi %} + doi:{{ paper.doi }}   + {% endif %} + {% if paper.pmid %} + pmid:{{ paper.pmid }}   + {% endif %} + {% if paper.arxiv_id %} + arXiv:{{ paper.arxiv_id }}   + {% endif %} + {% if False %} {# XXX: elastic release work grouping searches #} +
+ and 5 other versions of the same work! + {% endif %} +
+{% endmacro %} + + +{% macro top_results(found) -%} + +Showing + {% if found.offset == 0 %} + first + {% else %} + results {{ found.offset }} — + {% endif %} + + {{ found.offset + found.count_returned }} + out of {{ found.count_found }} results + + +{%- endmacro %} + + +{% macro bottom_results(found, endpoint='search.fulltext_search') -%} + +{% if found.offset > 0 %} + {% if found.offset - found.limit < 0 %} + « Previous + {% else %} + « Previous + {% endif %} +{% else %} + « Previous +{% endif %} + +  Showing results {{ found.offset }} — {{ found.offset + +found.count_returned }} out of {{ found.count_found }} results   + +{% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %} + Next » + {% else %} + Next » +{% endif %} + +{%- endmacro %} diff --git a/fatcat_covid19/templates/fulltext_search.html b/fatcat_covid19/templates/fulltext_search.html new file mode 100644 index 0000000..dd42f9b --- /dev/null +++ b/fatcat_covid19/templates/fulltext_search.html @@ -0,0 +1,72 @@ +{% import "entity_macros.html" as entity_macros %} +{% extends "base.html" %} + +{% block title %} +{% if query %} + Search: {{ query }} +{% else %} + Fulltext Search +{% endif %} +{% endblock %} + + +{% block fullmain %} + +
+
+

Search all COVID-19 Resources

+
+
+
+ + +
+
+ + +
+
You can also look up by identifier or search for containers (e.g., journals). +
+
+
+
+ +
+
+ +{% if found %} +{% if found.results %} + {{ entity_macros.top_results(found) }} + + {% for paper in found.results %} + {{ entity_macros.fulltext_search_result_row(paper) }} +{% endfor %} +{% if found.results|length > 8 %} +
+
+ {{ entity_macros.bottom_results(found)}} +
+{% endif %} +{% else %} + +Raw query was: {{ found.query.q }} + +
+
+
+ confused paper man +
+
+

No results found!

+

You could try elsewhere:

+ +
+{% endif %} +{% endif %} + +
+{% endblock %} diff --git a/fatcat_covid19/templates/home.html b/fatcat_covid19/templates/home.html new file mode 100644 index 0000000..dbfb833 --- /dev/null +++ b/fatcat_covid19/templates/home.html @@ -0,0 +1,94 @@ +{% extends "base.html" %} + +{# no special title for now #} +{# {% block title %}Perpetual Access to the Scholarly Record{% endblock %} #} + +{% block extra_head %} + +{% endblock %} + +{% block fullmain %} + +
+
+

+ {{ _("Search tens of thousands of COVID-19 research papers and documents") }} +

+
+
+ +
+
+ + +
+
+
+
+
+ +
+
+
+
+
+ {{ _("Project Status") }}: {{ _("Prototype") }} +   + {{ _("These resources are not qualified medical advice!") }} +
+
+
+
+
+ +{# +
+
+
+
+
+ + confused paper man +
+
+

Fatcat is a versioned, user-editable catalog of research + publications including journal articles, conference proceedings, and + datasets +

Features include archival file-level metadata (verified digests and + long-term copies), an + open, documented API, + and work/release indexing (eg, distinguishing between and linking + pre-prints, manuscripts, and version-of-record). +  Read more... +

+
+
+
+
+#} + +
+
+
+
+
+

This service is hosted at The Internet Archive, a US + non-profit dedicated to providing Universal Access to All Knowledge. + {# + Donations welcome! +

Development funding comes from + The Andrew Mellon Foundation + to improve preservation and access to "long-tail" open access works on + the public web which might otherwise be lost. + #} +

+
+ IA logo +
+
+
+
+
+ +{% endblock %} diff --git a/fatcat_covid19/templates/sources.html b/fatcat_covid19/templates/sources.html new file mode 100644 index 0000000..17b0818 --- /dev/null +++ b/fatcat_covid19/templates/sources.html @@ -0,0 +1,119 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# #} + +

+ +

Fatcat is a versioned, publicly-editable catalog of research publications: +journal articles, conference proceedings, pre-prints, blog posts, and so forth. +The goal is to improve the state of preservation and access to these works by +providing a manifest of full-text content versions and locations. + +

This service does not directly contain full-text content itself, but +provides basic access for human and machine readers through links to copies in +web archives, repositories, and the public web. + +

Significantly more context and background information can be found in The Guide. + +

Feedback and queries can be directed to +webservices@archive.org. + +

Goals and Features

+ +

A few things set Fatcat apart from similar indexing and discovery services: + +

    +
  • inclusion of archival, file-level metadata (hashes) in addition + to URLs, which allows automated verification ("do I have the right copy"), + reveals content-drift over time, and enables efficient distribution of + content through the ecosystem +
  • native support for "post-PDF" digital media, including archival web + captures and datasets, as well as content stored on the distributed web +
  • data model that captures the work/edition distinction, + grouping pre-print, post-review, published, re-published, and updated + versions of a work together +
  • public editing interface, allowing metadata corrections and improvements + from individuals and bots in addition to automated imports from authoritative + sources +
  • focus on providing a stable API and corpus (making integration with + diverse user-facing applications simple), while enabling full replication and + mirroring of the corpus to reduce the risks of centralized control +
+ +

This service aspires to be a piece of sustainable, long-term, non-profit, +free-software, collaborative, open digital infrastructure. It is primarily +designed to support the archival and dissemination roles of +scholarly communication. It may also support the registration role +(establishing precedence and authorship), but explicitly does not aid with +certification of content, and is not intended to be used for +evaluation of individuals, institutions, or venues. This service is +"universal", not curated, and happily includes retracted and "predatory" +content. + +

Sources of Metadata

+ +The source of all bibliographic information is recorded in edit history +metadata, which allows the provenance of all records to be reconstructed. A few +major sources are worth highlighting here: + +
    +
  • Release metadata from Crossref, via their public + REST API +
  • Release metadata and linked full-text content from NIH Pubmed and arXiv.org +
  • Release metadata and linked public domain full-text content from the JSTOR Early Journal Content collection +
  • Creator names and de-duplication from ORCID, via their annual public data releases +
  • Journal title metadata from DOAJ, ISSN ROAD, and SHERPA/RoMEO +
  • Full-text URL lists from CORE, + Unpaywall, + Semantic Scholar, + CiteseerX, + and Microsoft Academic Graph. +
  • The Guide lists more major sources +
+ +Many thanks for the hard work of all these projects, institutions, and +individuals! + + +

Support and Acknowledgments

+ +

Fatcat is a project of the Internet Archive, +a US-based non-profit digital library, well known for its +Wayback Machine web archive and +Open Library book digitization and +lending service. All Fatcat databases and services run on Internet Archive +servers in California, and a copy of most full-text content is stored in the +Archive's collections and/or web archives. + +

Development of Fatcat and related web harvesting, indexing, and preservation +efforts at the Archive have been partially funded (for the 2018-2019 period) by +a generous grant from the Mellon Foundation +("Long-tail Open Access Journal Preservation"). +Fatcat supports this work by both tracking which open access works are in known +archives and providing minimum-viable indexing and access mechanisms for +long-tail works which otherwise would lack them. + +

The service would not technically be possible without hundreds of Free +Software components and the efforts of their individual and organizational +maintainers, more than can be listed here (please see the source code for full +lists). A few major components include the PostgreSQL database, Elasticsearch +search engine, Flask Python web framework, Rust programming language, Diesel +database library, Swagger/OpenAPI code generators, Kafka distributed log, +Ansible configuration management tool, and Ubuntu GNU/Linux operating system +distribution. + +

The front-page photo of a large feline with a cup of coffee is by +Quinn Kampschroer, +under a CC-0 license. The name "Fatcat" can be interpreted as short for "large +catalog", as the service aspires to be a complete catalog of the digital +scholarly record. + +

A list of technical contributors, including volunteers, is maintained in the +source code repository (CONTRIBUTORS.md). Thanks everybody! + +{% endblock %} diff --git a/fatcat_covid19/webface.py b/fatcat_covid19/webface.py new file mode 100644 index 0000000..5476884 --- /dev/null +++ b/fatcat_covid19/webface.py @@ -0,0 +1,112 @@ + +""" +This is the single-file Flask web application +""" + +import os +import subprocess + +from flask import Flask, Blueprint, g, app, render_template, request +from flask_babel import Babel, gettext +from flask.logging import create_logger + +import sentry_sdk +from sentry_sdk.integrations.flask import FlaskIntegration + + +class BaseConfig(object): + + SUPPORTED_LANGUAGES = {'en': 'English', 'de': 'Deutsch'} + BABEL_DEFAULT_LOCALE = 'en' + BABEL_DEFAULT_TIMEZONE = 'UTC' + GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8') + + ELASTICSEARCH_BACKEND = os.environ.get("ELASTICSEARCH_BACKEND", default="https://search.fatcat.wiki") + ELASTICSEARCH_FULLTEXT_INDEX = os.environ.get("ELASTICSEARCH_FULLTEXT_INDEX", default="covid19_fatcat_fulltext") + + FATCAT_DOMAIN = "covid19.fatcat.wiki" + + SENTRY_CONFIG = { + 'enable-threads': True, # for uWSGI + 'release': GIT_REVISION, + 'tags': { + 'service': 'covid19.fatcat.wiki', + }, + } + +sentry_sdk.init( + # set SDN via environment variable SENTRY_DSN + integrations=[FlaskIntegration()] +) + +app = Flask(__name__, static_url_path='/static') +app.config.from_object(BaseConfig()) +app.log = create_logger(app) +babel = Babel(app) + +from fatcat_covid19.search import * + +bp = Blueprint('search', __name__) + +@bp.url_defaults +def add_language_code(endpoint, values): + if g.lang_code_set: + values.setdefault('lang_code', g.lang_code) + +@bp.url_value_preprocessor +def pull_lang_code(endpoint, values): + g.lang_code_set = 'lang_code' in values + g.lang_code = values.pop('lang_code', app.config['BABEL_DEFAULT_LOCALE']) + if g.lang_code not in 
app.config['SUPPORTED_LANGUAGES']: + abort(404) + +@bp.route('/', methods=['GET']) +def page_home(): + return render_template('home.html') + +@bp.route('/fulltext/search', methods=['GET', 'POST']) +def fulltext_search(): + + query = request.args.get('q') + + offset = request.args.get('offset', '0') + offset = max(0, int(offset)) if offset.isnumeric() else 0 + + if 'q' in request.args.keys(): + found = do_fulltext_search(query, offset=offset) + return render_template('fulltext_search.html', found=found, query=query) + else: + return render_template('fulltext_search.html', query=query) + +@bp.route('/about', methods=['GET']) +def page_about(): + return render_template('about_{}.html'.format(g.lang_code)) + +@bp.route('/sources', methods=['GET']) +def page_sources(): + return render_template('sources.html') + + +@bp.errorhandler(404) +def page_not_found(e): + return render_template('404.html'), 404 + +@bp.errorhandler(400) +def page_bad_request(e): + return render_template('400.html'), 400 + +@bp.errorhandler(502) +@bp.errorhandler(503) +@bp.errorhandler(504) +@bp.errorhandler(500) +def page_server_error(e): + return render_template('500.html'), 500 + +@app.route('/robots.txt', methods=['GET']) +def robots(): + return send_from_directory(os.path.join(app.root_path, 'static'), + 'robots.txt', + mimetype='text/plain') + +app.register_blueprint(bp, url_prefix='//') +app.register_blueprint(bp, url_prefix='/') -- cgit v1.2.3