diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-01 21:29:56 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-01 21:29:59 -0700 |
commit | 986ce7a38029f7fb20a51271f67d943678e17386 (patch) | |
tree | 35d0533b5cba7134dce1f626df5f9caeb741f33d | |
parent | bacbad25b60bf585abb03d6d897061a3d872f2db (diff) | |
download | fatcat-covid19-986ce7a38029f7fb20a51271f67d943678e17386.tar.gz fatcat-covid19-986ce7a38029f7fb20a51271f67d943678e17386.zip |
first iteration of web interface
Copied and tweaked from fatcat:python/fatcat_web
LICENSE file for this repo is a TODO and will need to match that of
fatcat.
-rw-r--r-- | fatcat_covid19/babel.cfg | 3 | ||||
-rw-r--r-- | fatcat_covid19/search.py | 82 | ||||
-rw-r--r-- | fatcat_covid19/static/ia_logo.png | bin | 0 -> 8867 bytes | |||
-rw-r--r-- | fatcat_covid19/static/ia_logo_text.png | bin | 0 -> 7463 bytes | |||
-rw-r--r-- | fatcat_covid19/static/robots.txt | 1 | ||||
-rw-r--r-- | fatcat_covid19/templates/400.html | 13 | ||||
-rw-r--r-- | fatcat_covid19/templates/404.html | 9 | ||||
-rw-r--r-- | fatcat_covid19/templates/500.html | 13 | ||||
-rw-r--r-- | fatcat_covid19/templates/about_de.html | 13 | ||||
-rw-r--r-- | fatcat_covid19/templates/about_en.html | 13 | ||||
-rw-r--r-- | fatcat_covid19/templates/base.html | 106 | ||||
-rw-r--r-- | fatcat_covid19/templates/entity_macros.html | 117 | ||||
-rw-r--r-- | fatcat_covid19/templates/fulltext_search.html | 72 | ||||
-rw-r--r-- | fatcat_covid19/templates/home.html | 94 | ||||
-rw-r--r-- | fatcat_covid19/templates/sources.html | 119 | ||||
-rw-r--r-- | fatcat_covid19/webface.py | 112 |
16 files changed, 767 insertions, 0 deletions
# fatcat_covid19/babel.cfg (new file, mode 100644, index 0000000..0a5feb3):
#   [python: **.py]
#   [jinja2: **/templates/**.htm]
#   extensions=jinja2.ext.autoescape,jinja2.ext.with_
#
# fatcat_covid19/search.py (new file, mode 100644, index 0000000..e939502)

import datetime
import html
import requests
from flask import abort, flash
from fatcat_covid19.webface import app


def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):
    """Run one query against an Elasticsearch index and return its hits.

    Args:
        index: name of the Elasticsearch index to query.
        request: dict with the Elasticsearch request body. It is copied
            before the paging fields are added, so the caller's dict is
            left untouched.
        limit: maximum number of hits to return; capped at 100.
        offset: index of the first hit; clamped to 0..deep_page_limit.
        deep_page_limit: largest offset that will be sent to the backend
            (avoids the deep paging problem).

    Returns:
        A dict with the keys ``count_returned``, ``count_found``,
        ``results`` (list of ``_source`` documents), ``offset`` and
        ``deep_page_limit``.

    Any non-200 backend response ends the request through ``flask.abort``
    with the backend's status code; a 400 additionally flashes a message
    for the user.
    """
    # Sanity checks
    limit = min(int(limit), 100)
    offset = min(max(int(offset), 0), deep_page_limit)

    body = dict(request)
    body["size"] = limit
    body["from"] = offset
    resp = requests.get(
        "%s/%s/_search" % (app.config['ELASTICSEARCH_BACKEND'], index),
        json=body)

    if resp.status_code == 400:
        print("elasticsearch 400: " + str(resp.content))
        # The flashed message is rendered unescaped by base.html
        # (``message|safe``), and the backend's error text echoes the
        # user's query, so it has to be escaped here.
        flash("Search query failed to parse; you might need to use quotes."
              "<p><code>{}</code>".format(html.escape(resp.text)))
        abort(resp.status_code)
    elif resp.status_code != 200:
        print("elasticsearch non-200 status code: " + str(resp.status_code))
        print(resp.content)
        abort(resp.status_code)

    content = resp.json()
    results = [hit['_source'] for hit in content['hits']['hits']]
    for doc in results:
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key, value in doc.items():
            if isinstance(value, str):
                doc[key] = value.encode('utf8', 'ignore').decode('utf8')

    return {"count_returned": len(results),
            "count_found": content['hits']['total'],
            "results": results,
            "offset": offset,
            "deep_page_limit": deep_page_limit}


def do_fulltext_search(q, limit=30, offset=0):
    """Search the fulltext index for the user query *q*.

    Args:
        q: raw query string typed by the user. A single token that looks
            like a DOI ("10.<...>/<...>") is rewritten to a ``doi:"..."``
            query.
        limit: page size; capped at 100 and used both for the backend
            query and for the ``limit`` value returned for pagination.
        offset: index of the first hit to return.

    Returns:
        The dict produced by ``do_search`` with two extra keys:
        ``query`` (``{"q": <final query string>}``) and ``limit``.
        Every result's ``contrib_names`` is a list of strings (empty when
        the document has none).
    """
    # Sanity check
    limit = min(int(limit), 100)

    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    search_request = {
        "query": {
            "query_string": {
                "query": q,
                "default_operator": "AND",
                "analyze_wildcard": True,
                "lenient": True,
                "fields": ["everything"],
            },
        },
    }

    # Pass the limit through so the number of hits fetched matches the
    # page size the templates use for the previous/next links.
    resp = do_search(
        app.config['ELASTICSEARCH_FULLTEXT_INDEX'],
        search_request,
        limit=limit,
        offset=offset)
    for hit in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string (or missing)
        names = hit.get('contrib_names') or []
        if not isinstance(names, list):
            names = [names]
        hit['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8') for name in names]
    resp["query"] = {"q": q}
    resp["limit"] = limit
    return resp

# fatcat_covid19/static/ia_logo.png (new binary file, index 0000000..97cc445)
# fatcat_covid19/static/ia_logo_text.png (new binary file, index 0000000..ddfc773)
# fatcat_covid19/static/robots.txt (new file, mode 100644, index 0000000..a168f11):
#   # Hello friends!
diff --git a/fatcat_covid19/templates/400.html b/fatcat_covid19/templates/400.html new file mode 100644 index 0000000..f2659ca --- /dev/null +++ b/fatcat_covid19/templates/400.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} +{% block body %} + +<center> +<div style="font-size: 8em;">400</div> +<div style="font-size: 3em;">Bad Request</div> + +<p>Wasn't able to handle the request, either due to incorrect or unexpected +input. Usually more context should be available; if you hit this page it means +you've discovered a new corner case! +</center> + +{% endblock %} diff --git a/fatcat_covid19/templates/404.html b/fatcat_covid19/templates/404.html new file mode 100644 index 0000000..653b8ee --- /dev/null +++ b/fatcat_covid19/templates/404.html @@ -0,0 +1,9 @@ +{% extends "base.html" %} +{% block body %} + +<center> +<div style="font-size: 8em;">404</div> +<div style="font-size: 3em;">Not Found</div> +</center> + +{% endblock %} diff --git a/fatcat_covid19/templates/500.html b/fatcat_covid19/templates/500.html new file mode 100644 index 0000000..a99232c --- /dev/null +++ b/fatcat_covid19/templates/500.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} +{% block body %} + +<center> +<div style="font-size: 8em;">500</div> +<div style="font-size: 3em;">Internal Error</div> + +<p>Hrm, something unexpected went wrong. You may have found a bug! This request +should be logged and reported automatically; you could re-try or contact us for +more info. 
+</center> + +{% endblock %} diff --git a/fatcat_covid19/templates/about_de.html b/fatcat_covid19/templates/about_de.html new file mode 100644 index 0000000..2dd2b5e --- /dev/null +++ b/fatcat_covid19/templates/about_de.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# <img class="ui fluid bordered image" src="/static/fatcat.jpg" title="CC0 photo of an oversized feline" alt=""> #} + +<h1></h1> + +TODO + +{% endblock %} diff --git a/fatcat_covid19/templates/about_en.html b/fatcat_covid19/templates/about_en.html new file mode 100644 index 0000000..2dd2b5e --- /dev/null +++ b/fatcat_covid19/templates/about_en.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# <img class="ui fluid bordered image" src="/static/fatcat.jpg" title="CC0 photo of an oversized feline" alt=""> #} + +<h1></h1> + +TODO + +{% endblock %} diff --git a/fatcat_covid19/templates/base.html b/fatcat_covid19/templates/base.html new file mode 100644 index 0000000..0ca8471 --- /dev/null +++ b/fatcat_covid19/templates/base.html @@ -0,0 +1,106 @@ +<!DOCTYPE html> +<html lang="en" style="position: relative; min-height: 100%; height: auto;"> +<head> + <meta charset="utf-8" /> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <meta name="referrer" content="origin-when-cross-origin"> + + <title>COVID-19 Research Search</title> + + <link rel="stylesheet" + href="https://cdn.jsdelivr.net/npm/semantic-ui@2.4.1/dist/semantic.min.css" + crossorigin="anonymous"> + <style> + {# bnewbold: fix light grey bars in header #} + .ui.inverted.menu .item:before { background: none; } + + @media only screen and (max-width: 479px) { + .mobile-hide{ display: none !important; } + } + </style> + {% block extra_head %}{% endblock %} +</head> +<body style="margin-bottom: 130px;"> + +<header class="ui fixed inverted menu"> + <div class="ui container"> + <a href="/" class="header 
item"> + <!-- <img class="logo" src="assets/images/logo.png"> --> + <span style="color: red;">COVID-19</span> + </a> + <a href="https://fatcat.wiki/" class="item mobile-hide">Fatcat</a> + <a href="/about" class="item mobile-hide">About</a> + <div class="right menu"> + <div class="item" style="padding: 0;"> + <form class="" action="/fulltext/search" method="get" role="search" aria-label="Papers"> + <div class="ui transparent inverted icon input"> + <i class="search icon" style="padding-right: 2em;"></i> + <input type="text" placeholder="Search Papers..." name="q" style="border: 1px solid #777 !important; padding: 5px !important; width: 15em;"> + </div> + </form> + </div> + <div class="ui simple dropdown item"> + <!-- language/translate SVG icon --> + <img src="" + alt="select language" style="height: 1.5em; filter: invert(100%);"> + <i class="dropdown icon"></i> + <div class="menu"> + {# TODO #} + <a class="item" href="{{ url_for(request.endpoint, lang_code='en') }}">English</a> + <a class="item" href="{{ url_for(request.endpoint, lang_code='de') }}">Deutsch</a> + </div> + </div> + </div> + </div> +</header> + +{% block fullmain %} +<!-- 4em top margin is "enough" --> +<main class="ui main container" style="margin-top: 6em; margin-bottom: 2em;" {% block main_extra_attr %}{% endblock %}> +{% with messages = get_flashed_messages() %} + {% if messages %} + <div class="ui message"> + {# Needs more javascript: <i class="close icon"></i> #} + <div class="header">Flash Message!</div> + <ul class="list"> + {% for message in messages %} + <li>{{ message|safe }} + {% endfor %} + </ul> + </div> + {% endif %} +{% endwith %} +{% block fullbody %} + <div class="ui container text"> + {% block body %}Nothing to see here.{% endblock %} + </div> +{% endblock %} +</main> +{% endblock %} + + +<footer class="ui inverted vertical footer segment" style="margin-top: 2em; padding-top: 2em; padding-bottom:2em; position: absolute; bottom: 0px; width: 100%;"> + <div class="ui center aligned 
container"> + <div class="ui horizontal inverted small divided link list"> + <a class="item" href="https://fatcat.wiki/">fatcat</a> + <a class="item" href="/about">About</a> + <a class="item" href="/sources">Sources</a> + <a class="item" href="https://github.com/bnewbold/covid19-fatcat-wiki/">Code</a> + <a class="item" href="https://github.com/bnewbold/covid19-fatcat-wiki/tree/{{ config.GIT_REVISION }}"><code>{{ config.GIT_REVISION }}</code></a> + </div> + </div> +</footer> + +<script + src="https://code.jquery.com/jquery-3.1.1.min.js" + integrity="sha256-hVVnYaiADRTO2PzUGmuLJr8BLUSjGIZsDYGmIJLv2b8=" + crossorigin="anonymous"> +</script> +<script + src="https://cdn.jsdelivr.net/npm/semantic-ui@2.3.2/dist/semantic.min.js" + crossorigin="anonymous"> +</script> +{% block postscript %}{% endblock %} + +</body> +</html> diff --git a/fatcat_covid19/templates/entity_macros.html b/fatcat_covid19/templates/entity_macros.html new file mode 100644 index 0000000..9cded8a --- /dev/null +++ b/fatcat_covid19/templates/entity_macros.html @@ -0,0 +1,117 @@ + +{% macro fulltext_search_result_row(paper) -%} +<div> + <h4 style="margin-top: 1em; margin-bottom: 0px; font-size: 1.1em;"> + <a href="/release/{{ paper.ident }}" style="color: #2224c7;"> + {% if paper.title %} + {{ paper.title[:512] }} + {% if paper.title|length > 512 %}...{% endif %} + {% else %} + [blank] + {% endif %} + </a> + </h4> + {% if paper.best_pdf_url %} + <div style="float: right; padding: 4px;"> + <a href="{{ paper.best_pdf_url }}" class="ui violet tag label"><i class="file icon"></i>fulltext</a> + </div> + {% endif %} + {# + <h5 style="margin-top: 4px; margin-bottom: 4px; font-size: 1em;">{{ ", ".join(paper.contrib_names[:12]) }} + {% if paper.contrib_names|length > 12 %}<i>(+{{ paper.contrib_names|length - 12 }} others)</i>{% endif %} + </h5> + #} + {% if paper.contrib_names %} + <div style="margin-top: 0px; margin-bottom: 0px; font-size: 1em;"> + <b> + {{ ", ".join(paper.contrib_names[:12]) }} + {% if 
paper.contrib_names|length > 12 %}<i>(+{{ paper.contrib_names|length - 12 }} others)</i>{% endif %} + </b> + </div> + {% endif %} + {% if paper.release_year %} + {{ paper.release_year }} + {% endif %} + {% if paper.release_type %} + {% if paper.release_type in ("article-journal", "paper-conference") %} + <span class="ui black basic label small">{{ paper.release_type }}</span> + {% elif paper.release_type in ("book") %} + <span class="ui brown basic label small">{{ paper.release_type }}</span> + {% else %} + <span class="ui grey basic label small">{{ paper.release_type }}</span> + {% endif %} + {% endif %} + {% if paper.withdrawn_status %} + <span class="ui red label small">{{ paper.withdrawn_status }}</span> + {% endif %} + {% if paper.release_stage and paper.release_stage != "published" %} + <span class="ui pink basic label small">{{ paper.release_stage }}</span> + {% elif not paper.release_stage %} + <span class="ui red basic label small">unknown</span> + {% endif %} + {% if paper.container_name %} + {% if paper.container_id %} + <a href="/container/{{ paper.container_id }}" style="color: black;">{{ paper.container_name }}</a> + {% else %} + {{ paper.container_name }} + {% endif %} + {% if paper.container_is_oa %}<i class="icon unlock orange small"></i>{% endif %} + {% endif %} + {% if paper.doi or paper.pmid or paper.arxiv_id or paper.jstor_id %} + <br> + {% endif %} + {% if paper.doi %} + <a href="https://doi.org/{{paper.doi }}" style="color: green;">doi:{{ paper.doi }}</a> + {% endif %} + {% if paper.pmid %} + <a href="https://www.ncbi.nlm.nih.gov/pubmed/{{paper.pmid }}" style="color: green;">pmid:{{ paper.pmid }}</a> + {% endif %} + {% if paper.arxiv_id %} + <a href="https://arxiv.org/abs/{{paper.arxiv_id }}" style="color: green;">arXiv:{{ paper.arxiv_id }}</a> + {% endif %} + {% if False %} {# XXX: elastic release work grouping searches #} + <br> + <a href="/work/{{ paper.work_id }}"><i class="sitemap icon"></i> and 5 other versions of the same work!</a> + 
{% endif %} +</div> +{% endmacro %} + + +{% macro top_results(found) -%} + +<i>Showing + {% if found.offset == 0 %} + first + {% else %} + results {{ found.offset }} — + {% endif %} + + {{ found.offset + found.count_returned }} + out of {{ found.count_found }} results +</i> + +{%- endmacro %} + + +{% macro bottom_results(found, endpoint='search.fulltext_search') -%} + +{% if found.offset > 0 %} + {% if found.offset - found.limit < 0 %} + <a href="{{ url_for(endpoint, q=found.query.q, offset=0) }}">« Previous</a> + {% else %} + <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset - found.limit) }}">« Previous</a> + {% endif %} +{% else %} + <span style="color:gray">« Previous</span> +{% endif %} + + <i>Showing results {{ found.offset }} — {{ found.offset + +found.count_returned }} out of {{ found.count_found }} results</i> + +{% if found.offset + found.limit < found.count_found and found.offset + found.limit < found.deep_page_limit %} + <a href="{{ url_for(endpoint, q=found.query.q, offset=found.offset + found.limit) }}">Next »</a> + {% else %} + <span style="color:gray">Next »</span> +{% endif %} + +{%- endmacro %} diff --git a/fatcat_covid19/templates/fulltext_search.html b/fatcat_covid19/templates/fulltext_search.html new file mode 100644 index 0000000..dd42f9b --- /dev/null +++ b/fatcat_covid19/templates/fulltext_search.html @@ -0,0 +1,72 @@ +{% import "entity_macros.html" as entity_macros %} +{% extends "base.html" %} + +{% block title %} +{% if query %} + Search: {{ query }} +{% else %} + Fulltext Search +{% endif %} +{% endblock %} + + +{% block fullmain %} + +<div class="ui vertical stripe segment" style="background-color: #EEE; padding-top: 4.5em;"> + <div class="ui container text"> + <h1>Search all COVID-19 Resources</h1> + <form class="" role="search" action="/fulltext/search" method="get"> + <div class="ui form"> + <div class="ui action input huge fluid"> + <input type="text" placeholder="Query..." 
name="q" value="{% if query %}{{ query }}{% endif %}" aria-label="search metadata"> + <button class="ui primary button">Search</button> + </div> + <div class="ui checkbox" style="float: right; margin: 1em;"> + <input type="checkbox" name="fulltext_only" value="true" {% if fulltext_only %}checked{% endif %}> + <label>Fulltext Available Only</label> + </div> + <br>Can also lookup by <b><a href="/release/lookup">identifier</a></b> or search for <b><a href="/container/search?q={{ query or "" }}">containers</a></b> (eg, journals). + </div> + </form> + </div> +</div> + +<div class="ui container text"> +<br> + +{% if found %} +{% if found.results %} + {{ entity_macros.top_results(found) }} + + {% for paper in found.results %} + {{ entity_macros.fulltext_search_result_row(paper) }} +{% endfor %} +{% if found.results|length > 8 %} + <div class="ui divider"></div> + <div style="text-align: center"> + {{ entity_macros.bottom_results(found)}} + </div> +{% endif %} +{% else %} + +Raw query was: <i>{{ found.query.q }}</i> + +<div class="ui centered stackable grid" style="padding-top: 15%;"> + <div class="row"> + <div class="four wide column"> + <img src="/static/paper_man_confused.gif" alt="confused paper man"> + </div> + <div class="six wide column"> + <h2>No results found!</h2> + <p>You could try elsewhere:</p> + <ul> + <li>Search <a href="https://dissem.in/search?q={{ found.query.q | urlencode }}">dissem.in</a></li> + <li>Search <a href="https://www.base-search.net/Search/Results?lookfor={{ found.query.q | urlencode }}">BASE</a></li> + <li>Search <a href="https://scholar.google.com/scholar?q={{ found.query.q | urlencode }}">Google Scholar</a></li> + </ul> +</div> +{% endif %} +{% endif %} + +</div> +{% endblock %} diff --git a/fatcat_covid19/templates/home.html b/fatcat_covid19/templates/home.html new file mode 100644 index 0000000..dbfb833 --- /dev/null +++ b/fatcat_covid19/templates/home.html @@ -0,0 +1,94 @@ +{% extends "base.html" %} + +{# no special title for now #} +{# 
{% block title %}Perpetual Access to the Scholarly Record{% endblock %} #} + +{% block extra_head %} + <link rel="canonical" href="https://{{ config.FATCAT_DOMAIN }}/"> +{% endblock %} + +{% block fullmain %} + +<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;"> + <div class="ui text container"> + <h1 class="ui header inverted huge centered"> + {{ _("Search tens of thousands of COVID-19 research papers and documents") }} + </h1> + <br> + <form class="" action="{{ url_for("search.fulltext_search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction"> + <meta itemprop="target" content="https://{{ config.FATCAT_DOMAIN }}/fulltext/search?q={q}"/> + <div class="ui form"> + <div class="ui action input huge fluid"> + <input type="text" placeholder="{{ _("by title, authors, identifiers...") }}" name="q" aria-label="search metadata" required itemprop="query-input"> + <button class="ui green button">{{ _("Search") }}</button> + </div> + </div> + </form> + </div> +</div> + +<div class="ui vertical stripe segment" style="background-color: #fffaf3; color: #573a08;"> + <div class="ui text container"> + <div class="ui centered grid"> + <div class="row"> + <div class="fourteen wide column" style="font-size: 1.1rem;"> + <b>{{ _("Project Status") }}: {{ _("Prototype") }}</b> + + {{ _("These resources are not qualified medical advice!") }} + </div> + </div> + </div> + </div> +</div> + +{# +<div class="ui vertical stripe segment" style="padding-top: 2em; padding-bottom: 2em;"> + <div class="ui text container" style="max-width: 800px!important;"> + <div class="ui centered grid"> + <div class="row"> + <div class="four wide column"> + <!-- TODO: don't let it scale down --> + <img src="/static/paper_man_confused.gif" width="130" alt="confused paper man"> + </div> + <div class="twelve wide column" style="font-size: 1.2rem;"> + <p><b>Fatcat is a 
versioned, user-editable catalog of research + publications including journal articles, conference proceedings, and + datasets</b> + <p>Features include archival file-level metadata (verified digests and + long-term copies), an + <b><a href="https://api.{{ config.FATCAT_DOMAIN }}">open, documented API</a></b>, + and work/release indexing (eg, distinguishing between and linking + pre-prints, manuscripts, and version-of-record). + <a href="/about">Read more...</a> + </div> + </div> + </div> + </div> +</div> +#} + +<div class="ui vertical stripe segment" style="padding-top: 2em; padding-bottom: 2em; background-color: #F5F5F5;"> + <div class="ui text container" style="max-width: 800px!important;"> + <div class="ui centered grid"> + <div class="row"> + <div class="twelve wide column" style="font-size: 1.2rem;"> + <p>This service is hosted at <b><a + href="https://archive.org">The Internet Archive</a></b>, a US + non-profit dedicated to providing Universal Access to All Knowledge. + {# + <a href="https://archive.org/donate/">Donations welcome!</a> + <p>Development funding comes from + <b><a href="https://blog.archive.org/2018/03/05/andrew-w-mellon-foundation-awards-grant-to-the-internet-archive-for-long-tail-journal-preservation/">The Andrew Mellon Foundation</a></b> + to improve preservation and access to "long-tail" open access works on + the public web which might otherwise be lost. 
+ #} + </div> + <div class="four wide column"> + <img src="/static/ia_logo_text.png" width="140" alt="IA logo"> + </div> + </div> + </div> + </div> +</div> + +{% endblock %} diff --git a/fatcat_covid19/templates/sources.html b/fatcat_covid19/templates/sources.html new file mode 100644 index 0000000..17b0818 --- /dev/null +++ b/fatcat_covid19/templates/sources.html @@ -0,0 +1,119 @@ +{% extends "base.html" %} + +{% block title %}About{% endblock %} + +{% block body %} + +{# <img class="ui fluid bordered image" src="/static/fatcat.jpg" title="CC0 photo of an oversized feline" alt=""> #} + +<h1></h1> + +<p>Fatcat is a versioned, publicly-editable catalog of research publications: +journal articles, conference proceedings, pre-prints, blog posts, and so forth. +The goal is to improve the state of preservation and access to these works by +providing a manifest of full-text content versions and locations. + +<p>This service does not directly contain full-text content itself, but +provides basic access for human and machine readers through links to copies in +web archives, repositories, and the public web. + +<p>Significantly more context and background information can be found in <a +href="https://guide.{{ config.FATCAT_DOMAIN }}/">The Guide</a>. + +<p>Feedback and queries can be directed to +<b><a href="mailto:webservices@archive.org">webservices@archive.org</a></b>. 
+ +<h3>Goals and Features</h3> + +<p>A few things set Fatcat apart from similar indexing and discovery services: + +<ul> + <li>inclusion of archival, <b>file-level metadata (hashes)</b> in addition + to URLs, which allows automated verification ("do I have the right copy"), + reveals content-drift over time, and enables efficient distribution of + content through the ecosystem + <li>native support for "post-PDF" digital media, including <b>archival web + captures and datasets</b>, as well as content stored on the distributed web + <li>data model that captures the <b>work/edition distinction</b>, + grouping pre-print, post-review, published, re-published, and updated + versions of a work together + <li><b>public editing</b> interface, allowing metadata corrections and improvements + from individuals and bots in addition to automated imports from authoritative + sources + <li>focus on providing a stable API and corpus (making integration with + diverse user-facing applications simple), while enabling full replication and + mirroring of the corpus to <b>reduce the risks of centralized control</b> +</ul> + +<p>This service aspires to be a piece of sustainable, long-term, non-profit, +free-software, collaborative, open digital infrastructure. It is primarily +designed to support the <i>archival</i> and <i>dissemination</i> roles of +scholarly communication. It may also support the <i>registration</i> role +(establishing precedence and authorship), but explicitly does not aid with +<i>certification</i> of content, and is not intended to be used for +<i>evaluation</i> of individuals, institutions, or venues. This service is +"universal", not curated, and happily includes retracted and "predatory" +content. + +<h3>Sources of Metadata</h3> + +The source of all bibliographic information is recorded in edit history +metadata, which allows the provenance of all records to be reconstructed. 
A few +major sources are worth highlighting here: + +<ul> + <li>Release metadata from <b>Crossref</b>, via their public + <a href="https://github.com/CrossRef/rest-api-doc">REST API</a> + <li>Release metadata and linked full-text content from NIH <b>Pubmed</b> and <b><a href="https://arxiv.org">arXiv.org</a></b> + <li>Release metadata and linked public domain full-text content from the <b>JSTOR</b> Early Journal Content collection + <li>Creator names and de-duplication from <b>ORCID</b>, via their annual public data releases + <li>Journal title metadata from <b>DOAJ</b>, <b>ISSN ROAD</b>, and <b>SHERPA/RoMEO</b> + <li>Full-text URL lists from <b><a href="https://core.ac.uk">CORE</a></b>, + <b><a href="http://unpaywall.org">Unpaywall</a></b>, + <b><a href="https://www.semanticscholar.org">Semantic Scholar</a></b>, + <b><a href="https://citeseerx.ist.psu.edu">CiteseerX</a></b>, + and <b><a href="https://www.microsoft.com/en-us/research/project/academic">Microsoft Academic Graph</a></b>. + <li><a href="https://guide.{{ config.FATCAT_DOMAIN }}/sources.html">The Guide</a> lists more major sources +</ul> + +Many thanks for the hard work of all these projects, institutions, and +individuals! + + +<h3>Support and Acknowledgments</h3> + +<p>Fatcat is a project of the <b><a href="https://archive.org">Internet Archive</a></b>, +a US-based non-profit digital library, well known for its +<a href="https://web.archive.org">Wayback Machine</a> web archive and +<a href="https://openlibrary.org">Open Library</a> book digitization and +lending service. All Fatcat databases and services run on Internet Archive +servers in California, and a copy of most full-text content is stored in the +Archive's collections and/or web archives. 
+ +<p>Development of Fatcat and related web harvesting, indexing, and preservation +efforts at the Archive have been partially funded (for the 2018-2019 period) by +a generous grant from the <b>Mellon Foundation</b> +(<a href="https://blog.archive.org/2018/03/05/andrew-w-mellon-foundation-awards-grant-to-the-internet-archive-for-long-tail-journal-preservation/">"Long-tail Open Access Journal Preservation"</a>). +Fatcat supports this work by both tracking which open access works are in known +archives and providing minimum-viable indexing and access mechanisms for +long-tail works which otherwise would lack them. + +<p>The service would not technically be possible without hundreds of Free +Software components and the efforts of their individual and organizational +maintainers, more than can be listed here (please see the source code for full +lists). A few major components include the PostgreSQL database, Elasticsearch +search engine, Flask python web framework, Rust programming language, Diesel +database library, Swagger/OpenAPI code generators, Kafka distributed log, +Ansible configuration management tool, and Ubuntu GNU/Linux operating system +distribution. + +<p>The front-page photo of a large feline with a cup of coffee is by +<a href="http://www.kampschroer.com/photography.html">Quinn Kampschroer</a>, +under a CC-0 license. The name "Fatcat" can be interpreted as short for "large +catalog", as the service aspires to be a <i>complete</i> catalog of the digital +scholarly record. + +<p>A list of technical contributors, including volunteers, is maintained in the +source code repository (<code>CONTRIBUTORS.md</code>). Thanks everybody! 
# (tail of fatcat_covid19/templates/sources.html: {% endblock %})
#
# fatcat_covid19/webface.py (new file, mode 100644, index 0000000..5476884)

"""
This is the single-file Flask web application
"""

import os
import subprocess

from flask import (Flask, Blueprint, abort, app, g, render_template, request,
                   send_from_directory)
from flask_babel import Babel, gettext
from flask.logging import create_logger

import sentry_sdk
from sentry_sdk.integrations.flask import FlaskIntegration


class BaseConfig(object):
    """Flask configuration; the search backend can be overridden via environment."""

    SUPPORTED_LANGUAGES = {'en': 'English', 'de': 'Deutsch'}
    BABEL_DEFAULT_LOCALE = 'en'
    BABEL_DEFAULT_TIMEZONE = 'UTC'
    # Evaluated once, when this module is imported
    GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8')

    ELASTICSEARCH_BACKEND = os.environ.get("ELASTICSEARCH_BACKEND", default="https://search.fatcat.wiki")
    ELASTICSEARCH_FULLTEXT_INDEX = os.environ.get("ELASTICSEARCH_FULLTEXT_INDEX", default="covid19_fatcat_fulltext")

    FATCAT_DOMAIN = "covid19.fatcat.wiki"

    SENTRY_CONFIG = {
        'enable-threads': True,  # for uWSGI
        'release': GIT_REVISION,
        'tags': {
            'service': 'covid19.fatcat.wiki',
        },
    }


sentry_sdk.init(
    # set DSN via environment variable SENTRY_DSN
    integrations=[FlaskIntegration()]
)

app = Flask(__name__, static_url_path='/static')
app.config.from_object(BaseConfig())
app.log = create_logger(app)
babel = Babel(app)

# Imported here, not at the top of the file, because fatcat_covid19.search
# itself imports `app` from this module.
from fatcat_covid19.search import *

bp = Blueprint('search', __name__)


@bp.url_defaults
def add_language_code(endpoint, values):
    """Carry the language prefix of the current request over into built URLs."""
    # Default to False so URL building also works when pull_lang_code()
    # has not run for the current request.
    if getattr(g, 'lang_code_set', False):
        values.setdefault('lang_code', g.lang_code)


@bp.url_value_preprocessor
def pull_lang_code(endpoint, values):
    """Move ``lang_code`` out of the URL values into ``g``; 404 if unsupported."""
    g.lang_code_set = 'lang_code' in values
    g.lang_code = values.pop('lang_code', app.config['BABEL_DEFAULT_LOCALE'])
    if g.lang_code not in app.config['SUPPORTED_LANGUAGES']:
        abort(404)


@bp.route('/', methods=['GET'])
def page_home():
    """Render the landing page."""
    return render_template('home.html')


@bp.route('/fulltext/search', methods=['GET', 'POST'])
def fulltext_search():
    """Render the search page, with results when a ``q`` parameter is given."""
    query = request.args.get('q')

    # str.isnumeric() also accepts characters such as superscripts and
    # fractions, which int() rejects; isdecimal() only accepts digits
    # int() can parse. Anything else falls back to the first page.
    raw_offset = request.args.get('offset', '0')
    offset = int(raw_offset) if raw_offset.isdecimal() else 0

    if query is None:
        return render_template('fulltext_search.html', query=query)

    found = do_fulltext_search(query, offset=offset)
    return render_template('fulltext_search.html', found=found, query=query)


@bp.route('/about', methods=['GET'])
def page_about():
    """Render the "about" page in the language of the current request."""
    return render_template('about_{}.html'.format(g.lang_code))


@bp.route('/sources', methods=['GET'])
def page_sources():
    """Render the page describing the metadata sources."""
    return render_template('sources.html')


@bp.errorhandler(404)
def page_not_found(e):
    """Render the "not found" page."""
    return render_template('404.html'), 404


@bp.errorhandler(400)
def page_bad_request(e):
    """Render the "bad request" page."""
    return render_template('400.html'), 400


@bp.errorhandler(502)
@bp.errorhandler(503)
@bp.errorhandler(504)
@bp.errorhandler(500)
def page_server_error(e):
    """Render the generic error page; all handled 5xx codes are answered as 500."""
    return render_template('500.html'), 500


@app.route('/robots.txt', methods=['GET'])
def robots():
    """Serve static/robots.txt as plain text."""
    return send_from_directory(os.path.join(app.root_path, 'static'),
                               'robots.txt',
                               mimetype='text/plain')


# The same blueprint is mounted twice: once under a two-letter language
# prefix and once at the root.
# NOTE(review): newer Flask releases are believed to require a distinct
# `name=` when one blueprint is registered more than once -- confirm
# against the Flask version this is deployed with.
app.register_blueprint(bp, url_prefix='/<string(length=2):lang_code>/')
app.register_blueprint(bp, url_prefix='/')