diff options
Diffstat (limited to 'python/fatcat_web')
-rw-r--r-- | python/fatcat_web/forms.py | 49 | ||||
-rw-r--r-- | python/fatcat_web/kafka.py | 34 | ||||
-rw-r--r-- | python/fatcat_web/routes.py | 46 | ||||
-rw-r--r-- | python/fatcat_web/templates/release_save.html | 73 | ||||
-rw-r--r-- | python/fatcat_web/templates/release_view.html | 26 | ||||
-rw-r--r-- | python/fatcat_web/web_config.py | 4 |
6 files changed, 230 insertions, 2 deletions
diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index 206c5087..5539cc20 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -363,3 +363,52 @@ class FileEntityForm(EntityEditForm): if self.edit_description.data: fe.edit_extra = dict(description=self.edit_description.data) +INGEST_TYPE_OPTIONS = [ + ('pdf', 'PDF Fulltext'), + ('html', 'HTML Fulltext'), + ('xml', 'XML Fulltext'), +] + +class SavePaperNowForm(FlaskForm): + + base_url = StringField( + "URL", + [validators.DataRequired(), + validators.URL()]) + ingest_type = SelectField( + "Content Type", + [validators.DataRequired()], + choices=INGEST_TYPE_OPTIONS, + default='pdf') + release_stage = SelectField( + "Publication Stage", + [validators.DataRequired()], + choices=release_stage_options, + default='') + + def to_ingest_request(self, release, ingest_request_source='savepapernow'): + base_url = self.base_url.data + ext_ids = release.ext_ids.to_dict() + # by default this dict has a bunch of empty values + ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) + ingest_request = { + 'ingest_type': self.ingest_type.data, + 'ingest_request_source': ingest_request_source, + 'base_url': base_url, + 'fatcat': { + 'release_ident': release.ident, + 'work_ident': release.work_id, + }, + 'ext_ids': ext_ids, + } + if self.release_stage.data: + ingest_request['release_stage'] = self.release_stage.data + + if release.ext_ids.doi and base_url == "https://doi.org/{}".format(release.ext_ids.doi): + ingest_request['link_source'] = 'doi' + ingest_request['link_source_id'] = release.ext_ids.doi + elif release.ext_ids.arxiv and base_url == "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv): + ingest_request['link_source'] = 'arxiv' + ingest_request['link_source_id'] = release.ext_ids.arxiv + return ingest_request + diff --git a/python/fatcat_web/kafka.py b/python/fatcat_web/kafka.py new file mode 100644 index 00000000..895e719f --- /dev/null +++ b/python/fatcat_web/kafka.py @@ -0,0 +1,34 @@ + +import requests + +from fatcat_web import Config + + +def kafka_pixy_produce(topic, msg, key=None, sync=True, timeout=5): + """ + Simple helper to public a message to the given Kafka topic, via the + configured kafka-pixy HTTP gateway + + topic: string + msg: string + key: optional, bytes + timeout: seconds + """ + + if not Config.KAFKA_PIXY_ENDPOINT: + raise Exception("Kafka produce error: kafka-pixy endpoint not configured") + + params = dict() + if key: + params['key'] = key + if sync: + params['sync'] = True + resp = requests.post( + "{}/topics/{}/messages".format(Config.KAFKA_PIXY_ENDPOINT, topic), + params=params, + data=msg, + headers={"Content-Type": "text/plain"}, + timeout=timeout, + ) + resp.raise_for_status() + print(resp.json()) diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a41f388d..8583d255 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1,5 +1,6 @@ import os +import sys import json from flask import Flask, render_template, make_response, send_from_directory, \ request, url_for, abort, g, redirect, jsonify, session, flash, Response @@ -10,12 +11,14 @@ from fatcat_openapi_client import Editgroup, EditgroupAnnotation from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import * from fatcat_tools.normal import * -from fatcat_web import app, api, auth_api, priv_api, mwoauth +from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth from fatcat_web.cors import crossdomain from fatcat_web.search import * from fatcat_web.entity_helpers import * from fatcat_web.graphics import * +from fatcat_web.kafka import * +from fatcat_web.forms import SavePaperNowForm ### Generic Entity Views #################################################### @@ -628,6 +631,47 @@ def reviewable_view(): abort(ae.status) return render_template('editgroup_reviewable.html', entries=entries) +@app.route('/release/<ident>/save', methods=['GET', 'POST']) +def release_save(ident): + + form = SavePaperNowForm() + + # lookup release ident, ensure it exists + try: + release = api.get_release(ident) + except ApiException as ae: + abort(ae.status) + + if not Config.KAFKA_PIXY_ENDPOINT: + return render_template('release_save.html', entity=release, form=form, spn_status='not-configured'), 501 + + if form.is_submitted(): + if form.validate_on_submit(): + # got a valid spn request! try to send to kafka-pixy + msg = form.to_ingest_request(release, ingest_request_source="savepapernow-web") + try: + kafka_pixy_produce( + Config.KAFKA_SAVEPAPERNOW_TOPIC, + json.dumps(msg, sort_keys=True), + ) + except Exception as e: + print(e, file=sys.stderr) + return render_template('release_save.html', entity=release, form=form, spn_status='kafka-error'), 500 + return render_template('release_save.html', entity=release, form=form, spn_status='success'), 200 + elif form.errors: + return render_template('release_save.html', entity=release, form=form), 400 + + # form was not submitted; populate defaults + if release.release_stage: + form.release_stage.data = release.release_stage + if release.ext_ids.doi: + form.base_url.data = "https://doi.org/{}".format(release.ext_ids.doi) + elif release.ext_ids.arxiv: + form.base_url.data = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) + elif release.ext_ids.pmcid: + form.base_url.data = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) + return render_template('release_save.html', entity=release, form=form), 200 + ### Search ################################################################## @app.route('/search', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/release_save.html b/python/fatcat_web/templates/release_save.html new file mode 100644 index 00000000..29875d3d --- /dev/null +++ b/python/fatcat_web/templates/release_save.html @@ -0,0 +1,73 @@ +{% set release = entity %} +{% set entity_view = "save" %} +{% set entity_type = "release" %} +{% import "entity_macros.html" as entity_macros %} +{% import "edit_macros.html" as edit_macros %} +{% extends "entity_base.html" %} + +{% block entity_main %} + +<div class="ui container text" style="margin-top: 2em;"> +<div class="ui segment" style="padding: 3em;"> +<h1 class="ui header">"Save Paper Now"</h1> + +{% if spn_status == "not-configured" %} + +<div class="ui error message" style="margin: 2em;"> + <div class="header">Error</div> + <p>Save Paper Now feature isn't configured, sorry about that. +</div> + +{% elif spn_status == "kafka-error" %} + +<div class="ui error message" style="margin: 2em;"> + <div class="header">Error</div> + <p>Whoops, something went wrong and we couldn't enqueue your request. This + didn't have anything to do with the URL you supplied; please try again later. +</div> + +{% elif spn_status == "success" %} + +<div class="ui positive message" style="margin: 2em;"> + <div class="header">Success</div> + <p>URL has been submitted to the bot queue for crawling. If fulltext content + is found, it will be imported into the catalog for review. Keep an eye on the + <a href="/reviewable">reviewable editgroups</a> list (can take 5-10 minutes + depending on throughput and batch sizes). +</div> + +{% else %} +<form class="ui form" id="save_release_form" method="POST" action="/release/{{ release.ident }}/save"> + <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/> + + <br> + <p>Know of a legit fulltext copy of this publication on the public web? + Tell us the URL and we will crawl it and provide free perpetual access. + + {{ edit_macros.form_field_basic(form.base_url) }} + + <p style="margin-top: 2em; margin-bottom: 2em;"><b>Important:</b> check the publication stage of the file you are + submitting. We distinguish between pre-prints, manuscripts, and the + published version of record (if applicable). + + <div class="ui equal width fields"> + {{ edit_macros.form_field_basic(form.release_stage) }} + {{ edit_macros.form_field_basic(form.ingest_type) }} + </div> + + <br> + <input class="ui primary submit button big left floated" type="submit" value="Submit URL" style="margin-right: 1em;"> + <div> + <i>Your request will automatically be enqueued for our bots to crawl and + process. All new files will be reviewed before being included in the + catalog + </i> + </div> + +</form> +{% endif %} + +</div> +</div> + +{% endblock %} diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 11b67148..b4e0ba25 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -257,7 +257,31 @@ {% elif entity.state == 'active' and entity.webcaptures != [] and entity.webcaptures[0].archive_urls != [] and entity.webcaptures[0].archive_urls[0].rel == "wayback" %} <a href="{{ entity.webcaptures[0].archive_urls[0].url }}{{ entity.webcaptures[0]._wayback_suffix }}" class="ui top attached fluid huge green button"><i class="file archive outline icon"></i>View Web Archive</a> {% elif entity.state == 'active' %} -<span class="ui top attached fluid huge grey button"><i class="file cross icon"></i>No Full Text Available</span> +<span class="ui top attached fluid huge grey button"><i class="ban icon"></i>No Full Text Available</span> + +<a href="/release/{{ release.ident }}/save" class="ui attached fluid huge blue button"> + <i class="cloud download icon"></i>"Save Paper Now" + <div style="margin-top: 0.8em; font-size: smaller; text-align: left;"> + Know of a fulltext copy of on the public web? Submit a URL and we will archive it + </div> +</a> + +{# alternative SPN +<div class="ui segment attached"> + <center> + <a class="ui blue huge button" href="/release/{{ release.ident }}/save" title="save paper now">Save Paper Now</a> + </center> + <p style="margin-top: 0.5em;">Know of a fulltext copy on the public web? Submit a URL and we'll archive it +</div> +#} + +{# alternative SPN +<div class="ui segment attached yellow inverted accordion"> + <b><a href="/release/{{ release.ident }}/save" title="save paper now" style="color: black;">Save Paper Now</a></b> + <br>know of a fulltext copy on the public web? submit a URL and we'll archive it +</div> +#} + {% endif %} {% if release.release_type or release.release_stage or release.release_year %} diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index ec37b66d..0cb153d6 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -28,6 +28,10 @@ class Config(object): ELASTICSEARCH_RELEASE_INDEX = os.environ.get("ELASTICSEARCH_RELEASE_INDEX", default="fatcat_release") ELASTICSEARCH_CONTAINER_INDEX = os.environ.get("ELASTICSEARCH_CONTAINER_INDEX", default="fatcat_container") + # for save-paper-now. set to None if not configured, so we don't display forms/links + KAFKA_PIXY_ENDPOINT = os.environ.get("KAFKA_PIXY_ENDPOINT", default=None) or None + KAFKA_SAVEPAPERNOW_TOPIC = os.environ.get("KAFKA_SAVEPAPERNOW_TOPIC", default="sandcrawler-dev.ingest-file-requests") + # for flask things, like session cookies FLASK_SECRET_KEY = os.environ.get("FLASK_SECRET_KEY", default=None) SECRET_KEY = FLASK_SECRET_KEY |