summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 17:47:12 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 19:49:45 -0800
commit e5d0d98d0377c5833dc4fedb6d8df14f5489edb5 (patch)
tree 0e757d593555e2e8d970ac8a893cd24fb340c41e
parent 7238a0ac6c977f6e3f72224eb63566577a082185 (diff)
download fatcat-e5d0d98d0377c5833dc4fedb6d8df14f5489edb5.tar.gz
fatcat-e5d0d98d0377c5833dc4fedb6d8df14f5489edb5.zip
initial 'Save Paper Now' web form
-rw-r--r-- python/example.env 4
-rw-r--r-- python/fatcat_web/forms.py 47
-rw-r--r-- python/fatcat_web/kafka.py 34
-rw-r--r-- python/fatcat_web/routes.py 42
-rw-r--r-- python/fatcat_web/templates/release_save.html 73
-rw-r--r-- python/fatcat_web/templates/release_view.html 26
-rw-r--r-- python/fatcat_web/web_config.py 4
7 files changed, 228 insertions, 2 deletions
diff --git a/python/example.env b/python/example.env
index fcf49712..120c9e6e 100644
--- a/python/example.env
+++ b/python/example.env
@@ -6,6 +6,10 @@ FATCAT_API_HOST="http://localhost:9411/v0"
ELASTICSEARCH_BACKEND="http://localhost:9200"
ELASTICSEARCH_RELEASE_INDEX="fatcat_release"
ELASTICSEARCH_CONTAINER_INDEX="fatcat_container"
+# for local dev use:
+#KAFKA_PIXY_ENDPOINT="http://localhost:19092"
+KAFKA_PIXY_ENDPOINT=""
+KAFKA_SAVEPAPERNOW_TOPIC="sandcrawler-dev.ingest-file-requests"
GITLAB_CLIENT_ID=""
GITLAB_CLIENT_SECRET=""
IA_XAUTH_CLIENT_ID=""
diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py
index 206c5087..bd4e4bbd 100644
--- a/python/fatcat_web/forms.py
+++ b/python/fatcat_web/forms.py
@@ -363,3 +363,50 @@ class FileEntityForm(EntityEditForm):
if self.edit_description.data:
fe.edit_extra = dict(description=self.edit_description.data)
+INGEST_TYPE_OPTIONS = [
+ ('pdf', 'PDF Fulltext'),
+ ('html', 'HTML Fulltext'),
+ ('xml', 'XML Fulltext'),
+]
+
+class SavePaperNowForm(FlaskForm):
+
+ base_url = StringField(
+ "URL",
+ [validators.DataRequired(),
+ validators.URL()])
+ ingest_type = SelectField(
+ "Content Type",
+ [validators.DataRequired()],
+ choices=INGEST_TYPE_OPTIONS,
+ default='pdf')
+ release_stage = SelectField(
+ "Publication Stage",
+ [validators.DataRequired()],
+ choices=release_stage_options,
+ default='')
+
+ def to_ingest_request(self, release, actor='savepapernow-web'):
+ base_url = self.base_url.data
+ ext_ids = release.ext_ids.to_dict()
+ # by default this dict has a bunch of empty values
+ ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
+ ingest_request = {
+ 'ingest_type': self.ingest_type.data,
+ 'ingest_request_source': actor, # TODO: deprecate?
+ 'actor': actor,
+ 'base_url': base_url,
+ 'fatcat': {
+ 'release_stage': release.release_stage,
+ 'release_ident': release.ident,
+ 'work_ident': release.work_id,
+ },
+ 'ext_ids': ext_ids,
+ }
+ if self.release_stage.data:
+ ingest_request['release_stage'] = self.release_stage.data
+ if release.ext_ids.doi and base_url == "https://doi.org/{}".format(release.ext_ids.doi):
+ ingest_request['source'] = 'doi'
+ ingest_request['source_id'] = release.ext_ids.doi
+ return ingest_request
+
diff --git a/python/fatcat_web/kafka.py b/python/fatcat_web/kafka.py
new file mode 100644
index 00000000..895e719f
--- /dev/null
+++ b/python/fatcat_web/kafka.py
@@ -0,0 +1,34 @@
+
+import requests
+
+from fatcat_web import Config
+
+
+def kafka_pixy_produce(topic, msg, key=None, sync=True, timeout=5):
+ """
+ Simple helper to publish a message to the given Kafka topic, via the
+ configured kafka-pixy HTTP gateway
+
+ topic: string
+ msg: string
+ key: optional, bytes
+ timeout: seconds
+ """
+
+ if not Config.KAFKA_PIXY_ENDPOINT:
+ raise Exception("Kafka produce error: kafka-pixy endpoint not configured")
+
+ params = dict()
+ if key:
+ params['key'] = key
+ if sync:
+ params['sync'] = True
+ resp = requests.post(
+ "{}/topics/{}/messages".format(Config.KAFKA_PIXY_ENDPOINT, topic),
+ params=params,
+ data=msg,
+ headers={"Content-Type": "text/plain"},
+ timeout=timeout,
+ )
+ resp.raise_for_status()
+ print(resp.json())
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index a41f388d..cc0af5cc 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -1,5 +1,6 @@
import os
+import sys
import json
from flask import Flask, render_template, make_response, send_from_directory, \
request, url_for, abort, g, redirect, jsonify, session, flash, Response
@@ -10,12 +11,14 @@ from fatcat_openapi_client import Editgroup, EditgroupAnnotation
from fatcat_openapi_client.rest import ApiException
from fatcat_tools.transforms import *
from fatcat_tools.normal import *
-from fatcat_web import app, api, auth_api, priv_api, mwoauth
+from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config
from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth
from fatcat_web.cors import crossdomain
from fatcat_web.search import *
from fatcat_web.entity_helpers import *
from fatcat_web.graphics import *
+from fatcat_web.kafka import *
+from fatcat_web.forms import SavePaperNowForm
### Generic Entity Views ####################################################
@@ -628,6 +631,43 @@ def reviewable_view():
abort(ae.status)
return render_template('editgroup_reviewable.html', entries=entries)
+@app.route('/release/<ident>/save', methods=['GET', 'POST'])
+def release_save(ident):
+
+ form = SavePaperNowForm()
+
+ # lookup release ident, ensure it exists
+ try:
+ release = api.get_release(ident)
+ except ApiException as ae:
+ abort(ae.status)
+
+ if not Config.KAFKA_PIXY_ENDPOINT:
+ return render_template('release_save.html', entity=release, form=form, spn_status='not-configured'), 501
+
+ if form.is_submitted():
+ if form.validate_on_submit():
+ # got a valid spn request! try to send to kafka-pixy
+ msg = form.to_ingest_request(release)
+ try:
+ kafka_pixy_produce(
+ Config.KAFKA_SAVEPAPERNOW_TOPIC,
+ json.dumps(msg),
+ )
+ except Exception as e:
+ print(e, file=sys.stderr)
+ return render_template('release_save.html', entity=release, form=form, spn_status='kafka-error'), 500
+ return render_template('release_save.html', entity=release, form=form, spn_status='success'), 200
+ elif form.errors:
+ return render_template('release_save.html', entity=release, form=form), 400
+
+ # form was not submitted; populate defaults
+ if release.release_stage:
+ form.release_stage.data = release.release_stage
+ if release.ext_ids.doi:
+ form.base_url.data = "https://doi.org/{}".format(release.ext_ids.doi)
+ return render_template('release_save.html', entity=release, form=form), 200
+
### Search ##################################################################
@app.route('/search', methods=['GET', 'POST'])
diff --git a/python/fatcat_web/templates/release_save.html b/python/fatcat_web/templates/release_save.html
new file mode 100644
index 00000000..29875d3d
--- /dev/null
+++ b/python/fatcat_web/templates/release_save.html
@@ -0,0 +1,73 @@
+{% set release = entity %}
+{% set entity_view = "save" %}
+{% set entity_type = "release" %}
+{% import "entity_macros.html" as entity_macros %}
+{% import "edit_macros.html" as edit_macros %}
+{% extends "entity_base.html" %}
+
+{% block entity_main %}
+
+<div class="ui container text" style="margin-top: 2em;">
+<div class="ui segment" style="padding: 3em;">
+<h1 class="ui header">"Save Paper Now"</h1>
+
+{% if spn_status == "not-configured" %}
+
+<div class="ui error message" style="margin: 2em;">
+ <div class="header">Error</div>
+ <p>Save Paper Now feature isn't configured, sorry about that.
+</div>
+
+{% elif spn_status == "kafka-error" %}
+
+<div class="ui error message" style="margin: 2em;">
+ <div class="header">Error</div>
+ <p>Whoops, something went wrong and we couldn't enqueue your request. This
+ didn't have anything to do with the URL you supplied; please try again later.
+</div>
+
+{% elif spn_status == "success" %}
+
+<div class="ui positive message" style="margin: 2em;">
+ <div class="header">Success</div>
+ <p>URL has been submitted to the bot queue for crawling. If fulltext content
+ is found, it will be imported into the catalog for review. Keep an eye on the
+ <a href="/reviewable">reviewable editgroups</a> list (can take 5-10 minutes
+ depending on throughput and batch sizes).
+</div>
+
+{% else %}
+<form class="ui form" id="save_release_form" method="POST" action="/release/{{ release.ident }}/save">
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
+
+ <br>
+ <p>Know of a legit fulltext copy of this publication on the public web?
+ Tell us the URL and we will crawl it and provide free perpetual access.
+
+ {{ edit_macros.form_field_basic(form.base_url) }}
+
+ <p style="margin-top: 2em; margin-bottom: 2em;"><b>Important:</b> check the publication stage of the file you are
+ submitting. We distinguish between pre-prints, manuscripts, and the
+ published version of record (if applicable).
+
+ <div class="ui equal width fields">
+ {{ edit_macros.form_field_basic(form.release_stage) }}
+ {{ edit_macros.form_field_basic(form.ingest_type) }}
+ </div>
+
+ <br>
+ <input class="ui primary submit button big left floated" type="submit" value="Submit URL" style="margin-right: 1em;">
+ <div>
+ <i>Your request will automatically be enqueued for our bots to crawl and
+ process. All new files will be reviewed before being included in the
+ catalog
+ </i>
+ </div>
+
+</form>
+{% endif %}
+
+</div>
+</div>
+
+{% endblock %}
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 11b67148..b4e0ba25 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -257,7 +257,31 @@
{% elif entity.state == 'active' and entity.webcaptures != [] and entity.webcaptures[0].archive_urls != [] and entity.webcaptures[0].archive_urls[0].rel == "wayback" %}
<a href="{{ entity.webcaptures[0].archive_urls[0].url }}{{ entity.webcaptures[0]._wayback_suffix }}" class="ui top attached fluid huge green button"><i class="file archive outline icon"></i>View Web Archive</a>
{% elif entity.state == 'active' %}
-<span class="ui top attached fluid huge grey button"><i class="file cross icon"></i>No Full Text Available</span>
+<span class="ui top attached fluid huge grey button"><i class="ban icon"></i>No Full Text Available</span>
+
+<a href="/release/{{ release.ident }}/save" class="ui attached fluid huge blue button">
+ <i class="cloud download icon"></i>"Save Paper Now"
+ <div style="margin-top: 0.8em; font-size: smaller; text-align: left;">
+ Know of a fulltext copy on the public web? Submit a URL and we will archive it
+ </div>
+</a>
+
+{# alternative SPN
+<div class="ui segment attached">
+ <center>
+ <a class="ui blue huge button" href="/release/{{ release.ident }}/save" title="save paper now">Save Paper Now</a>
+ </center>
+ <p style="margin-top: 0.5em;">Know of a fulltext copy on the public web? Submit a URL and we'll archive it
+</div>
+#}
+
+{# alternative SPN
+<div class="ui segment attached yellow inverted accordion">
+ <b><a href="/release/{{ release.ident }}/save" title="save paper now" style="color: black;">Save Paper Now</a></b>
+ <br>know of a fulltext copy on the public web? submit a URL and we'll archive it
+</div>
+#}
+
{% endif %}
{% if release.release_type or release.release_stage or release.release_year %}
diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py
index ec37b66d..44175ab4 100644
--- a/python/fatcat_web/web_config.py
+++ b/python/fatcat_web/web_config.py
@@ -28,6 +28,10 @@ class Config(object):
ELASTICSEARCH_RELEASE_INDEX = os.environ.get("ELASTICSEARCH_RELEASE_INDEX", default="fatcat_release")
ELASTICSEARCH_CONTAINER_INDEX = os.environ.get("ELASTICSEARCH_CONTAINER_INDEX", default="fatcat_container")
+ # for save-paper-now. set to None if not configured, so we don't display forms/links
+ KAFKA_PIXY_ENDPOINT = os.environ.get("KAFKA_PIXY_ENDPOINT", default=None) or None
+ KAFKA_SAVEPAPERNOW_TOPIC="sandcrawler-dev.ingest-file-requests"
+
# for flask things, like session cookies
FLASK_SECRET_KEY = os.environ.get("FLASK_SECRET_KEY", default=None)
SECRET_KEY = FLASK_SECRET_KEY