From e5d0d98d0377c5833dc4fedb6d8df14f5489edb5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 12 Dec 2019 17:47:12 -0800 Subject: initial 'Save Paper Now' web form --- python/example.env | 4 ++ python/fatcat_web/forms.py | 47 +++++++++++++++++ python/fatcat_web/kafka.py | 34 +++++++++++++ python/fatcat_web/routes.py | 42 ++++++++++++++- python/fatcat_web/templates/release_save.html | 73 +++++++++++++++++++++++++++ python/fatcat_web/templates/release_view.html | 26 +++++++++- python/fatcat_web/web_config.py | 4 ++ 7 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 python/fatcat_web/kafka.py create mode 100644 python/fatcat_web/templates/release_save.html (limited to 'python') diff --git a/python/example.env b/python/example.env index fcf49712..120c9e6e 100644 --- a/python/example.env +++ b/python/example.env @@ -6,6 +6,10 @@ FATCAT_API_HOST="http://localhost:9411/v0" ELASTICSEARCH_BACKEND="http://localhost:9200" ELASTICSEARCH_RELEASE_INDEX="fatcat_release" ELASTICSEARCH_CONTAINER_INDEX="fatcat_container" +# for local dev use: +#KAFKA_PIXY_ENDPOINT="http://localhost:19092" +KAFKA_PIXY_ENDPOINT="" +KAFKA_SAVEPAPERNOW_TOPIC="sandcrawler-dev.ingest-file-requests" GITLAB_CLIENT_ID="" GITLAB_CLIENT_SECRET="" IA_XAUTH_CLIENT_ID="" diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index 206c5087..bd4e4bbd 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -363,3 +363,50 @@ class FileEntityForm(EntityEditForm): if self.edit_description.data: fe.edit_extra = dict(description=self.edit_description.data) +INGEST_TYPE_OPTIONS = [ + ('pdf', 'PDF Fulltext'), + ('html', 'HTML Fulltext'), + ('xml', 'XML Fulltext'), +] + +class SavePaperNowForm(FlaskForm): + + base_url = StringField( + "URL", + [validators.DataRequired(), + validators.URL()]) + ingest_type = SelectField( + "Content Type", + [validators.DataRequired()], + choices=INGEST_TYPE_OPTIONS, + default='pdf') + release_stage = SelectField( + "Publication 
Stage", + [validators.DataRequired()], + choices=release_stage_options, + default='') + + def to_ingest_request(self, release, actor='savepapernow-web'): + base_url = self.base_url.data + ext_ids = release.ext_ids.to_dict() + # by default this dict has a bunch of empty values + ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) + ingest_request = { + 'ingest_type': self.ingest_type.data, + 'ingest_request_source': actor, # TODO: deprecate? + 'actor': actor, + 'base_url': base_url, + 'fatcat': { + 'release_stage': release.release_stage, + 'release_ident': release.ident, + 'work_ident': release.work_id, + }, + 'ext_ids': ext_ids, + } + if self.release_stage.data: + ingest_request['release_stage'] = self.release_stage.data + if release.ext_ids.doi and base_url == "https://doi.org/{}".format(release.ext_ids.doi): + ingest_request['source'] = 'doi' + ingest_request['source_id'] = release.ext_ids.doi + return ingest_request + diff --git a/python/fatcat_web/kafka.py b/python/fatcat_web/kafka.py new file mode 100644 index 00000000..895e719f --- /dev/null +++ b/python/fatcat_web/kafka.py @@ -0,0 +1,34 @@ + +import requests + +from fatcat_web import Config + + +def kafka_pixy_produce(topic, msg, key=None, sync=True, timeout=5): + """ + Simple helper to public a message to the given Kafka topic, via the + configured kafka-pixy HTTP gateway + + topic: string + msg: string + key: optional, bytes + timeout: seconds + """ + + if not Config.KAFKA_PIXY_ENDPOINT: + raise Exception("Kafka produce error: kafka-pixy endpoint not configured") + + params = dict() + if key: + params['key'] = key + if sync: + params['sync'] = True + resp = requests.post( + "{}/topics/{}/messages".format(Config.KAFKA_PIXY_ENDPOINT, topic), + params=params, + data=msg, + headers={"Content-Type": "text/plain"}, + timeout=timeout, + ) + resp.raise_for_status() + print(resp.json()) diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a41f388d..cc0af5cc 100644 --- 
a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1,5 +1,6 @@ import os +import sys import json from flask import Flask, render_template, make_response, send_from_directory, \ request, url_for, abort, g, redirect, jsonify, session, flash, Response @@ -10,12 +11,14 @@ from fatcat_openapi_client import Editgroup, EditgroupAnnotation from fatcat_openapi_client.rest import ApiException from fatcat_tools.transforms import * from fatcat_tools.normal import * -from fatcat_web import app, api, auth_api, priv_api, mwoauth +from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth from fatcat_web.cors import crossdomain from fatcat_web.search import * from fatcat_web.entity_helpers import * from fatcat_web.graphics import * +from fatcat_web.kafka import * +from fatcat_web.forms import SavePaperNowForm ### Generic Entity Views #################################################### @@ -628,6 +631,43 @@ def reviewable_view(): abort(ae.status) return render_template('editgroup_reviewable.html', entries=entries) +@app.route('/release//save', methods=['GET', 'POST']) +def release_save(ident): + + form = SavePaperNowForm() + + # lookup release ident, ensure it exists + try: + release = api.get_release(ident) + except ApiException as ae: + abort(ae.status) + + if not Config.KAFKA_PIXY_ENDPOINT: + return render_template('release_save.html', entity=release, form=form, spn_status='not-configured'), 501 + + if form.is_submitted(): + if form.validate_on_submit(): + # got a valid spn request! 
try to send to kafka-pixy + msg = form.to_ingest_request(release) + try: + kafka_pixy_produce( + Config.KAFKA_SAVEPAPERNOW_TOPIC, + json.dumps(msg), + ) + except Exception as e: + print(e, file=sys.stderr) + return render_template('release_save.html', entity=release, form=form, spn_status='kafka-error'), 500 + return render_template('release_save.html', entity=release, form=form, spn_status='success'), 200 + elif form.errors: + return render_template('release_save.html', entity=release, form=form), 400 + + # form was not submitted; populate defaults + if release.release_stage: + form.release_stage.data = release.release_stage + if release.ext_ids.doi: + form.base_url.data = "https://doi.org/{}".format(release.ext_ids.doi) + return render_template('release_save.html', entity=release, form=form), 200 + ### Search ################################################################## @app.route('/search', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/release_save.html b/python/fatcat_web/templates/release_save.html new file mode 100644 index 00000000..29875d3d --- /dev/null +++ b/python/fatcat_web/templates/release_save.html @@ -0,0 +1,73 @@ +{% set release = entity %} +{% set entity_view = "save" %} +{% set entity_type = "release" %} +{% import "entity_macros.html" as entity_macros %} +{% import "edit_macros.html" as edit_macros %} +{% extends "entity_base.html" %} + +{% block entity_main %} + +
+
+

"Save Paper Now"

+ +{% if spn_status == "not-configured" %} + +
+
Error
+

The Save Paper Now feature isn't configured; sorry about that. +

+ +{% elif spn_status == "kafka-error" %} + +
+
Error
+

Whoops, something went wrong and we couldn't enqueue your request. This + didn't have anything to do with the URL you supplied; please try again later. +

+ +{% elif spn_status == "success" %} + +
+
Success
+

URL has been submitted to the bot queue for crawling. If fulltext content + is found, it will be imported into the catalog for review. Keep an eye on the + reviewable editgroups list (can take 5-10 minutes + depending on throughput and batch sizes). +

+ +{% else %} +
+ + +
+

Know of a legit fulltext copy of this publication on the public web? + Tell us the URL and we will crawl it and provide free perpetual access. + + {{ edit_macros.form_field_basic(form.base_url) }} + +

Important: check the publication stage of the file you are + submitting. We distinguish between pre-prints, manuscripts, and the + published version of record (if applicable). + +

+ {{ edit_macros.form_field_basic(form.release_stage) }} + {{ edit_macros.form_field_basic(form.ingest_type) }} +
+ +
+ +
+ Your request will automatically be enqueued for our bots to crawl and + process. All new files will be reviewed before being included in the + catalog. + +
+ +
+{% endif %} + +
+
+ +{% endblock %} diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 11b67148..b4e0ba25 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -257,7 +257,31 @@ {% elif entity.state == 'active' and entity.webcaptures != [] and entity.webcaptures[0].archive_urls != [] and entity.webcaptures[0].archive_urls[0].rel == "wayback" %} View Web Archive {% elif entity.state == 'active' %} -No Full Text Available +No Full Text Available + + + "Save Paper Now" +
+ Know of a fulltext copy on the public web? Submit a URL and we will archive it
+
+ +{# alternative SPN +
+
+ Save Paper Now +
+

Know of a fulltext copy on the public web? Submit a URL and we'll archive it +

+#} + +{# alternative SPN +
+ Save Paper Now +
know of a fulltext copy on the public web? submit a URL and we'll archive it +
+#} + {% endif %} {% if release.release_type or release.release_stage or release.release_year %} diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index ec37b66d..44175ab4 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -28,6 +28,10 @@ class Config(object): ELASTICSEARCH_RELEASE_INDEX = os.environ.get("ELASTICSEARCH_RELEASE_INDEX", default="fatcat_release") ELASTICSEARCH_CONTAINER_INDEX = os.environ.get("ELASTICSEARCH_CONTAINER_INDEX", default="fatcat_container") + # for save-paper-now. set to None if not configured, so we don't display forms/links + KAFKA_PIXY_ENDPOINT = os.environ.get("KAFKA_PIXY_ENDPOINT", default=None) or None + KAFKA_SAVEPAPERNOW_TOPIC="sandcrawler-dev.ingest-file-requests" + # for flask things, like session cookies FLASK_SECRET_KEY = os.environ.get("FLASK_SECRET_KEY", default=None) SECRET_KEY = FLASK_SECRET_KEY -- cgit v1.2.3