diff options
Diffstat (limited to 'python/fatcat_web')
| -rw-r--r-- | python/fatcat_web/forms.py | 49 | ||||
| -rw-r--r-- | python/fatcat_web/kafka.py | 34 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 46 | ||||
| -rw-r--r-- | python/fatcat_web/templates/release_save.html | 73 | ||||
| -rw-r--r-- | python/fatcat_web/templates/release_view.html | 26 | ||||
| -rw-r--r-- | python/fatcat_web/web_config.py | 4 | 
6 files changed, 230 insertions, 2 deletions
| diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index 206c5087..5539cc20 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -363,3 +363,52 @@ class FileEntityForm(EntityEditForm):          if self.edit_description.data:              fe.edit_extra = dict(description=self.edit_description.data) +INGEST_TYPE_OPTIONS = [ +    ('pdf', 'PDF Fulltext'), +    ('html', 'HTML Fulltext'), +    ('xml', 'XML Fulltext'), +] + +class SavePaperNowForm(FlaskForm): + +    base_url = StringField( +        "URL", +        [validators.DataRequired(), +         validators.URL()]) +    ingest_type = SelectField( +        "Content Type", +        [validators.DataRequired()], +        choices=INGEST_TYPE_OPTIONS, +        default='pdf') +    release_stage = SelectField( +        "Publication Stage", +        [validators.DataRequired()], +        choices=release_stage_options, +        default='') + +    def to_ingest_request(self, release, ingest_request_source='savepapernow'): +        base_url = self.base_url.data +        ext_ids = release.ext_ids.to_dict() +        # by default this dict has a bunch of empty values +        ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) +        ingest_request = { +            'ingest_type': self.ingest_type.data, +            'ingest_request_source': ingest_request_source, +            'base_url': base_url, +            'fatcat': { +                'release_ident': release.ident, +                'work_ident': release.work_id, +            }, +            'ext_ids': ext_ids, +        } +        if self.release_stage.data: +            ingest_request['release_stage'] = self.release_stage.data + +        if release.ext_ids.doi and base_url == "https://doi.org/{}".format(release.ext_ids.doi): +            ingest_request['link_source'] = 'doi' +            ingest_request['link_source_id'] = release.ext_ids.doi +        elif release.ext_ids.arxiv and base_url == "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv): +            ingest_request['link_source'] = 'arxiv' +            ingest_request['link_source_id'] = release.ext_ids.arxiv +        return ingest_request + diff --git a/python/fatcat_web/kafka.py b/python/fatcat_web/kafka.py new file mode 100644 index 00000000..895e719f --- /dev/null +++ b/python/fatcat_web/kafka.py @@ -0,0 +1,34 @@ + +import requests + +from fatcat_web import Config + + +def kafka_pixy_produce(topic, msg, key=None, sync=True, timeout=5): +    """ +    Simple helper to public a message to the given Kafka topic, via the +    configured kafka-pixy HTTP gateway + +    topic: string +    msg: string +    key: optional, bytes +    timeout: seconds +    """ + +    if not Config.KAFKA_PIXY_ENDPOINT: +        raise Exception("Kafka produce error: kafka-pixy endpoint not configured") + +    params = dict() +    if key: +        params['key'] = key +    if sync: +        params['sync'] = True +    resp = requests.post( +        "{}/topics/{}/messages".format(Config.KAFKA_PIXY_ENDPOINT, topic), +        params=params, +        data=msg, +        headers={"Content-Type": "text/plain"}, +        timeout=timeout, +    ) +    resp.raise_for_status() +    print(resp.json()) diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a41f388d..8583d255 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1,5 +1,6 @@  import os +import sys  import json  from flask import Flask, render_template, make_response, send_from_directory, \      request, url_for, abort, g, redirect, jsonify, session, flash, Response @@ -10,12 +11,14 @@ from fatcat_openapi_client import Editgroup, EditgroupAnnotation  from fatcat_openapi_client.rest import ApiException  from fatcat_tools.transforms import *  from fatcat_tools.normal import * -from fatcat_web import app, api, auth_api, priv_api, mwoauth +from fatcat_web import app, api, auth_api, priv_api, mwoauth, Config  from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth, handle_wmoauth  from fatcat_web.cors import crossdomain  from fatcat_web.search import *  from fatcat_web.entity_helpers import *  from fatcat_web.graphics import * +from fatcat_web.kafka import * +from fatcat_web.forms import SavePaperNowForm  ### Generic Entity Views #################################################### @@ -628,6 +631,47 @@ def reviewable_view():          abort(ae.status)      return render_template('editgroup_reviewable.html', entries=entries) +@app.route('/release/<ident>/save', methods=['GET', 'POST']) +def release_save(ident): + +    form = SavePaperNowForm() + +    # lookup release ident, ensure it exists +    try: +        release = api.get_release(ident) +    except ApiException as ae: +        abort(ae.status) + +    if not Config.KAFKA_PIXY_ENDPOINT: +        return render_template('release_save.html', entity=release, form=form, spn_status='not-configured'), 501 + +    if form.is_submitted(): +        if form.validate_on_submit(): +            # got a valid spn request! try to send to kafka-pixy +            msg = form.to_ingest_request(release, ingest_request_source="savepapernow-web") +            try: +                kafka_pixy_produce( +                    Config.KAFKA_SAVEPAPERNOW_TOPIC, +                    json.dumps(msg, sort_keys=True), +                ) +            except Exception as e: +                print(e, file=sys.stderr) +                return render_template('release_save.html', entity=release, form=form, spn_status='kafka-error'), 500 +            return render_template('release_save.html', entity=release, form=form, spn_status='success'), 200 +        elif form.errors: +            return render_template('release_save.html', entity=release, form=form), 400 + +    # form was not submitted; populate defaults +    if release.release_stage: +        form.release_stage.data = release.release_stage +    if release.ext_ids.doi: +        form.base_url.data = "https://doi.org/{}".format(release.ext_ids.doi) +    elif release.ext_ids.arxiv: +        form.base_url.data = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) +    elif release.ext_ids.pmcid: +        form.base_url.data = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) +    return render_template('release_save.html', entity=release, form=form), 200 +  ### Search ##################################################################  @app.route('/search', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/release_save.html b/python/fatcat_web/templates/release_save.html new file mode 100644 index 00000000..29875d3d --- /dev/null +++ b/python/fatcat_web/templates/release_save.html @@ -0,0 +1,73 @@ +{% set release = entity %} +{% set entity_view = "save" %} +{% set entity_type = "release" %} +{% import "entity_macros.html" as entity_macros %} +{% import "edit_macros.html" as edit_macros %} +{% extends "entity_base.html" %} + +{% block entity_main %} + +<div class="ui container text" style="margin-top: 2em;"> +<div class="ui segment" style="padding: 3em;"> +<h1 class="ui header">"Save Paper Now"</h1> + +{% if spn_status == "not-configured" %} + +<div class="ui error message" style="margin: 2em;"> +  <div class="header">Error</div> +  <p>Save Paper Now feature isn't configured, sorry about that. +</div> + +{% elif spn_status == "kafka-error" %} + +<div class="ui error message" style="margin: 2em;"> +  <div class="header">Error</div> +  <p>Whoops, something went wrong and we couldn't enqueue your request. This +  didn't have anything to do with the URL you supplied; please try again later. +</div> + +{% elif spn_status == "success" %} + +<div class="ui positive message" style="margin: 2em;"> +  <div class="header">Success</div> +  <p>URL has been submitted to the bot queue for crawling. If fulltext content +  is found, it will be imported into the catalog for review. Keep an eye on the +  <a href="/reviewable">reviewable editgroups</a> list (can take 5-10 minutes +  depending on throughput and batch sizes). +</div> + +{% else %} +<form class="ui form" id="save_release_form" method="POST" action="/release/{{ release.ident }}/save"> +  <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/> + +  <br> +  <p>Know of a legit fulltext copy of this publication on the public web? +  Tell us the URL and we will crawl it and provide free perpetual access. + +  {{ edit_macros.form_field_basic(form.base_url) }} + +  <p style="margin-top: 2em; margin-bottom: 2em;"><b>Important:</b> check the publication stage of the file you are +  submitting. We distinguish between pre-prints, manuscripts, and the +  published version of record (if applicable). + +  <div class="ui equal width fields"> +    {{ edit_macros.form_field_basic(form.release_stage) }} +    {{ edit_macros.form_field_basic(form.ingest_type) }} +  </div> + +  <br> +  <input class="ui primary submit button big left floated" type="submit" value="Submit URL" style="margin-right: 1em;"> +  <div> +    <i>Your request will automatically be enqueued for our bots to crawl and +    process. All new files will be reviewed before being included in the +    catalog +    </i> +  </div> + +</form> +{% endif %} + +</div> +</div> + +{% endblock %} diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 11b67148..b4e0ba25 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -257,7 +257,31 @@  {% elif entity.state == 'active' and entity.webcaptures != [] and entity.webcaptures[0].archive_urls != [] and entity.webcaptures[0].archive_urls[0].rel == "wayback" %}  <a href="{{ entity.webcaptures[0].archive_urls[0].url }}{{ entity.webcaptures[0]._wayback_suffix }}" class="ui top attached fluid huge green button"><i class="file archive outline icon"></i>View Web Archive</a>  {% elif entity.state == 'active' %} -<span class="ui top attached fluid huge grey button"><i class="file cross icon"></i>No Full Text Available</span> +<span class="ui top attached fluid huge grey button"><i class="ban icon"></i>No Full Text Available</span> + +<a href="/release/{{ release.ident }}/save" class="ui attached fluid huge blue button"> +  <i class="cloud download icon"></i>"Save Paper Now" +  <div style="margin-top: 0.8em; font-size: smaller; text-align: left;"> +    Know of a fulltext copy of on the public web? Submit a URL and we will archive it +  </div> +</a> + +{# alternative SPN +<div class="ui segment attached"> +  <center> +    <a class="ui blue huge button" href="/release/{{ release.ident }}/save" title="save paper now">Save Paper Now</a> +  </center> +  <p style="margin-top: 0.5em;">Know of a fulltext copy on the public web? Submit a URL and we'll archive it +</div> +#} + +{# alternative SPN +<div class="ui segment attached yellow inverted accordion"> +  <b><a href="/release/{{ release.ident }}/save" title="save paper now" style="color: black;">Save Paper Now</a></b> +  <br>know of a fulltext copy on the public web? submit a URL and we'll archive it +</div> +#} +  {% endif %}  {% if release.release_type or release.release_stage or release.release_year %} diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index ec37b66d..0cb153d6 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -28,6 +28,10 @@ class Config(object):      ELASTICSEARCH_RELEASE_INDEX = os.environ.get("ELASTICSEARCH_RELEASE_INDEX", default="fatcat_release")      ELASTICSEARCH_CONTAINER_INDEX = os.environ.get("ELASTICSEARCH_CONTAINER_INDEX", default="fatcat_container") +    # for save-paper-now. set to None if not configured, so we don't display forms/links +    KAFKA_PIXY_ENDPOINT = os.environ.get("KAFKA_PIXY_ENDPOINT", default=None) or None +    KAFKA_SAVEPAPERNOW_TOPIC = os.environ.get("KAFKA_SAVEPAPERNOW_TOPIC", default="sandcrawler-dev.ingest-file-requests") +      # for flask things, like session cookies      FLASK_SECRET_KEY = os.environ.get("FLASK_SECRET_KEY", default=None)      SECRET_KEY = FLASK_SECRET_KEY | 
