From 7f85ecea4e5a844ad78d129ed0b32a759ca7c1ad Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 Sep 2021 22:26:07 -0700 Subject: add GET w/ query params to reference match endpoint (and JSON version) --- python/fatcat_web/forms.py | 12 ++++++++ python/fatcat_web/ref_routes.py | 37 ++++++++++++++++++++---- python/fatcat_web/templates/reference_match.html | 4 ++- 3 files changed, 47 insertions(+), 6 deletions(-) (limited to 'python') diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index a856ef22..2757ebd2 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -487,6 +487,10 @@ class EntityTomlForm(EntityEditForm): class ReferenceMatchForm(FlaskForm): + class Meta: + # this is an API, so disable CSRF + csrf = False + submit_type = SelectField('submit_type', [validators.DataRequired()], choices=['parse', 'match']) @@ -496,12 +500,20 @@ class ReferenceMatchForm(FlaskForm): title = StringField("Title") journal = StringField("Journal or Conference") first_author = StringField("First Author") + #author_names = StringField("Author Names") #year = IntegerField('Year Released', # [validators.Optional(True), valid_year]) year = StringField("Year Released") + date = StringField("Date Released") volume = StringField("Volume") issue = StringField("Issue") pages = StringField("Pages") + publisher = StringField("Publisher") + doi = StringField("DOI") + pmid = StringField("PubMed Identifier (PMID)") + arxiv_id = StringField("arxiv.org Identifier") + release_type = StringField("Release Type") + release_stage = StringField("Release Stage") @staticmethod def from_grobid_parse(parse_dict, raw_citation): diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index d4219012..33d2f725 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -3,13 +3,16 @@ Flask endpoints for reference (citation) endpoints. Eg, listing references "inbound" and "outbound" from a specific release or work. """ -from flask import render_template, request, Response +import json + +from flask import render_template, request, Response, jsonify from fatcat_openapi_client import * from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs, RefHits from fatcat_tools.transforms.access import release_access_options +from fatcat_tools.transforms.entities import entity_to_dict from fatcat_web import app, api from fatcat_web.cors import crossdomain from fatcat_web.forms import * @@ -92,16 +95,18 @@ def wikipedia_view_refs_outbound(wiki_lang: str, wiki_article: str): hits = _refs_web("out", wikipedia_article=wikipedia_article) return render_template('wikipedia_view_fuzzy_refs.html', wiki_article=wiki_article, wiki_lang=wiki_lang, wiki_url=wiki_url, direction="out", hits=hits), 200 - @app.route('/reference/match', methods=['GET', 'POST']) def reference_match(): - form = ReferenceMatchForm() grobid_status = None grobid_dict = None - if form.is_submitted(): - if form.validate_on_submit(): + form = ReferenceMatchForm() + if not form.is_submitted() and request.args.get('submit_type'): + form = ReferenceMatchForm(request.args) + + if form.is_submitted() or request.args.get('title'): + if form.validate(): if form.submit_type.data == 'parse': resp_xml = grobid_api_process_citation(form.raw_citation.data) if not resp_xml: @@ -166,3 +171,25 @@ def wikipedia_view_refs_outbound_json(wiki_lang: str, wiki_article: str): wikipedia_article = wiki_lang + ":" + wiki_article hits = _refs_web("out", wikipedia_article=wikipedia_article) return Response(hits.json(exclude_unset=True), mimetype="application/json") + + +@app.route('/reference/match.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def reference_match_json(): + form = ReferenceMatchForm(request.args) + if form.validate(): + if form.submit_type.data == 'match': + matches = close_fuzzy_biblio_matches(es_client=app.es_client, biblio=form.data, match_limit=10) or [] + else: + raise NotImplementedError() + for m in matches: + # expand releases more completely + m.release = api.get_release(m.release.ident, expand="container,files,filesets,webcaptures", hide="abstract,refs") + # hack in access options + m.access_options = release_access_options(m.release) + + # and convert to dict (for jsonify) + m.release = entity_to_dict(m.release) + return jsonify(matches), 200 + else: + return Response(json.dumps(dict(errors=form.errors)), mimetype="application/json", status=400) diff --git a/python/fatcat_web/templates/reference_match.html b/python/fatcat_web/templates/reference_match.html index f2335f52..08ab33fc 100644 --- a/python/fatcat_web/templates/reference_match.html +++ b/python/fatcat_web/templates/reference_match.html @@ -83,7 +83,9 @@ {% if match.access_options %} {{ match.access_options[0].access_type.name }} - {% endif %} + {% else %} + no fulltext + {% endif %} {% endfor %} -- cgit v1.2.3 From 6cbfaaa5e58ae4c0b482e3573e7e99300a857af8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 18 Oct 2021 10:42:26 -0700 Subject: access: populate thumbnail_url for PDFs --- python/fatcat_tools/transforms/access.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index 5ed64c7c..39d4c6d3 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -36,10 +36,16 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: """ Extracts access options from a release. - TODO: proper implementation + TODO: proper implementation and filtering, instead of just returning first + option found """ options = [] for f in (release.files or []): + thumbnail_url = None + if f.mimetype == 'application/pdf' and f.sha1 and f.urls: + # NOTE: scholar.archive.org does an actual database check before + # generating these URLs, but we skip that for speed + thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg" for u in (f.urls or []): if '://web.archive.org/' in u.url: return [AccessOption( @@ -47,7 +53,7 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: access_url=u.url, mimetype=f.mimetype, size_bytes=f.size, - thumbnail_url=None + thumbnail_url=thumbnail_url, )] elif '://archive.org/' in u.url: return [AccessOption( @@ -55,6 +61,6 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: access_url=u.url, mimetype=f.mimetype, size_bytes=f.size, - thumbnail_url=None + thumbnail_url=thumbnail_url, )] return options -- cgit v1.2.3 From 3752237a30db843fb84a4197d7047f1c34eb5df2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 18 Oct 2021 10:42:47 -0700 Subject: match: fix access_options in return --- python/fatcat_web/ref_routes.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 33d2f725..2d8ed413 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -182,14 +182,18 @@ def reference_match_json(): matches = close_fuzzy_biblio_matches(es_client=app.es_client, biblio=form.data, match_limit=10) or [] else: raise NotImplementedError() + resp = [] for m in matches: # expand releases more completely m.release = api.get_release(m.release.ident, expand="container,files,filesets,webcaptures", hide="abstract,refs") # hack in access options m.access_options = release_access_options(m.release) - # and convert to dict (for jsonify) - m.release = entity_to_dict(m.release) - return jsonify(matches), 200 + # and manually convert to dict (for jsonify) + info = m.__dict__ + info['release'] = entity_to_dict(m.release) + info['access_options'] = [o.dict() for o in m.access_options] + resp.append(info) + return jsonify(resp), 200 else: return Response(json.dumps(dict(errors=form.errors)), mimetype="application/json", status=400) -- cgit v1.2.3