diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-04-15 23:31:07 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-07-23 10:55:09 -0700 |
commit | 314aba35d06eb80be0c5ffc068774bb9bca38e76 (patch) | |
tree | 1f780f3e2e1808247cffc167f73eba3272353df7 | |
parent | 15680e0caae7ff6e24ddca8584b0c590d2b30581 (diff) | |
download | fatcat-314aba35d06eb80be0c5ffc068774bb9bca38e76.tar.gz fatcat-314aba35d06eb80be0c5ffc068774bb9bca38e76.zip |
web: initial implementation of fuzzy citation parsing and matching tool
-rw-r--r-- | python/fatcat_web/forms.py | 41 | ||||
-rw-r--r-- | python/fatcat_web/ref_routes.py | 46 | ||||
-rw-r--r-- | python/fatcat_web/templates/reference_match.html | 86 |
3 files changed, 173 insertions, 0 deletions
class ReferenceMatchForm(FlaskForm):
    """
    Form backing the reference (citation) fuzzy match tool.

    A single form carries both workflows of the tool: parsing a raw citation
    string, and matching on individual bibliographic fields. `submit_type`
    records which of the two was requested.
    """

    # requested action; the two accepted values are 'parse' and 'match'
    submit_type = SelectField('submit_type',
        [validators.DataRequired()],
        choices=['parse', 'match'])

    # unstructured citation string, used by the 'parse' action
    raw_citation = TextAreaField("Citation String", render_kw={'rows':'3'})

    # structured bibliographic fields, used by the 'match' action
    title = StringField("Title")
    journal = StringField("Journal or Conference")
    first_author = StringField("First Author")
    # NOTE: year is a plain free-text field; no IntegerField/valid_year
    # validation is applied to it
    year = StringField("Year Released")
    volume = StringField("Volume")
    issue = StringField("Issue")
    pages = StringField("Pages")

    @classmethod
    def from_grobid_parse(cls, parse_dict, raw_citation):
        """
        Initializes form from GROBID extraction

        parse_dict: dict of parsed citation fields. The keys read here are
            'title', 'journal', 'volume', 'issue', 'pages', 'date', and
            'authors' (a list of dicts, of which the first entry's 'name' is
            used). Missing or falsy values leave the form field untouched.
        raw_citation: the original citation string; stored in the
            `raw_citation` field.

        Returns the newly created form.
        """
        rmf = cls()
        rmf.raw_citation.data = raw_citation

        # fields which are copied over under the same name
        direct_fields = ['title', 'journal', 'volume', 'issue', 'pages']
        for k in direct_fields:
            if parse_dict.get(k):
                getattr(rmf, k).data = parse_dict[k]

        # Only the leading four characters of the date are used as the year.
        # isdecimal() (not isdigit()) is the right guard: isdigit() also
        # accepts characters like superscript digits, which int() rejects
        # with a ValueError.
        date = parse_dict.get('date')
        if date and len(date) >= 4 and date[0:4].isdecimal():
            rmf.year.data = int(date[0:4])

        if parse_dict.get('authors'):
            rmf.first_author.data = parse_dict['authors'][0].get('name')

        return rmf
@app.route('/reference/match', methods=['GET', 'POST'])
def reference_match():
    """
    View for the reference fuzzy match tool.

    Without a form submission, renders the empty form. A valid submission is
    handled according to `submit_type`:

    - 'parse': the raw citation string is sent to GROBID, the parsed result
      is used to pre-fill a fresh form, and a release stub built from the
      parse is fuzzy-matched against the catalog
    - 'match': the submitted form fields are fuzzy-matched directly

    Every match is then re-fetched from the API with expanded sub-entities,
    and access options are attached to it for the template.

    Responds 400 when form validation reports errors or when GROBID returns
    nothing; otherwise 200.
    """
    page = 'reference_match.html'
    form = ReferenceMatchForm()

    # plain page view: nothing to process
    if not form.is_submitted():
        return render_template(page, form=form), 200

    if not form.validate_on_submit():
        if form.errors:
            return render_template(page, form=form), 400
        return render_template(page, form=form), 200

    grobid_status = None
    grobid_dict = None
    action = form.submit_type.data

    if action == 'parse':
        citation = form.raw_citation.data
        resp_xml = grobid_api_process_citation(citation)
        if not resp_xml:
            return render_template(page, form=form, grobid_status="failed"), 400

        grobid_dict = transform_grobid_ref_xml(resp_xml)
        if not grobid_dict:
            return render_template(page, form=form, grobid_status="empty"), 200

        # the release stub is built from the full dict, before pruning
        release_stub = grobid_ref_to_release(grobid_dict)
        # remove empty values from GROBID parsed dict
        grobid_dict = {key: val for key, val in grobid_dict.items() if val is not None}
        form = ReferenceMatchForm.from_grobid_parse(grobid_dict, citation)
        grobid_status = "success"
        matches = close_fuzzy_release_matches(es_client=app.es_client, release=release_stub, match_limit=10) or []
    elif action == 'match':
        matches = close_fuzzy_biblio_matches(es_client=app.es_client, biblio=form.data, match_limit=10) or []
    else:
        raise NotImplementedError()

    for hit in matches:
        # expand releases more completely
        hit.release = api.get_release(hit.release.ident, expand="container,files,filesets,webcaptures", hide="abstract,refs")
        # hack in access options
        hit.access_options = release_access_options(hit.release)

    return render_template(page, form=form, grobid_dict=grobid_dict, grobid_status=grobid_status, matches=matches), 200
{% extends "base.html" %}
{% import "entity_macros.html" as entity_macros %}
{% import "edit_macros.html" as edit_macros %}

{#
  Reference fuzzy match tool page.

  Context variables used below:
    form          -- form object providing raw_citation, title, first_author,
                     year, journal, volume, issue and pages fields
    grobid_status -- the parsed-fields message is shown only when this is
                     "success" (and grobid_dict is non-empty)
    grobid_dict   -- parsed citation fields, rendered via extra_metadata
    matches       -- match results; the results section is rendered only
                     when this variable is defined

  Both submit buttons post the same form and differ only in the value of
  "submit_type" ("parse" vs. "match").
#}
{% block body %}

<h1>Reference Fuzzy Match Tool</h1>

<form class="ui form" id="reference_match" method="POST" action="/reference/match">
  <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>

  <h3>Parse Citation</h3>

  <p>Enter a citation string here and we will try to parse it (using GROBID)
  into a structured format, then match against the catalog.
  </p>

  {{ edit_macros.form_field_basic(form.raw_citation) }}

  <button class="ui primary submit button right floated" type="submit" name="submit_type" value="parse">
    Parse
  </button>

  <br clear="all">
  {% if grobid_status == "success" and grobid_dict %}
    <div class="ui positive message">
      <div class="header">Parsed successfully! See match results below</div>
      {{ entity_macros.extra_metadata(grobid_dict) }}
    </div>
  {% endif %}

  <br>
  <hr>
  <h3>Fuzzy Match Metadata</h3>

  <p>Enter whatever bibliographic metadata fields you know, and we will try to
  match to catalog entries.
  </p>

  <p><b>NOTE:</b> if you already know a persistent identifier (like a DOI), you
  should use the <a href="/release/lookup">lookup tool</a> instead.
  </p>

  {{ edit_macros.form_field_inline(form.title) }}
  {{ edit_macros.form_field_inline(form.first_author) }}

  <br>
  <div class="ui equal width fields">
    {{ edit_macros.form_field_basic(form.year) }}
    {{ edit_macros.form_field_basic(form.journal) }}
  </div>
  <div class="ui equal width fields">
    {{ edit_macros.form_field_basic(form.volume) }}
    {{ edit_macros.form_field_basic(form.issue) }}
    {{ edit_macros.form_field_basic(form.pages) }}
  </div>

  <button class="ui primary submit button right floated" type="submit" name="submit_type" value="match">
    Match
  </button>
  <br clear="all">

</form>

{% if matches is defined %}
  <br>
  <hr>
  <h3>Match Results</h3>
  <table class="ui very basic celled table">
    <tbody>
    {% for match in matches %}
      <tr>
        <td class="collapsing center aligned">
          <br><b>{{ match.status.name }}</b>
          <br>{{ match.reason.name }}
        </td>
        <td class="">
          {{ entity_macros.release_summary(match.release) }}
        </td>
        <td class="">
          {% if match.access_options %}
            <a href="{{ match.access_options[0].access_url}}" class="ui tiny green active button">{{ match.access_options[0].access_type.name }}</a>
          {% endif %}
        </td>
      </tr>
    {% endfor %}
    </tbody>
  </table>
  {% if not matches %}
    <p><i>None!</i></p>
  {% endif %}
{% endif %}

{% endblock %}