class ReferenceMatchForm(FlaskForm):
    """
    Form backing the reference fuzzy-match tool.

    A single form serves two actions, selected by ``submit_type``:
    ``'parse'`` (work from the raw citation string) and ``'match'`` (work
    from the structured bibliographic fields).
    """

    # which of the two submit actions was requested
    submit_type = SelectField(
        'submit_type',
        validators=[validators.DataRequired()],
        choices=['parse', 'match'],
    )

    # free-text citation, rendered as a three-row textarea
    raw_citation = TextAreaField("Citation String", render_kw={'rows': '3'})

    # structured bibliographic fields; all are plain, unvalidated strings
    title = StringField("Title")
    journal = StringField("Journal or Conference")
    first_author = StringField("First Author")
    # NOTE: year is a plain string field here; a validated IntegerField
    # variant was left commented out in the original code
    year = StringField("Year Released")
    volume = StringField("Volume")
    issue = StringField("Issue")
    pages = StringField("Pages")

    @staticmethod
    def from_grobid_parse(parse_dict, raw_citation):
        """
        Builds a new form pre-populated from a GROBID extraction.

        ``parse_dict`` is read via the keys 'title', 'journal', 'volume',
        'issue', 'pages', 'date' and 'authors' (a sequence of dicts, of
        which only the first entry's 'name' is used). Missing or falsy
        values leave the corresponding form field untouched.
        ``raw_citation`` is copied into the form as-is.
        """
        form = ReferenceMatchForm()
        form.raw_citation.data = raw_citation

        # fields whose form attribute name equals the parse_dict key
        for field_name in ('title', 'journal', 'volume', 'issue', 'pages'):
            value = parse_dict.get(field_name)
            if value:
                getattr(form, field_name).data = value

        # only the leading four characters of the date are used, and only
        # when they are all digits
        date = parse_dict.get('date')
        if date and len(date) >= 4:
            year_prefix = date[0:4]
            if year_prefix.isdigit():
                form.year.data = int(year_prefix)

        authors = parse_dict.get('authors')
        if authors:
            form.first_author.data = authors[0].get('name')

        return form
@app.route('/reference/match', methods=['GET', 'POST'])
def reference_match():
    """
    View for the reference fuzzy-match tool.

    Without a submission the empty form is rendered (200). A submission
    that fails validation is re-rendered, with status 400 if the form
    reports errors. A valid submission is handled by ``submit_type``:

    - 'parse': the raw citation is sent to GROBID; the parsed result is
      used both to pre-fill a fresh form and to look up fuzzy release
      matches. An empty GROBID response renders with status 400
      ("failed"); an empty parse result renders with status 200 ("empty").
    - 'match': the submitted form data is used directly as the biblio
      query for fuzzy matching.

    Each match then has its release re-fetched with expanded sub-entities
    and gets access options attached before the page is rendered.
    """
    template = 'reference_match.html'
    form = ReferenceMatchForm()

    # plain page view: nothing was submitted
    if not form.is_submitted():
        return render_template(template, form=form), 200

    # submitted but invalid: only signal 400 when there are actual errors
    if not form.validate_on_submit():
        status_code = 400 if form.errors else 200
        return render_template(template, form=form), status_code

    grobid_status = None
    grobid_dict = None
    action = form.submit_type.data

    if action == 'parse':
        raw_citation = form.raw_citation.data
        resp_xml = grobid_api_process_citation(raw_citation)
        if not resp_xml:
            return render_template(template, form=form, grobid_status="failed"), 400

        grobid_dict = transform_grobid_ref_xml(resp_xml)
        if not grobid_dict:
            return render_template(template, form=form, grobid_status="empty"), 200

        # build the release stub from the complete dict, before the
        # None-valued entries are dropped below
        release_stub = grobid_ref_to_release(grobid_dict)
        grobid_dict = {
            key: value
            for key, value in grobid_dict.items()
            if value is not None
        }
        form = ReferenceMatchForm.from_grobid_parse(grobid_dict, raw_citation)
        grobid_status = "success"
        matches = close_fuzzy_release_matches(
            es_client=app.es_client,
            release=release_stub,
            match_limit=10,
        ) or []
    elif action == 'match':
        matches = close_fuzzy_biblio_matches(
            es_client=app.es_client,
            biblio=form.data,
            match_limit=10,
        ) or []
    else:
        raise NotImplementedError()

    expand_fields = "container,files,filesets,webcaptures"
    hidden_fields = "abstract,refs"
    for match in matches:
        # swap in a more completely expanded copy of the release
        match.release = api.get_release(
            match.release.ident,
            expand=expand_fields,
            hide=hidden_fields,
        )
        # hack: attach access options directly onto the match object
        match.access_options = release_access_options(match.release)

    return render_template(
        template,
        form=form,
        grobid_dict=grobid_dict,
        grobid_status=grobid_status,
        matches=matches,
    ), 200

Reference Fuzzy Match Tool

+ +
+ + +

Parse Citation

+ +

Enter a citation string here and we will try to parse it (using GROBID) + into a structured format, then match against the catalog. + + {{ edit_macros.form_field_basic(form.raw_citation) }} + + + +
+ {% if grobid_status == "success" and grobid_dict %} +

+
Parsed successfully! See match results below
+ {{ entity_macros.extra_metadata(grobid_dict) }} +
+ {% endif %} + +
+
+

Fuzzy Match Metadata

+ +

Enter whatever bibliographic metadata fields you know, and we will try to + match to catalog entries. + +

NOTE: if you already know a persistent identifier (like a DOI), you + should use the lookup tool instead. + + {{ edit_macros.form_field_inline(form.title) }} + {{ edit_macros.form_field_inline(form.first_author) }} + +
+

+ {{ edit_macros.form_field_basic(form.year) }} + {{ edit_macros.form_field_basic(form.journal) }} +
+
+ {{ edit_macros.form_field_basic(form.volume) }} + {{ edit_macros.form_field_basic(form.issue) }} + {{ edit_macros.form_field_basic(form.pages) }} +
+ + +
+ +
+ +{% if matches is defined %} +
+
+

Match Results

+ + + {% for match in matches %} + +
+
{{ match.status.name }} +
{{ match.reason.name }} +
+ {{ entity_macros.release_summary(match.release) }} + + {% if match.access_options %} + {{ match.access_options[0].access_type.name }} + {% endif %} + {% endfor %} +
+ {% if not matches %} +

None! + {% endif %} +{% endif %} + +{% endblock %} -- cgit v1.2.3