summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-04-15 23:31:07 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-07-23 10:55:09 -0700
commit 314aba35d06eb80be0c5ffc068774bb9bca38e76 (patch)
tree 1f780f3e2e1808247cffc167f73eba3272353df7
parent 15680e0caae7ff6e24ddca8584b0c590d2b30581 (diff)
download fatcat-314aba35d06eb80be0c5ffc068774bb9bca38e76.tar.gz
fatcat-314aba35d06eb80be0c5ffc068774bb9bca38e76.zip
web: initial implementation of fuzzy citation parsing and matching tool
-rw-r--r-- python/fatcat_web/forms.py 41
-rw-r--r-- python/fatcat_web/ref_routes.py 46
-rw-r--r-- python/fatcat_web/templates/reference_match.html 86
3 files changed, 173 insertions, 0 deletions
diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py
index 1c9fb199..19176a59 100644
--- a/python/fatcat_web/forms.py
+++ b/python/fatcat_web/forms.py
@@ -482,3 +482,44 @@ class EntityTomlForm(EntityEditForm):
etf.toml.data = entity_to_toml(entity, pop_fields=pop_fields)
return etf
+
+class ReferenceMatchForm(FlaskForm):
+
+ submit_type = SelectField('submit_type',
+ [validators.DataRequired()],
+ choices=['parse', 'match'])
+
+ raw_citation = TextAreaField("Citation String", render_kw={'rows':'3'})
+
+ title = StringField("Title")
+ journal = StringField("Journal or Conference")
+ first_author = StringField("First Author")
+ #year = IntegerField('Year Released',
+ # [validators.Optional(True), valid_year])
+ year = StringField("Year Released")
+ volume = StringField("Volume")
+ issue = StringField("Issue")
+ pages = StringField("Pages")
+
+ @staticmethod
+ def from_grobid_parse(parse_dict, raw_citation):
+ """
+ Initializes form from GROBID extraction
+ """
+ rmf = ReferenceMatchForm()
+ rmf.raw_citation.data = raw_citation
+
+ direct_fields = ['title', 'journal', 'volume', 'issue', 'pages']
+ for k in direct_fields:
+ if parse_dict.get(k):
+ a = getattr(rmf, k)
+ a.data = parse_dict[k]
+
+ date = parse_dict.get('date')
+ if date and len(date) >= 4 and date[0:4].isdigit():
+ rmf.year.data = int(date[0:4])
+
+ if parse_dict.get('authors'):
+ rmf.first_author.data = parse_dict['authors'][0].get('name')
+
+ return rmf
diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py
index a49813c4..dc39299f 100644
--- a/python/fatcat_web/ref_routes.py
+++ b/python/fatcat_web/ref_routes.py
@@ -8,8 +8,11 @@ from typing import Optional
from flask import render_template, abort, redirect, request
from fatcat_openapi_client import *
from fatcat_openapi_client.rest import ApiException
+from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release
+from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches
from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs
+from fatcat_tools.transforms.access import release_access_options
from fatcat_web import app, api, auth_api
from fatcat_web.forms import *
from fatcat_web.entity_helpers import *
@@ -48,3 +51,46 @@ def release_view_refs_outbound(ident):
enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200
+
+@app.route('/reference/match', methods=['GET', 'POST'])
+def reference_match():
+
+ form = ReferenceMatchForm()
+ grobid_status = None
+ grobid_dict = None
+
+ if form.is_submitted():
+ if form.validate_on_submit():
+ if form.submit_type.data == 'parse':
+ resp_xml = grobid_api_process_citation(form.raw_citation.data)
+ if not resp_xml:
+ grobid_status = "failed"
+ return render_template('reference_match.html', form=form, grobid_status=grobid_status), 400
+ grobid_dict = transform_grobid_ref_xml(resp_xml)
+ if not grobid_dict:
+ grobid_status = "empty"
+ return render_template('reference_match.html', form=form, grobid_status=grobid_status), 200
+ #print(grobid_dict)
+ release_stub = grobid_ref_to_release(grobid_dict)
+ # remove empty values from GROBID parsed dict
+ grobid_dict = {k: v for k, v in grobid_dict.items() if v is not None}
+ form = ReferenceMatchForm.from_grobid_parse(grobid_dict, form.raw_citation.data)
+ grobid_status = "success"
+ matches = close_fuzzy_release_matches(es_client=app.es_client, release=release_stub, match_limit=10) or []
+ elif form.submit_type.data == 'match':
+ matches = close_fuzzy_biblio_matches(es_client=app.es_client, biblio=form.data, match_limit=10) or []
+ else:
+ raise NotImplementedError()
+
+ for m in matches:
+ # expand releases more completely
+ m.release = api.get_release(m.release.ident, expand="container,files,filesets,webcaptures", hide="abstract,refs")
+ # hack in access options
+ m.access_options = release_access_options(m.release)
+
+ return render_template('reference_match.html', form=form, grobid_dict=grobid_dict, grobid_status=grobid_status, matches=matches), 200
+
+ elif form.errors:
+ return render_template('reference_match.html', form=form), 400
+
+ return render_template('reference_match.html', form=form), 200
diff --git a/python/fatcat_web/templates/reference_match.html b/python/fatcat_web/templates/reference_match.html
new file mode 100644
index 00000000..042b0607
--- /dev/null
+++ b/python/fatcat_web/templates/reference_match.html
@@ -0,0 +1,86 @@
+{% extends "base.html" %}
+{% import "entity_macros.html" as entity_macros %}
+{% import "edit_macros.html" as edit_macros %}
+{#
+  Reference fuzzy match page.
+
+  Variables read: `form` (always), `grobid_status` and `grobid_dict` (for the
+  parse success message), and `matches` (optional; the results section is only
+  rendered when it is defined).
+
+  The page is a single form with two submit buttons. Both are named
+  "submit_type"; the one that is clicked sends either "parse" or "match".
+#}
+
+{% block body %}
+
+<h1>Reference Fuzzy Match Tool</h1>
+
+<form class="ui form" id="reference_match" method="POST" action="/reference/match">
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
+
+ <h3>Parse Citation</h3>
+
+ <p>Enter a citation string here and we will try to parse it (using GROBID)
+ into a structured format, then match against the catalog.
+
+ {{ edit_macros.form_field_basic(form.raw_citation) }}
+
+ {# submits the form with submit_type=parse #}
+ <button class="ui primary submit button right floated" type="submit" name="submit_type" value="parse">
+ Parse
+ </button>
+
+ <br clear="all">
+ {# shown only when grobid_status is "success" and grobid_dict is non-empty #}
+ {% if grobid_status == "success" and grobid_dict %}
+ <div class="ui positive message">
+ <div class="header">Parsed successfully! See match results below</div>
+ {{ entity_macros.extra_metadata(grobid_dict) }}
+ </div>
+ {% endif %}
+
+ <br>
+ <hr>
+ <h3>Fuzzy Match Metadata</h3>
+
+ <p>Enter whatever bibliographic metadata fields you know, and we will try to
+ match to catalog entries.
+
+ <p><b>NOTE:</b> if you already know a persistent identifier (like a DOI), you
+ should use the <a href="/release/lookup">lookup tool</a> instead.
+
+ {{ edit_macros.form_field_inline(form.title) }}
+ {{ edit_macros.form_field_inline(form.first_author) }}
+
+ <br>
+ <div class="ui equal width fields">
+ {{ edit_macros.form_field_basic(form.year) }}
+ {{ edit_macros.form_field_basic(form.journal) }}
+ </div>
+ <div class="ui equal width fields">
+ {{ edit_macros.form_field_basic(form.volume) }}
+ {{ edit_macros.form_field_basic(form.issue) }}
+ {{ edit_macros.form_field_basic(form.pages) }}
+ </div>
+
+ {# submits the form with submit_type=match #}
+ <button class="ui primary submit button right floated" type="submit" name="submit_type" value="match">
+ Match
+ </button>
+ <br clear="all">
+
+</form>
+
+{# results section: skipped entirely when no `matches` variable was passed in #}
+{% if matches is defined %}
+ <br>
+ <hr>
+ <h3>Match Results</h3>
+ <table class="ui very basic celled table">
+ <tbody>
+ {# one row per match: status/reason, release summary, access button #}
+ {% for match in matches %}
+ <tr><td class="collapsing center aligned">
+ <br><b>{{ match.status.name }}</b>
+ <br>{{ match.reason.name }}
+ <td class="">
+ {{ entity_macros.release_summary(match.release) }}
+ <td class="">
+ {# only the first access option is linked #}
+ {% if match.access_options %}
+ <a href="{{ match.access_options[0].access_url}}" class="ui tiny green active button">{{ match.access_options[0].access_type.name }}</a>
+ {% endif %}
+ {% endfor %}
+ </tbody>
+ </table>
+ {# `matches` was passed in but is empty #}
+ {% if not matches %}
+ <p><i>None!</i>
+ {% endif %}
+{% endif %}
+
+{% endblock %}