# Patch provenance (carried over from the original patch dump):
#   From 567727e8606d2565098ddbcd63a1526aa44ff97f Mon Sep 17 00:00:00 2001
#   From: Bryan Newbold
#   Date: Wed, 14 Apr 2021 15:40:15 -0700
#   Subject: GROBID API unstructured citation parsing utility code
#   tests/test_grobid_unstructured.py | 130 insertions(+)
#   create mode 100644 tests/test_grobid_unstructured.py
#   (new file, index 0000000..dd69936)

import pytest

from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml


def test_grobid_ref_to_release():
    """A parsed-reference dict is converted to a release with matching fields."""
    d = {
        'title': "some title",
        'doi': '10.1234/5678',
        'journal': 'some journal',
        'authors': [
            {
                'name': 'ahab sailor',
                'given_name': 'ahab',
                'surname': 'sailor'
            },
            {
                'name': 'mary jane',
                'given_name': 'mary',
                'surname': 'jane'
            },
        ],
    }
    r = grobid_ref_to_release(d)
    assert r.title == d['title']
    assert r.ext_ids.doi == d['doi']
    assert r.extra['container_name'] == d['journal']
    assert r.contribs[0].surname == d['authors'][0]['surname']
    assert r.contribs[1].raw_name == d['authors'][1]['name']


def test_transform_grobid_ref_xml():
    """A single-citation XML fragment is flattened into a plain dict."""
    # NOTE(review): the element tags of this fixture were lost when the patch
    # was extracted; only the text nodes survived. The markup below is a
    # reconstruction in GROBID's TEI <biblStruct> layout, chosen so that it
    # carries every value asserted further down (page range, date, ...).
    # Confirm against the upstream fixture before relying on exact attributes.
    citation_xml = """
<biblStruct>
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName>
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>
"""

    d = transform_grobid_ref_xml(citation_xml)
    assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert d['authors'][2]['given_name'] == "L"
    assert d['authors'][2]['surname'] == "Taveras"
    assert d['authors'][2]['name'] == "L R Taveras"
    assert d['doi'] == "10.1007/s10029-019-01898-9"
    assert d['pmid'] == "30701369"
    assert d['date'] == "2019-01-30"
    assert d['pages'] == "235-243"
    assert d['volume'] == "23"
    assert d['issue'] == "2"
    assert d['journal'] == "Hernia"


def test_grobid_parse_unstructured():
    """
    NOTE: this test makes live network requests to GROBID
    """

    # a string that is not a citation yields no release at all
    r = grobid_parse_unstructured("blah")
    assert r is None

    r = grobid_parse_unstructured(
        """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369."""
    )
    assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert r.contribs[0].surname == "Cunningham"
    assert r.contribs[1].surname == "Weis"
    assert r.contribs[2].surname == "Taveras"
    assert r.contribs[3].surname == "Huerta"
    assert r.extra['container_name'] == "Hernia"
    assert r.release_year == 2019
    assert r.volume == "23"
    assert r.issue == "2"
    assert r.pages == "235-243"
    assert r.ext_ids.doi == "10.1007/s10029-019-01898-9"
    assert r.ext_ids.pmid == "30701369"


def test_grobid_parse_unstructured_timeout():
    """
    NOTE: this test makes live network requests to GROBID
    """
    # an absurdly small timeout is expected to surface as TimeoutError
    with pytest.raises(TimeoutError):
        grobid_parse_unstructured("blah", timeout=0.000001)


# -- cgit v1.2.3
#
# From afe27a3480f45bfa4d13fce0ca7624cd37069434 Mon Sep 17 00:00:00 2001
# From: Bryan Newbold
# Date: Wed, 14 Apr 2021 16:28:22 -0700
# Subject: add 'simple' high-level routines for fuzzy-match-and-verify calls
#
# Some of these are a little redundant, in that calling code could trivially
# re-implement. However, I think these are good starters for stable external
# API interfaces, leaving us room to iterate and refactor lower-level
# implementations behind the scenes.
# Patch metadata (carried over from the original patch dump):
#   fuzzycat/simple.py   | 274 insertions(+)
#   tests/test_simple.py |  42 insertions(+)
#   2 files changed, 316 insertions(+)
#   create mode 100644 fuzzycat/simple.py
#   create mode 100644 tests/test_simple.py
#   diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py
#   (new file, index 0000000..c78ac28)
"""
This file contains simple high-level functions that call in to match, verify,
and unstructured parsing routines.

    close_fuzzy_release_matches(release) -> List[FuzzyReleaseMatchResult]
    close_fuzzy_biblio_matches(biblio) -> List[FuzzyReleaseMatchResult]
    close_fuzzy_unstructured_matches(unstructured) -> List[FuzzyReleaseMatchResult]

Each function takes additional arguments:

    es_client
    fatcat_api_client
    match_limit

Each also has a "closest" variant, which returns just the single highest-rated
match.
"""

from dataclasses import dataclass
from typing import Any, List, Optional

from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds

from fuzzycat.common import Reason, Status
from fuzzycat.entities import entity_to_dict
from fuzzycat.grobid_unstructured import grobid_parse_unstructured
from fuzzycat.matching import match_release_fuzzy
from fuzzycat.verify import verify


@dataclass
class FuzzyReleaseMatchResult:
    """
    One verified fuzzy-match candidate: the verification status and reason,
    plus the candidate release entity they refer to.
    """
    status: Status
    reason: Reason
    release: ReleaseEntity


# this map used to establish priority order of verified matches
# (lower value sorts first, ie is treated as the "closer" match)
STATUS_SORT = {
    Status.TODO: 0,
    Status.EXACT: 10,
    Status.STRONG: 20,
    Status.WEAK: 30,
    Status.AMBIGUOUS: 40,
    Status.DIFFERENT: 60,
}


def _first_or_none(matches: Optional[List[FuzzyReleaseMatchResult]]
                   ) -> Optional[FuzzyReleaseMatchResult]:
    """
    Returns the first (highest-priority) element of a match list, or None if
    the list is empty or None.
    """
    if matches:
        return matches[0]
    return None


def _leading_year(value: Any) -> Optional[int]:
    """
    Extracts a year from a biblio 'year' or 'date' value.

    Integers are returned as-is; strings must start with four decimal digits
    (eg, "2019" or "2019-01-30"). Anything else, including falsy values,
    yields None.
    """
    if not value:
        return None
    if isinstance(value, int):
        return value
    # isdecimal() (not isdigit()) so that int() below can not raise on
    # characters like superscript digits
    if isinstance(value, str) and len(value) >= 4 and value[0:4].isdecimal():
        return int(value[0:4])
    return None


def close_fuzzy_release_matches(release: ReleaseEntity,
                                es_client: Any,
                                fatcat_api_client: Optional[Any] = None,
                                match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
    """
    This high-level helper function runs a fuzzy match (using elasticsearch),
    verifies all the results, and returns the "closest" matching results (if
    any).

    es_client is required, and used in the matching process.

    fatcat_api_client is optional and used both for entity-to-dict conversion
    efficiency and for fetching current entities from the fatcat API

    match_limit sets the maximum result size from the initial fuzzy match call

    Returns an empty list if there was no match of any kind, or a sorted list
    of simple result objects (FuzzyReleaseMatchResult dataclass) with fields:

      status: fuzzycat.common.Status
      reason: fuzzycat.common.Reason
      release: ReleaseEntity

    Status is one of the fuzzycat.common.Status, with "strongest match" in this
    sorted order:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    DIFFERENT and TODO matches are never returned.

    Eg, if there is any EXACT match that is always returned; an AMBIGUOUS
    result is only returned if all the candidate matches were ambiguous.
    """

    candidates = match_release_fuzzy(release, size=match_limit, es=es_client)
    if not candidates:
        # always hand back a list, as documented, so callers can iterate or
        # truth-test the result without a None check
        return []

    release_dict = entity_to_dict(release, api_client=fatcat_api_client)

    results = []
    for candidate in candidates:
        verdict = verify(release_dict, entity_to_dict(candidate, api_client=fatcat_api_client))
        # TODO and DIFFERENT verdicts are not useful to callers; drop them
        if verdict.status in (Status.TODO, Status.DIFFERENT):
            continue
        results.append(FuzzyReleaseMatchResult(verdict.status, verdict.reason, candidate))

    # list.sort() is stable: candidates with equal status keep the order in
    # which the fuzzy match returned them
    results.sort(key=lambda r: STATUS_SORT[r.status])
    return results


def closest_fuzzy_release_match(release: ReleaseEntity,
                                es_client: Any,
                                fatcat_api_client: Optional[Any] = None,
                                match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
    """
    Single-result variant of close_fuzzy_release_matches()

    Returns the highest-priority match, or None if there was no match.
    """
    return _first_or_none(
        close_fuzzy_release_matches(
            release,
            es_client=es_client,
            fatcat_api_client=fatcat_api_client,
            match_limit=match_limit,
        ))


def close_fuzzy_unstructured_matches(raw_citation: str,
                                     es_client: Any,
                                     fatcat_api_client: Optional[Any] = None,
                                     match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
    """
    Variation of close_fuzzy_release_matches() which first parses an
    unstructured citation string, then finds close matches.

    Returns an empty list if the citation string could not be parsed in to a
    release.

    TODO: pass-through GROBID API configuration?
    """
    release = grobid_parse_unstructured(raw_citation)
    if not release:
        # nothing to match against; keep the return type a list
        return []
    return close_fuzzy_release_matches(
        release,
        es_client=es_client,
        fatcat_api_client=fatcat_api_client,
        match_limit=match_limit,
    )


def closest_fuzzy_unstructured_match(raw_citation: str,
                                     es_client: Any,
                                     fatcat_api_client: Optional[Any] = None,
                                     match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
    """
    Single-result variant of close_fuzzy_unstructured_matches()

    Returns the highest-priority match, or None if there was no match.
    """
    return _first_or_none(
        close_fuzzy_unstructured_matches(
            raw_citation,
            es_client=es_client,
            fatcat_api_client=fatcat_api_client,
            match_limit=match_limit,
        ))


def biblio_to_release(biblio: dict) -> ReleaseEntity:
    """
    Helper for close_fuzzy_biblio_matches() et al

    Builds a ReleaseEntity from a flat dict of bibliographic fields. See
    close_fuzzy_biblio_matches() for the list of recognized keys.

    The release year is taken from 'year' when that is usable (an integer, or
    a string starting with four digits), otherwise from the first four digits
    of a 'date' string.
    """
    # contributors: the richest available representation wins
    if biblio.get('authors'):
        contribs = [
            ReleaseContrib(
                raw_name=a.get('name'),
                given_name=a.get('given_name'),
                surname=a.get('surname'),
            ) for a in biblio['authors']
        ]
    elif biblio.get('author_names'):
        contribs = [ReleaseContrib(raw_name=name) for name in biblio['author_names']]
    elif biblio.get('first_author'):
        contribs = [ReleaseContrib(raw_name=biblio['first_author'])]
    else:
        contribs = []

    release = ReleaseEntity(
        title=biblio.get("title"),
        ext_ids=ReleaseExtIds(
            doi=biblio.get("doi"),
            pmid=biblio.get("pmid"),
            pmcid=biblio.get("pmcid"),
            arxiv=biblio.get("arxiv_id"),
        ),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        pages=biblio.get("pages") or biblio.get("first_page"),
        publisher=biblio.get("publisher"),
        release_stage=biblio.get("release_stage"),
        release_type=biblio.get("release_type"),
        contribs=contribs,
        extra={},
    )

    # container name is stored in 'extra'; journal takes precedence
    container_name = biblio.get('journal') or biblio.get('conference')
    if container_name:
        release.extra['container_name'] = container_name

    year = _leading_year(biblio.get('year'))
    if year is None and isinstance(biblio.get('date'), str):
        # fall back to the date when 'year' is missing or unusable
        year = _leading_year(biblio['date'])
    if year is not None:
        release.release_year = year
    return release


def close_fuzzy_biblio_matches(biblio: dict,
                               es_client: Any,
                               fatcat_api_client: Optional[Any] = None,
                               match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
    """
    Variation of close_fuzzy_release_matches() which takes bibliographic fields
    as arguments.

    Biblio fields which are handled include:

        title
        journal
            or: conference
        authors
            name
            given_name
            surname
            or: author_names (List[str])
            or: first_author (str)
        year
        date
        volume
        issue
        pages
            or: first_page
        publisher
        doi
        pmid
        pmcid
        arxiv_id
        release_type (eg, 'journal-article', 'book', 'dataset')
        release_stage
    """
    release = biblio_to_release(biblio)
    return close_fuzzy_release_matches(
        release,
        es_client=es_client,
        fatcat_api_client=fatcat_api_client,
        match_limit=match_limit,
    )


def closest_fuzzy_biblio_match(biblio: dict,
                               es_client: Any,
                               fatcat_api_client: Optional[Any] = None,
                               match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
    """
    Single-result variant of close_fuzzy_biblio_matches()

    Returns the highest-priority match, or None if there was no match.
    """
    return _first_or_none(
        close_fuzzy_biblio_matches(
            biblio,
            es_client=es_client,
            fatcat_api_client=fatcat_api_client,
            match_limit=match_limit,
        ))


# diff --git a/tests/test_simple.py b/tests/test_simple.py
# (new file mode 100644, index 0000000..0c5d216; content follows)
# tests/test_simple.py (new file in the same commit, 42 lines added)
"""
These basically all hit external network services.
"""

import pytest
import elasticsearch

# explicit names instead of a wildcard import, so it is clear what is under test
from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_unstructured_matches
from fuzzycat.config import settings


@pytest.fixture
def es_client():
    """
    Elasticsearch client for the fatcat search index; the URL comes from the
    FATCAT_SEARCH_URL setting, defaulting to the public search endpoint.
    """
    return elasticsearch.Elasticsearch(
        [settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443")])


def test_close_fuzzy_unstructured_matches(es_client):
    """A full raw citation string should resolve to an EXACT match first."""

    matches = close_fuzzy_unstructured_matches(
        """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369.""",
        es_client=es_client)

    assert matches
    assert matches[0].status.name == "EXACT"
    assert matches[0].release.ext_ids.doi == "10.1007/s10029-019-01898-9"


def test_close_fuzzy_biblio_matches(es_client):
    """Partial bibliographic fields should still find the same release."""

    matches = close_fuzzy_biblio_matches(
        dict(
            title="Mesh migration following abdominal hernia repair: a comprehensive review",
            first_author="Cunningham",
            year=2019,
            journal="Hernia",
        ),
        es_client=es_client,
    )

    assert matches
    # TODO: should be "STRONG" or "WEAK" without all authors?
    assert matches[0].status.name in ("STRONG", "WEAK", "AMBIGUOUS")
    assert matches[0].release.ext_ids.doi == "10.1007/s10029-019-01898-9"


# -- cgit v1.2.3