From 45870aa17b5dec6d63df8b7a2eb7839feac9afec Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Oct 2021 14:00:10 -0700 Subject: rip out API interaction code --- grobid_tei_xml/grobid_unstructured.py | 99 +---------------------------------- tests/test_grobid_unstructured.py | 63 +--------------------- 2 files changed, 3 insertions(+), 159 deletions(-) diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index 5462ae1..83808e0 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -13,46 +13,7 @@ import sys import xml.etree.ElementTree as ET from typing import Optional -import requests -from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds - -from fuzzycat.config import settings -from fuzzycat.grobid2json import biblio_info -from fuzzycat.utils import clean_doi - -GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki") - - -def grobid_api_process_citation(raw_citation: str, - grobid_api_base: str = GROBID_API_BASE, - timeout: float = 20.0) -> Optional[str]: - """ - Process a single citation string using GROBID API, returning a TEI-XML response. - - Raises python TimeoutError if there was a network or request timeout. - - Raises a 'requests' error other unexpected failures (including network - connection failures) - """ - try: - grobid_response = requests.post( - grobid_api_base + "/api/processCitation", - data={ - "citations": raw_citation, - "consolidateCitations": 0, - }, - timeout=timeout, - ) - except requests.Timeout: - raise TimeoutError("GROBID request (HTTP POST) timeout") - - if grobid_response.status_code == 204: - return None - elif grobid_response.status_code != 200: - print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr) - grobid_response.raise_for_status() - - return grobid_response.text or None +from .grobid2json import biblio_info def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: @@ -60,7 +21,7 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: Parses GROBID XML for the case of a single reference/citation string (eg, not a full/propper TEI-XML fulltext document), and returns a dict. """ - # first, remove any xmlns stuff, for consistent parsign + # first, remove any xmlns stuff, for consistent parsing raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = ET.parse(io.StringIO(raw_xml)) root = tree.getroot() @@ -69,59 +30,3 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: return None return ref - -def grobid_ref_to_release(ref: dict) -> ReleaseEntity: - """ - Takes the dict returned by transform_grobid_ref_xml() and returns a partial - ReleaseEntity object (for use with fuzzycat) - """ - contribs = [] - for author in ref.get("authors") or []: - contribs.append( - ReleaseContrib( - raw_name=author.get("name"), - given_name=author.get("given_name"), - surname=author.get("surname"), - )) - release = ReleaseEntity( - title=ref.get("title"), - contribs=contribs, - volume=ref.get("volume"), - issue=ref.get("issue"), - pages=ref.get("pages"), - ext_ids=ReleaseExtIds( - doi=clean_doi(ref.get("doi")), - pmid=ref.get("pmid"), - pmcid=ref.get("pmcid"), - arxiv=ref.get("arxiv_id"), - ), - ) - if ref.get("journal"): - release.extra = {"container_name": ref.get("journal")} - if ref.get("date"): - if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit(): - release.release_year = int(ref["date"][0:4]) - # TODO: try to parse 'date' into an ISO date format, and assign to release_date? - return release - - -def grobid_parse_unstructured(raw_citation: str, - grobid_api_base: str = GROBID_API_BASE, - timeout: float = 20.0) -> Optional[ReleaseEntity]: - """ - High-level wrapper to parse a raw citation string into a (partial) release - entity. - - Returns None if it fails to parse. - - Raises various exceptions on network or remote errors. - """ - ref_xml = grobid_api_process_citation(raw_citation, - grobid_api_base=grobid_api_base, - timeout=timeout) - if not ref_xml: - return None - biblio_dict = transform_grobid_ref_xml(ref_xml) - if not biblio_dict: - return None - return grobid_ref_to_release(biblio_dict) diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index dd69936..b8d79ca 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -1,36 +1,6 @@ import pytest -from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml - - -def test_grobid_ref_to_release(): - - d = { - 'title': - "some title", - 'doi': - '10.1234/5678', - 'journal': - 'some journal', - 'authors': [ - { - 'name': 'ahab sailor', - 'given_name': 'ahab', - 'surname': 'sailor' - }, - { - 'name': 'mary jane', - 'given_name': 'mary', - 'surname': 'jane' - }, - ], - } - r = grobid_ref_to_release(d) - assert r.title == d['title'] - assert r.ext_ids.doi == d['doi'] - assert r.extra['container_name'] == d['journal'] - assert r.contribs[0].surname == d['authors'][0]['surname'] - assert r.contribs[1].raw_name == d['authors'][1]['name'] +from grobid_tei_xml.grobid_unstructured import transform_grobid_ref_xml def test_transform_grobid_ref_xml(): @@ -97,34 +67,3 @@ def test_transform_grobid_ref_xml(): assert d['journal'] == "Hernia" -def test_grobid_parse_unstructured(): - """ - NOTE: this test makes live network requests to GROBID - """ - - r = grobid_parse_unstructured("blah") - assert r is None - - r = grobid_parse_unstructured( - """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369.""" - ) - assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review" - assert r.contribs[0].surname == "Cunningham" - assert r.contribs[1].surname == "Weis" - assert r.contribs[2].surname == "Taveras" - assert r.contribs[3].surname == "Huerta" - assert r.extra['container_name'] == "Hernia" - assert r.release_year == 2019 - assert r.volume == "23" - assert r.issue == "2" - assert r.pages == "235-243" - assert r.ext_ids.doi == "10.1007/s10029-019-01898-9" - assert r.ext_ids.pmid == "30701369" - - -def test_grobid_parse_unstructured_timeout(): - """ - NOTE: this test makes live network requests to GROBID - """ - with pytest.raises(TimeoutError): - grobid_parse_unstructured("blah", timeout=0.000001) -- cgit v1.2.3