about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-21 14:00:10 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-21 14:00:10 -0700
commit 45870aa17b5dec6d63df8b7a2eb7839feac9afec (patch)
tree b7e6918da2420cbbe3e7fcbe5d14ec3d2de3cc53
parent 2f3fd33c33f12b4426072a0279de85ff797611fe (diff)
download grobid_tei_xml-45870aa17b5dec6d63df8b7a2eb7839feac9afec.tar.gz
grobid_tei_xml-45870aa17b5dec6d63df8b7a2eb7839feac9afec.zip
rip out API interaction code
-rw-r--r-- grobid_tei_xml/grobid_unstructured.py 99
-rw-r--r-- tests/test_grobid_unstructured.py 63
2 files changed, 3 insertions, 159 deletions
diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py
index 5462ae1..83808e0 100644
--- a/grobid_tei_xml/grobid_unstructured.py
+++ b/grobid_tei_xml/grobid_unstructured.py
@@ -13,46 +13,7 @@ import sys
import xml.etree.ElementTree as ET
from typing import Optional
-import requests
-from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
-
-from fuzzycat.config import settings
-from fuzzycat.grobid2json import biblio_info
-from fuzzycat.utils import clean_doi
-
-GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
-
-
-def grobid_api_process_citation(raw_citation: str,
- grobid_api_base: str = GROBID_API_BASE,
- timeout: float = 20.0) -> Optional[str]:
- """
- Process a single citation string using GROBID API, returning a TEI-XML response.
-
- Raises python TimeoutError if there was a network or request timeout.
-
- Raises a 'requests' error other unexpected failures (including network
- connection failures)
- """
- try:
- grobid_response = requests.post(
- grobid_api_base + "/api/processCitation",
- data={
- "citations": raw_citation,
- "consolidateCitations": 0,
- },
- timeout=timeout,
- )
- except requests.Timeout:
- raise TimeoutError("GROBID request (HTTP POST) timeout")
-
- if grobid_response.status_code == 204:
- return None
- elif grobid_response.status_code != 200:
- print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
- grobid_response.raise_for_status()
-
- return grobid_response.text or None
+from .grobid2json import biblio_info
def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
@@ -60,7 +21,7 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
Parses GROBID XML for the case of a single reference/citation string (eg,
not a full/propper TEI-XML fulltext document), and returns a dict.
"""
- # first, remove any xmlns stuff, for consistent parsign
+ # first, remove any xmlns stuff, for consistent parsing
raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
tree = ET.parse(io.StringIO(raw_xml))
root = tree.getroot()
@@ -69,59 +30,3 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
return None
return ref
-
-def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
- """
- Takes the dict returned by transform_grobid_ref_xml() and returns a partial
- ReleaseEntity object (for use with fuzzycat)
- """
- contribs = []
- for author in ref.get("authors") or []:
- contribs.append(
- ReleaseContrib(
- raw_name=author.get("name"),
- given_name=author.get("given_name"),
- surname=author.get("surname"),
- ))
- release = ReleaseEntity(
- title=ref.get("title"),
- contribs=contribs,
- volume=ref.get("volume"),
- issue=ref.get("issue"),
- pages=ref.get("pages"),
- ext_ids=ReleaseExtIds(
- doi=clean_doi(ref.get("doi")),
- pmid=ref.get("pmid"),
- pmcid=ref.get("pmcid"),
- arxiv=ref.get("arxiv_id"),
- ),
- )
- if ref.get("journal"):
- release.extra = {"container_name": ref.get("journal")}
- if ref.get("date"):
- if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
- release.release_year = int(ref["date"][0:4])
- # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
- return release
-
-
-def grobid_parse_unstructured(raw_citation: str,
- grobid_api_base: str = GROBID_API_BASE,
- timeout: float = 20.0) -> Optional[ReleaseEntity]:
- """
- High-level wrapper to parse a raw citation string into a (partial) release
- entity.
-
- Returns None if it fails to parse.
-
- Raises various exceptions on network or remote errors.
- """
- ref_xml = grobid_api_process_citation(raw_citation,
- grobid_api_base=grobid_api_base,
- timeout=timeout)
- if not ref_xml:
- return None
- biblio_dict = transform_grobid_ref_xml(ref_xml)
- if not biblio_dict:
- return None
- return grobid_ref_to_release(biblio_dict)
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
index dd69936..b8d79ca 100644
--- a/tests/test_grobid_unstructured.py
+++ b/tests/test_grobid_unstructured.py
@@ -1,36 +1,6 @@
import pytest
-from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml
-
-
-def test_grobid_ref_to_release():
-
- d = {
- 'title':
- "some title",
- 'doi':
- '10.1234/5678',
- 'journal':
- 'some journal',
- 'authors': [
- {
- 'name': 'ahab sailor',
- 'given_name': 'ahab',
- 'surname': 'sailor'
- },
- {
- 'name': 'mary jane',
- 'given_name': 'mary',
- 'surname': 'jane'
- },
- ],
- }
- r = grobid_ref_to_release(d)
- assert r.title == d['title']
- assert r.ext_ids.doi == d['doi']
- assert r.extra['container_name'] == d['journal']
- assert r.contribs[0].surname == d['authors'][0]['surname']
- assert r.contribs[1].raw_name == d['authors'][1]['name']
+from grobid_tei_xml.grobid_unstructured import transform_grobid_ref_xml
def test_transform_grobid_ref_xml():
@@ -97,34 +67,3 @@ def test_transform_grobid_ref_xml():
assert d['journal'] == "Hernia"
-def test_grobid_parse_unstructured():
- """
- NOTE: this test makes live network requests to GROBID
- """
-
- r = grobid_parse_unstructured("blah")
- assert r is None
-
- r = grobid_parse_unstructured(
- """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369."""
- )
- assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
- assert r.contribs[0].surname == "Cunningham"
- assert r.contribs[1].surname == "Weis"
- assert r.contribs[2].surname == "Taveras"
- assert r.contribs[3].surname == "Huerta"
- assert r.extra['container_name'] == "Hernia"
- assert r.release_year == 2019
- assert r.volume == "23"
- assert r.issue == "2"
- assert r.pages == "235-243"
- assert r.ext_ids.doi == "10.1007/s10029-019-01898-9"
- assert r.ext_ids.pmid == "30701369"
-
-
-def test_grobid_parse_unstructured_timeout():
- """
- NOTE: this test makes live network requests to GROBID
- """
- with pytest.raises(TimeoutError):
- grobid_parse_unstructured("blah", timeout=0.000001)