diff options
Diffstat (limited to 'grobid_tei_xml/grobid_unstructured.py')
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 99 |
1 files changed, 2 insertions, 97 deletions
diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py
index 5462ae1..83808e0 100644
--- a/grobid_tei_xml/grobid_unstructured.py
+++ b/grobid_tei_xml/grobid_unstructured.py
@@ -13,46 +13,7 @@ import sys
 import xml.etree.ElementTree as ET
 from typing import Optional
 
-import requests
-from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
-
-from fuzzycat.config import settings
-from fuzzycat.grobid2json import biblio_info
-from fuzzycat.utils import clean_doi
-
-GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
-
-
-def grobid_api_process_citation(raw_citation: str,
-                                grobid_api_base: str = GROBID_API_BASE,
-                                timeout: float = 20.0) -> Optional[str]:
-    """
-    Process a single citation string using GROBID API, returning a TEI-XML response.
-
-    Raises python TimeoutError if there was a network or request timeout.
-
-    Raises a 'requests' error other unexpected failures (including network
-    connection failures)
-    """
-    try:
-        grobid_response = requests.post(
-            grobid_api_base + "/api/processCitation",
-            data={
-                "citations": raw_citation,
-                "consolidateCitations": 0,
-            },
-            timeout=timeout,
-        )
-    except requests.Timeout:
-        raise TimeoutError("GROBID request (HTTP POST) timeout")
-
-    if grobid_response.status_code == 204:
-        return None
-    elif grobid_response.status_code != 200:
-        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
-        grobid_response.raise_for_status()
-
-    return grobid_response.text or None
+from .grobid2json import biblio_info
 
 
 def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
@@ -60,7 +21,7 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
     Parses GROBID XML for the case of a single reference/citation string (eg,
     not a full/propper TEI-XML fulltext document), and returns a dict.
     """
-    # first, remove any xmlns stuff, for consistent parsign
+    # first, remove any xmlns stuff, for consistent parsing
     raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
     tree = ET.parse(io.StringIO(raw_xml))
     root = tree.getroot()
@@ -69,59 +30,3 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
         return None
     return ref
 
-
-def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
-    """
-    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
-    ReleaseEntity object (for use with fuzzycat)
-    """
-    contribs = []
-    for author in ref.get("authors") or []:
-        contribs.append(
-            ReleaseContrib(
-                raw_name=author.get("name"),
-                given_name=author.get("given_name"),
-                surname=author.get("surname"),
-            ))
-    release = ReleaseEntity(
-        title=ref.get("title"),
-        contribs=contribs,
-        volume=ref.get("volume"),
-        issue=ref.get("issue"),
-        pages=ref.get("pages"),
-        ext_ids=ReleaseExtIds(
-            doi=clean_doi(ref.get("doi")),
-            pmid=ref.get("pmid"),
-            pmcid=ref.get("pmcid"),
-            arxiv=ref.get("arxiv_id"),
-        ),
-    )
-    if ref.get("journal"):
-        release.extra = {"container_name": ref.get("journal")}
-    if ref.get("date"):
-        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
-            release.release_year = int(ref["date"][0:4])
-        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
-    return release
-
-
-def grobid_parse_unstructured(raw_citation: str,
-                              grobid_api_base: str = GROBID_API_BASE,
-                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
-    """
-    High-level wrapper to parse a raw citation string into a (partial) release
-    entity.
-
-    Returns None if it fails to parse.
-
-    Raises various exceptions on network or remote errors.
-    """
-    ref_xml = grobid_api_process_citation(raw_citation,
-                                          grobid_api_base=grobid_api_base,
-                                          timeout=timeout)
-    if not ref_xml:
-        return None
-    biblio_dict = transform_grobid_ref_xml(ref_xml)
-    if not biblio_dict:
-        return None
-    return grobid_ref_to_release(biblio_dict)