"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.

- try to parse string with GROBID REST API call
- transform the parsed GROBID response into a partial release entity

TODO: more general versions which handle multiple reference strings in a batch?
"""

import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional

import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
from grobid_tei_xml import GrobidBiblio, parse_citation_xml

from fuzzycat.config import settings
from fuzzycat.utils import clean_doi

# Base URL of the GROBID service; overridable via the "GROBID_API_BASE" setting.
GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")


def grobid_api_process_citation(raw_citation: str,
                                grobid_api_base: str = GROBID_API_BASE,
                                timeout: float = 20.0) -> Optional[str]:
    """
    Process a single citation string using GROBID API, returning a TEI-XML
    response.

    Args:
        raw_citation: the unstructured citation string to parse.
        grobid_api_base: base URL of the GROBID service (no trailing
            "/api/..." path; "/api/processCitation" is appended).
        timeout: timeout in seconds, passed through to `requests.post`.

    Returns:
        The response body as a string, or None if GROBID answered with
        HTTP 204 or with an empty body.

    Raises:
        TimeoutError: if there was a network or request timeout (the original
            `requests.Timeout` is attached as `__cause__`).
        requests.RequestException: for other unexpected failures (including
            network connection failures and HTTP error statuses).
    """
    try:
        grobid_response = requests.post(
            grobid_api_base + "/api/processCitation",
            data={
                "citations": raw_citation,
                "consolidateCitations": 0,
            },
            timeout=timeout,
        )
    except requests.Timeout as exc:
        # Translate to the builtin exception so callers need not depend on
        # 'requests'; chain the cause to keep the original traceback.
        raise TimeoutError("GROBID request (HTTP POST) timeout") from exc

    if grobid_response.status_code == 204:
        # No content: nothing to return for this citation string.
        return None
    elif grobid_response.status_code != 200:
        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
        # NOTE(review): raise_for_status() only raises for error statuses; any
        # other non-200 status falls through and the body is returned below.
        grobid_response.raise_for_status()
    return grobid_response.text or None


def grobid_ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
    """
    Takes GrobidBiblio (parsed from TEI-XML) and returns a partial
    ReleaseEntity object (for use with fuzzycat).

    Copies title, contributors, volume, issue, pages and external identifiers
    (the DOI is passed through `clean_doi`). If a journal name is present it
    is stored under `extra["container_name"]`. If the date starts with four
    decimal digits, they are used as `release_year`.
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.full_name,
            given_name=author.given_name,
            surname=author.surname,
        ) for author in ref.authors or []
    ]
    release = ReleaseEntity(
        title=ref.title,
        contribs=contribs,
        volume=ref.volume,
        issue=ref.issue,
        pages=ref.pages,
        ext_ids=ReleaseExtIds(
            doi=clean_doi(ref.doi),
            pmid=ref.pmid,
            pmcid=ref.pmcid,
            arxiv=ref.arxiv_id,
        ),
    )
    if ref.journal:
        release.extra = {"container_name": ref.journal}
    if ref.date:
        year_prefix = ref.date[0:4]
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        if len(year_prefix) == 4 and year_prefix.isdecimal():
            release.release_year = int(year_prefix)
        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release


def grobid_parse_unstructured(raw_citation: str,
                              grobid_api_base: str = GROBID_API_BASE,
                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
    """
    High-level wrapper to parse a raw citation string into a (partial) release
    entity.

    Args:
        raw_citation: the unstructured citation string to parse.
        grobid_api_base: base URL of the GROBID service.
        timeout: timeout in seconds for the GROBID HTTP request.

    Returns:
        A partial ReleaseEntity, or None if it fails to parse (GROBID returned
        no content, or the XML yielded no citation).

    Raises various exceptions on network or remote errors (see
    `grobid_api_process_citation`).
    """
    ref_xml = grobid_api_process_citation(raw_citation,
                                          grobid_api_base=grobid_api_base,
                                          timeout=timeout)
    if not ref_xml:
        return None
    ref = parse_citation_xml(ref_xml)
    if not ref:
        return None
    return grobid_ref_to_release(ref)