fuzzycat/grobid_unstructured.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.

- try to parse string with GROBID REST API call
- transform the GROBID XML response to a simple dict/struct

TODO: more general versions which handle multiple reference strings in a batch?
"""

import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional

import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds

from fuzzycat.config import settings
from fuzzycat.grobid2json import biblio_info

GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")


def grobid_api_process_citation(raw_citation: str,
                                grobid_api_base: str = GROBID_API_BASE,
                                timeout: float = 20.0) -> Optional[str]:
    """
    Process a single citation string using the GROBID API, returning the
    response body (expected to be TEI-XML) as a string.

    Arguments:

    - raw_citation: the unstructured citation string to send
    - grobid_api_base: base URL of the GROBID service; a trailing slash is
      tolerated
    - timeout: passed through to requests.post() as the request timeout

    Returns None if GROBID responds with HTTP 204, or if the response body is
    empty.

    Raises python TimeoutError if there was a network or request timeout (the
    original 'requests' exception is attached as the cause).

    Raises a 'requests' error on other unexpected failures (including network
    connection failures, and error HTTP statuses via raise_for_status()).
    """
    # strip any trailing slash so we never build a ".../​/api/..." style URL
    endpoint = grobid_api_base.rstrip("/") + "/api/processCitation"
    try:
        grobid_response = requests.post(
            endpoint,
            data={
                "citations": raw_citation,
                "consolidateCitations": 0,
            },
            timeout=timeout,
        )
    except requests.Timeout as err:
        # chain explicitly so the underlying requests error stays visible
        raise TimeoutError("GROBID request (HTTP POST) timeout") from err

    if grobid_response.status_code == 204:
        return None
    if grobid_response.status_code != 200:
        # log first; raise_for_status() below decides whether this is fatal
        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
    grobid_response.raise_for_status()

    # normalize an empty body to None
    return grobid_response.text or None


def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
    """
    Converts the GROBID XML for one reference/citation string (ie, not a
    full/proper TEI-XML fulltext document) into a dict, using biblio_info().

    Returns None when every value in the resulting dict is falsy.
    """
    # strip the TEI xmlns declaration up front, for consistent parsing;
    # biblio_info() is then called with an empty namespace
    tei_xmlns = 'xmlns="http://www.tei-c.org/ns/1.0"'
    stripped_xml = raw_xml.replace(tei_xmlns, "")
    root = ET.parse(io.StringIO(stripped_xml)).getroot()
    biblio = biblio_info(root, ns="")
    return biblio if any(biblio.values()) else None


def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
    ReleaseEntity object (for use with fuzzycat)

    Keys read from `ref` (all optional): "authors" (list of dicts with "name",
    "given_name", "surname"), "title", "volume", "issue", "pages", "doi",
    "pmid", "pmcid", "arxiv_id", "journal", "date".

    - "journal", when present, is stored as extra["container_name"]
    - "date", when it starts with four decimal digits, sets release_year;
      any other date value is ignored
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.get("name"),
            given_name=author.get("given_name"),
            surname=author.get("surname"),
        ) for author in ref.get("authors") or []
    ]
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=ref.get("doi"),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}
    date = ref.get("date")
    if date:
        year = date[0:4]
        # isdecimal() (not isdigit()): isdigit() also accepts characters like
        # superscript digits, which int() rejects with a ValueError
        if len(year) == 4 and year.isdecimal():
            release.release_year = int(year)
        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release


def grobid_parse_unstructured(raw_citation: str,
                              grobid_api_base: str = GROBID_API_BASE,
                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
    """
    Convenience wrapper: sends a raw citation string to GROBID, converts the
    XML response to a dict, and builds a (partial) release entity from it.

    Returns None if GROBID returns nothing, or if nothing could be extracted
    from its response.

    Exceptions from the network request or remote service propagate to the
    caller.
    """
    tei_xml = grobid_api_process_citation(
        raw_citation,
        grobid_api_base=grobid_api_base,
        timeout=timeout,
    )
    # each stage short-circuits to None when the previous one came back empty
    parsed_ref = transform_grobid_ref_xml(tei_xml) if tei_xml else None
    return grobid_ref_to_release(parsed_ref) if parsed_ref else None