aboutsummaryrefslogtreecommitdiffstats
path: root/grobid_tei_xml/grobid_unstructured.py
blob: eeb3507b74358ab05cbce2c184461aef1309858a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.

- try to parse string with GROBID REST API call
- transform the GROBID XML response to a simple dict/struct

TODO: more general versions which handle multiple reference strings in a batch?
"""

import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional

from .grobid2json import biblio_info


def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
    """
    Parses GROBID XML for the case of a single reference/citation string (eg,
    not a full/propper TEI-XML fulltext document), and returns a dict.
    """
    # first, remove any xmlns stuff, for consistent parsing
    raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
    tree = ET.parse(io.StringIO(raw_xml))
    root = tree.getroot()
    ref = biblio_info(root, ns="")
    if not any(ref.values()):
        return None
    return ref