aboutsummaryrefslogtreecommitdiffstats
path: root/grobid_tei_xml/grobid_unstructured.py
blob: cbf73226b8088e4eae843dcc28a2de09c467823f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.

- try to parse string with GROBID REST API call
- transform the GROBID XML response to a simple dict/struct

TODO: add more general versions that handle multiple reference strings in a batch?
"""

import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional

from .parse import biblio_info