1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
"""
Helper functions to parse an unstructured citation string using GROBID, then
fuzzy match using the result.
- try to parse string with GROBID REST API call
- transform the GROBID XML response to a simple dict/struct
TODO: more general versions which handle multiple reference strings in a batch?
"""
import io
import sys
import xml.etree.ElementTree as ET
from typing import Optional
import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
from fuzzycat.config import settings
from fuzzycat.grobid2json import biblio_info
# Default base URL of the GROBID service used by the helpers below. Read from
# the "GROBID_API_BASE" setting, falling back to the hard-coded URL when unset.
GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
def grobid_api_process_citation(raw_citation: str,
                                grobid_api_base: str = GROBID_API_BASE,
                                timeout: float = 20.0) -> Optional[str]:
    """
    Process a single citation string using the GROBID API, returning the
    response body (TEI-XML) as a string.

    Args:
        raw_citation: the unstructured citation string to parse.
        grobid_api_base: base URL of the GROBID server; a trailing slash is
            tolerated.
        timeout: timeout, in seconds, passed to the HTTP request.

    Returns:
        The response text, or None if GROBID answered HTTP 204 or the
        response body was empty.

    Raises:
        TimeoutError: if there was a network or request timeout (the
            underlying 'requests' exception is chained as the cause).
        requests.RequestException: on other unexpected failures, including
            network connection failures and HTTP error statuses.
    """
    # Strip any trailing slash so a base like "https://host/" does not yield
    # a "//api/processCitation" path.
    url = grobid_api_base.rstrip("/") + "/api/processCitation"
    try:
        grobid_response = requests.post(
            url,
            data={
                "citations": raw_citation,
                "consolidateCitations": 0,
            },
            timeout=timeout,
        )
    except requests.Timeout as err:
        # Translate to the builtin exception, keeping the original as __cause__
        raise TimeoutError("GROBID request (HTTP POST) timeout") from err

    if grobid_response.status_code == 204:
        # GROBID had nothing to return for this citation
        return None
    if grobid_response.status_code != 200:
        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
        # Raises for HTTP error statuses; other non-200 responses fall through
        grobid_response.raise_for_status()
    return grobid_response.text or None
def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
    """
    Convert the GROBID XML for a single reference/citation string (ie, not a
    full TEI-XML fulltext document) into a dict.

    The XML is parsed with ElementTree and the root element is handed to
    biblio_info(). Returns None when every value in the resulting dict is
    falsy, otherwise returns the dict itself.
    """
    # Drop the default TEI namespace declaration up front, so that elements
    # can be looked up without a namespace prefix (ns="" below).
    stripped_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
    root = ET.parse(io.StringIO(stripped_xml)).getroot()
    biblio = biblio_info(root, ns="")
    return biblio if any(biblio.values()) else None
def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
    ReleaseEntity object (for use with fuzzycat).

    Keys read from `ref` (all optional): "authors" (an iterable of dicts with
    "name", "given_name" and "surname"), "title", "volume", "issue", "pages",
    "doi", "pmid", "pmcid", "arxiv_id", "journal" and "date".

    A truthy "journal" is stored as extra["container_name"]. The release year
    is set from the first four characters of "date" when those are all
    decimal digits; otherwise it is left unset.
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.get("name"),
            given_name=author.get("given_name"),
            surname=author.get("surname"),
        ) for author in ref.get("authors") or []
    ]
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=ref.get("doi"),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )

    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}

    date = ref.get("date")
    if date:
        year = date[0:4]
        # Use isdecimal(), not isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        if len(year) == 4 and year.isdecimal():
            release.release_year = int(year)
        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release
def grobid_parse_unstructured(raw_citation: str,
                              grobid_api_base: str = GROBID_API_BASE,
                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
    """
    Convenience wrapper: send a raw citation string to GROBID, transform the
    XML response into a dict, and build a (partial) release entity from it.

    Returns None if GROBID yields no XML or the XML yields no usable fields.

    Exceptions from the network request or the remote service are not
    caught here and propagate to the caller.
    """
    tei_xml = grobid_api_process_citation(
        raw_citation,
        grobid_api_base=grobid_api_base,
        timeout=timeout,
    )
    # Each stage only runs if the previous one produced something truthy.
    parsed_ref = transform_grobid_ref_xml(tei_xml) if tei_xml else None
    return grobid_ref_to_release(parsed_ref) if parsed_ref else None
|