1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
"""
This file contains helpers to fuzzy match a raw citation string:
- try to parse it with GROBID into structured form
- transform the GROBID XML response to a simple dict/struct
- run fuzzycat lookup
Note that this chain hits several external services, and should be wrapped in a
timeout and try/except! In the future, perhaps this should be async so it can
run in parallel with the "regular" query?
"""
import io
import sys
from typing import Optional, Any, Tuple
import xml.etree.ElementTree as ET
import requests
from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds, ReleaseContrib
from fatcat_scholar.api_entities import entity_to_dict
from fuzzycat.matching import match_release_fuzzy
import fuzzycat.common
import fuzzycat.verify
from fatcat_scholar.grobid2json import biblio_info
def grobid_process_citation(
    raw: str, grobid_host: str = "https://grobid.qa.fatcat.wiki", timeout: float = 10.0
) -> Optional[str]:
    """
    Submit a raw citation string to the GROBID `/api/processCitation`
    endpoint via HTTP POST and hand back the response body.

    Args:
        raw: the unparsed citation text, sent as the `citations` form field
        grobid_host: base URL of the GROBID service (no trailing path)
        timeout: passed through to `requests.post`

    Returns the response text on an HTTP 200. Returns None, after printing a
    message to stderr, when the request times out or any other status code
    comes back. Only `requests.Timeout` is caught here; other exceptions
    raised by `requests` propagate to the caller.
    """
    endpoint = grobid_host + "/api/processCitation"
    # consolidateCitations=0 is sent verbatim as part of the form payload
    form_fields = {"citations": raw, "consolidateCitations": 0}

    try:
        reply = requests.post(endpoint, data=form_fields, timeout=timeout)
    except requests.Timeout:
        print("GROBID request (HTTP POST) timeout", file=sys.stderr)
        return None

    if reply.status_code == 200:
        return reply.text

    print(f"GROBID request (HTTP POST) failed: {reply}", file=sys.stderr)
    return None
def transform_grobid(raw_xml: str) -> Optional[dict]:
    """
    Parse a GROBID XML response string and convert it to a plain dict using
    `biblio_info`.

    Returns the dict, or None when every value in it is falsy (i.e. nothing
    useful was extracted). XML parse errors are not caught here.
    """
    # strip the TEI default-namespace declaration up front, so that
    # biblio_info can be called with an empty namespace prefix
    without_ns = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
    root = ET.parse(io.StringIO(without_ns)).getroot()
    parsed = biblio_info(root, ns="")
    return parsed if any(parsed.values()) else None
def ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Build a ReleaseEntity from a reference dict (as produced by
    `transform_grobid`).

    Keys read from `ref`: "authors" (each author dict is read for "name",
    "given_name", "surname"), "title", "volume", "issue", "pages", "doi",
    "pmid", "pmcid", "arxiv_id", "journal", and "date". Missing keys simply
    become None on the entity.

    The journal name, when present, is stored under
    `extra["container_name"]`. `release_year` is only set when "date" is
    exactly a four-digit string; any other date format is ignored.
    """
    # a missing or None "authors" value is treated as an empty list
    contribs = [
        ReleaseContrib(
            raw_name=person.get("name"),
            given_name=person.get("given_name"),
            surname=person.get("surname"),
        )
        for person in ref.get("authors") or []
    ]

    ext_ids = ReleaseExtIds(
        doi=ref.get("doi"),
        pmid=ref.get("pmid"),
        pmcid=ref.get("pmcid"),
        arxiv=ref.get("arxiv_id"),
    )

    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ext_ids,
    )

    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}

    date = ref.get("date")
    if date and len(date) == 4 and date.isdigit():
        release.release_year = int(date)

    return release
def fuzzy_match(
    release: ReleaseEntity, es_client: Any, api_client: Any, timeout: float = 10.0
) -> Optional[Tuple[str, str, ReleaseEntity]]:
    """
    Look for existing release entities with metadata similar to `release`,
    using fuzzycat (backed by elasticsearch).

    Returns None when there are no candidates, or when the best-ranked
    candidate was verified as DIFFERENT. Otherwise returns a single tuple:

        (status: str, reason: str, existing: ReleaseEntity)

    where status is the name of a fuzzycat.common.Status value. Candidates
    are ranked "strongest match first", in this order:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    Eg, if there is any EXACT match, that is always the one returned; an
    AMBIGUOUS result is only returned if no candidate ranked better than
    ambiguous.

    Raises NotImplementedError if any candidate verifies as Status.TODO
    (TODO ranks ahead of every other status, so it always wins the sort).

    TODO: actually do something with timeout (currently unused)
    """
    # lower number == higher priority when choosing among verified matches
    rank_by_status = {
        fuzzycat.common.Status.TODO: 0,
        fuzzycat.common.Status.EXACT: 10,
        fuzzycat.common.Status.STRONG: 20,
        fuzzycat.common.Status.WEAK: 30,
        fuzzycat.common.Status.AMBIGUOUS: 40,
        fuzzycat.common.Status.DIFFERENT: 60,
    }

    # TODO: the size here is a first guess; what should it really be?
    candidates = match_release_fuzzy(release, size=10, es=es_client)
    if not candidates:
        return None

    release_dict = entity_to_dict(release, api_client=api_client.api_client)

    # pair every candidate with its verification result
    verified = []
    for candidate in candidates:
        candidate_dict = entity_to_dict(candidate, api_client=api_client.api_client)
        verdict = fuzzycat.verify.verify(release_dict, candidate_dict)
        verified.append((verdict, candidate))

    # pick the "closest" match; the sort is stable, so ties keep candidate order
    ranked = sorted(verified, key=lambda pair: rank_by_status[pair[0].status])
    best_verdict, existing = ranked[0]

    if best_verdict.status == fuzzycat.common.Status.DIFFERENT:
        return None
    if best_verdict.status == fuzzycat.common.Status.TODO:
        raise NotImplementedError("fuzzycat verify hit a Status.TODO")
    return (best_verdict.status.name, best_verdict.reason.value, existing)
def try_fuzzy_match(
    citation: str, grobid_host: str, es_client: Any, fatcat_api_client: Any
) -> Optional[str]:
    """
    All-in-one helper: raw citation string in, work identifier out.

    Runs the citation through GROBID, transforms the response into a
    reference dict, builds a release entity from it, and fuzzy-matches that
    against existing releases. Both the GROBID call and the fuzzy match are
    given a 3.0 second timeout argument.

    Returns "work_<work_id>" of the matched release when the match status is
    EXACT, STRONG, or WEAK; returns None if any step yields nothing or the
    match is weaker than that.
    """
    grobid_xml = grobid_process_citation(citation, grobid_host=grobid_host, timeout=3.0)
    ref = transform_grobid(grobid_xml) if grobid_xml else None
    if not ref:
        return None

    matched = fuzzy_match(
        ref_to_release(ref),
        es_client=es_client,
        api_client=fatcat_api_client,
        timeout=3.0,
    )
    # only sufficiently confident statuses count as a hit
    if matched and matched[0] in ("EXACT", "STRONG", "WEAK"):
        return f"work_{matched[2].work_id}"
    return None
if __name__ == "__main__":
    # Demo showing how to integrate the above functions together.
    # The raw citation string is taken from the first command-line argument;
    # each intermediate result is printed, and the script exits with status 0
    # as soon as any stage produces nothing.
    import os

    import elasticsearch
    import fatcat_openapi_client

    raw_citation = sys.argv[1]

    print("Sending to GROBID...")
    grobid_xml = grobid_process_citation(raw_citation)
    print(grobid_xml)
    if not grobid_xml:
        sys.exit(0)

    parsed_ref = transform_grobid(grobid_xml)
    print(parsed_ref)
    if not parsed_ref:
        sys.exit(0)

    demo_release = ref_to_release(parsed_ref)
    print(demo_release)

    # both backends can be overridden through environment variables
    search_client = elasticsearch.Elasticsearch(
        os.environ.get("ELASTICSEARCH_BACKEND", "https://search.fatcat.wiki")
    )

    fatcat_conf = fatcat_openapi_client.Configuration()
    fatcat_conf.host = os.environ.get("FATCAT_API_HOST", "https://api.fatcat.wiki/v0")
    fatcat_api = fatcat_openapi_client.DefaultApi(
        fatcat_openapi_client.ApiClient(fatcat_conf)
    )

    result = fuzzy_match(demo_release, es_client=search_client, api_client=fatcat_api)
    print(result)
    if not (result and result[0] in ("EXACT", "STRONG", "WEAK")):
        sys.exit(0)
    print(result[2].work_id)
|