import sys
import json
import datetime
import unicodedata
from bs4 import BeautifulSoup
from bs4.element import NavigableString

DATE_FMT = "%Y-%m-%d"


def is_cjk(s):
    if not s:
        return False
    return unicodedata.name(s[0]).startswith("CJK")
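
# For illustration: unicodedata.name("日") is "CJK UNIFIED IDEOGRAPH-65E5",
# so is_cjk("日本語") returns True, while is_cjk("Nihongo") returns False.
# Note this only inspects the first character of the string.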


class JalcXmlParser:
    """
    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entities

    NOTE: some JALC DOIs seem to get cross-registered with Crossref
    """

    def __init__(self):
        pass
    def parse_file(self, handle):
        # 1. parse the XML with BeautifulSoup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over records, calling parse_record() on each
        for record in soup.find_all("Description"):
            resp = self.parse_record(record)
            print(json.dumps(resp))
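
    # NOTE: BeautifulSoup's "xml" parser feature requires the lxml package
    # to be installed alongside bs4.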

    def parse_record(self, record):
        """
        In JALC metadata, both English and Japanese values are given for most
        fields.
        """

        #extra = dict()
        #extra_jalc = dict()
        titles = record.find_all("title")
        title = titles[0].string.strip()
        original_title = None
        if title.endswith('.'):
            title = title[:-1]
        if len(titles) > 1:
            original_title = titles[1].string.strip()
            if original_title.endswith('.'):
                original_title = original_title[:-1]

        doi = None
        if record.doi:
            doi = record.doi.string.lower().strip()
            assert doi.startswith('10.')
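
        # All DOIs begin with the "10." directory prefix, so the assert above
        # guards against obviously malformed records.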

        contribs = []
        people = record.find_all("Person")
        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
            # both english and japanese names are included for every author
            for i in range(len(people) // 2):
                eng = people[i * 2]
                jpn = people[i * 2 + 1]
                raw_name = eng.find('name')
                orig_name = jpn.find('name')
                if not raw_name:
                    raw_name = orig_name
                contrib = dict(
                    raw_name=raw_name.string,
                    role='author',
                )
                if raw_name and orig_name:
                    contrib['extra'] = dict(original_name=orig_name.string)
                contribs.append(contrib)
        elif people:
            for eng in people:
                raw_name = eng.find('name')
                contrib = dict(
                    raw_name=raw_name.string,
                    role='author',
                )
                contribs.append(contrib)
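
        # The even-count + CJK check above is a heuristic: JALC frequently
        # lists each author twice, with the English form first and the
        # Japanese form second. When that pattern doesn't hold, each Person
        # element is treated as a distinct author.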

        release_year = None
        release_date = None
        date = record.date or None
        if date:
            date = date.string
            if len(date) == 10:
                release_date = datetime.datetime.strptime(date, DATE_FMT).date()
                release_year = release_date.year
                release_date = release_date.isoformat()
            elif len(date) == 4:
                release_year = int(date)
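
        # JALC dates seem to come either as a full "YYYY-MM-DD" string
        # (10 characters) or as a bare "YYYY" year (4 characters); anything
        # else is ignored.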

        pages = None
        if record.startingPage:
            pages = record.startingPage.string
            if record.endingPage:
                pages = "{}-{}".format(pages, record.endingPage.string)

        volume = None
        if record.volume:
            volume = record.volume.string
        issue = None
        if record.number:
            # note: number/issue transform
            issue = record.number.string

        issn = None
        issn_list = record.find_all("issn")
        if issn_list:
            # if we wanted the other ISSNs, we would also need to uniq the
            # list; but we only need one to look up the ISSN-L/container
            issn = issn_list[0].string

        container = dict()
        container_extra = dict()
        container_name = None
        if record.publicationName:
            pubs = [p.string.strip() for p in record.find_all("publicationName")]
            pubs = [p for p in pubs if p]
            assert pubs
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            elif len(pubs) > 1 and is_cjk(pubs[0]):
                # ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            container_name = pubs[0]
            container['name'] = container_name
            if len(pubs) > 1:
                container_extra['original_name'] = pubs[1]

        publisher = None
        if record.publisher:
            pubs = [p.string.strip() for p in record.find_all("publisher")]
            pubs = [p for p in pubs if p]
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            elif len(pubs) > 1 and is_cjk(pubs[0]):
                # ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            publisher = pubs[0]
            container['publisher'] = publisher
            if len(pubs) > 1:
                container_extra['publisher_alt_name'] = pubs[1]

        if container_extra:
            container['extra'] = container_extra
        if not container:
            container = None
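
        # For both publicationName and publisher, duplicate English/Japanese
        # values are collapsed, and the English form is preferred as primary
        # since element ordering is not reliable. If neither field was
        # present, container stays None.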

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status='submitted',  # XXX: source_type?
            release_date=release_date,
            release_year=release_year,
            #arxiv_id
            doi=doi,
            #pmid
            #pmcid
            #isbn13  # never in Article
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            #license_slug  # not in MEDLINE
            # content, mimetype, lang
            #abstracts=abstracts,
            # raw_name, role, raw_affiliation, extra
            contribs=contribs,
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container=container,
            # extra:
            #   withdrawn_date
            #   translation_of
            #   subtitle
            #   aliases
            #   container_name
            #   group-title
            #   pubmed: retraction refs
            #extra=extra,
        )
        return release


if __name__ == '__main__':
    parser = JalcXmlParser()
    with open(sys.argv[1]) as f:
        parser.parse_file(f)
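
# Example usage (the file name is hypothetical):
#   python jalc.py jalc_rdf_dump.xml > releases.json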