python/parse_jalc_xml.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209


import sys
import json
import datetime
import unicodedata
from bs4 import BeautifulSoup
from bs4.element import NavigableString


# strptime() format applied to full, 10-character (YYYY-MM-DD) date strings
DATE_FMT = "%Y-%m-%d"

def is_cjk(s):
    """
    Return True if the first character of `s` is from a CJK script.

    The check is based on the Unicode character name of `s[0]`: Han
    ideographs ("CJK ..."), Japanese kana ("HIRAGANA ...", "KATAKANA ...")
    and Korean syllables/jamo ("HANGUL ...") all count as CJK.

    Empty strings and None return False. Characters with no Unicode name
    (eg, control characters such as a leading newline) also return False
    instead of raising ValueError.
    """
    if not s:
        return False
    prefixes = ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
    # Supplying a default avoids the ValueError that unicodedata.name()
    # raises for code points that have no name.
    char_name = unicodedata.name(s[0], "")
    return char_name.startswith(prefixes)

class JalcXmlParser():
    """
    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity

    In JALC metadata, both English and Japanese values are given for most
    fields. Where both are present, the non-CJK value is used as the primary
    value and the other one is kept as the "original"/alternate value.

    NOTE: some JALC DOIs seem to get cross-registered with Crossref
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """
        Parse a whole JALC XML document and print one JSON object per record.

        handle: markup (file-like object or string) handed to BeautifulSoup's
            "xml" parser. Every "Description" element found is converted
            with parse_record() and the result is printed to stdout as JSON.
        """
        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over records, call parse_record on each
        for record in soup.find_all("Description"):
            resp = self.parse_record(record)
            print(json.dumps(resp))

    @staticmethod
    def _strip_period(text):
        """Strip surrounding whitespace, then a single trailing period."""
        text = text.strip()
        if text.endswith('.'):
            text = text[:-1]
        return text

    @staticmethod
    def _text(tag):
        """Return `tag.string`, or None when the tag itself is missing."""
        if tag is None:
            return None
        return tag.string

    @staticmethod
    def _ordered_names(tags):
        """
        Return the non-blank, stripped text of `tags`, non-CJK name first.

        Only the first two names are compared: an exact duplicate pair is
        collapsed to one name, and a pair whose first entry is CJK is
        swapped (the ordering in the source data is not reliable).
        """
        # `.string` is None for empty/nested elements; treat those as blank
        names = [(tag.string or "").strip() for tag in tags]
        names = [name for name in names if name]
        if len(names) > 1:
            if names[0] == names[1]:
                names = [names[0]]
            elif is_cjk(names[0]):
                names = [names[1], names[0]]
        return names

    @staticmethod
    def _parse_contribs(record):
        """
        Build the list of author contribs from the record's Person elements.

        When there is an even number of Person elements and the second one
        has a CJK name, the elements are treated as (english, japanese)
        pairs; the japanese name is stored under extra.original_name.
        Otherwise every Person becomes its own contrib. Persons without any
        "name" child are skipped.
        """
        contribs = []
        people = record.find_all("Person")
        if not people:
            return contribs

        bilingual = False
        if len(people) % 2 == 0:
            second_name = people[1].find('name')
            bilingual = second_name is not None and is_cjk(second_name.string)

        if bilingual:
            # both english and japanese names are included for every author
            for eng, jpn in zip(people[::2], people[1::2]):
                eng_tag = eng.find('name')
                jpn_tag = jpn.find('name')
                name_tag = eng_tag if eng_tag is not None else jpn_tag
                if name_tag is None:
                    # neither entry carries a name; nothing to record
                    continue
                contrib = dict(
                    raw_name=name_tag.string,
                    role='author',
                )
                if jpn_tag is not None:
                    contrib['extra'] = dict(original_name=jpn_tag.string)
                contribs.append(contrib)
        else:
            for person in people:
                name_tag = person.find('name')
                if name_tag is None:
                    continue
                contribs.append(dict(
                    raw_name=name_tag.string,
                    role='author',
                ))
        return contribs

    @staticmethod
    def _parse_date(record):
        """
        Return (release_date, release_year) from the record's date element.

        A 10-character value is parsed with DATE_FMT and yields an ISO date
        string plus its year; a 4-character value yields only an integer
        year. Anything else (including a missing or empty element) yields
        (None, None). Raises ValueError if a 10- or 4-character value is not
        a valid date/integer.
        """
        date_tag = record.date
        if date_tag is None or date_tag.string is None:
            return None, None
        date = date_tag.string.strip()
        if len(date) == 10:
            parsed = datetime.datetime.strptime(date, DATE_FMT).date()
            return parsed.isoformat(), parsed.year
        if len(date) == 4:
            return None, int(date)
        return None, None

    def parse_record(self, record):
        """
        Convert a single "Description" element into a release dict.

        record: parsed element exposing the bs4 tag API (find_all() and
            attribute-style child lookup).

        Returns a dict with the keys work_id, title, original_title,
        release_type, release_status, release_date, release_year, doi,
        volume, issue, pages, publisher, language, contribs and container.

        Raises IndexError if the record has no "title" element, and
        AssertionError if a DOI is present but does not start with "10.".
        """

        titles = record.find_all("title")
        title = self._strip_period(titles[0].string)
        original_title = None
        if len(titles) > 1:
            original_title = self._strip_period(titles[1].string)

        doi = None
        if record.doi:
            doi = record.doi.string.lower().strip()
            assert doi.startswith('10.')

        contribs = self._parse_contribs(record)
        release_date, release_year = self._parse_date(record)

        pages = self._text(record.startingPage)
        if pages is not None:
            ending_page = self._text(record.endingPage)
            if ending_page is not None:
                pages = "{}-{}".format(pages, ending_page)
        volume = self._text(record.volume)
        # note: number/issue transform
        issue = self._text(record.number)

        # if we wanted the other ISSNs, would also need to uniq the list.
        # But we only need one to lookup ISSN-L/container
        # NOTE: the ISSN is extracted here but not yet included in the
        # returned entity.
        issn_list = record.find_all("issn")
        issn = issn_list[0].string if issn_list else None

        container = dict()
        container_extra = dict()
        publication_names = self._ordered_names(
            record.find_all("publicationName"))
        if publication_names:
            container['name'] = publication_names[0]
            if len(publication_names) > 1:
                container_extra['original_name'] = publication_names[1]

        publisher = None
        publisher_names = self._ordered_names(record.find_all("publisher"))
        if publisher_names:
            publisher = publisher_names[0]
            container['publisher'] = publisher
            if len(publisher_names) > 1:
                container_extra['publisher_alt_name'] = publisher_names[1]

        if container_extra:
            container['extra'] = container_extra
        if not container:
            container = None

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status='submitted', # XXX: source_type?
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,

            # raw_name, role, extra
            contribs=contribs,

            #   name, publisher
            #   extra: original_name, publisher_alt_name
            container=container,
        )
        return release

if __name__=='__main__':
    # Usage: parse_jalc_xml.py <path-to-jalc-xml>
    # Prints one JSON release object per record to stdout.
    parser = JalcXmlParser()
    # use a context manager so the input file is always closed
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)