diff options
-rw-r--r-- | python/parse_jalc_xml.py | 209 | ||||
-rw-r--r-- | python/tests/files/jalc_lod_sample.xml | 176 |
2 files changed, 385 insertions, 0 deletions
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
new file mode 100644
index 00000000..7df79421
--- /dev/null
+++ b/python/parse_jalc_xml.py
@@ -0,0 +1,209 @@
+
+import sys
+import json
+import datetime
+import unicodedata
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+DATE_FMT = "%Y-%m-%d"
+
+# Unicode character name prefixes of the scripts that is_cjk() recognizes
+CJK_NAME_PREFIXES = ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
+
+def is_cjk(s):
+    """
+    Returns True if the first non-whitespace character of `s` is a CJK
+    ideograph, Japanese kana, or Hangul; False otherwise (including for
+    empty or None input).
+    """
+    s = s.lstrip() if s else ""
+    # the "" default avoids a ValueError for characters which have no name
+    return bool(s) and unicodedata.name(s[0], "").startswith(CJK_NAME_PREFIXES)
+
+def _text(tag):
+    """Returns the stripped text of a tag; None if tag is missing or empty."""
+    if tag is None or tag.string is None:
+        return None
+    return tag.string.strip() or None
+
+def _all_text(record, name):
+    """Returns the non-empty text of every `name` tag in record, in order."""
+    found = [_text(tag) for tag in record.find_all(name)]
+    return [t for t in found if t]
+
+def _english_first(values):
+    """
+    Collapses an identical leading pair of values to one, or else moves a
+    leading CJK value to second place, so that values[0] is the english form.
+    """
+    if len(values) > 1 and values[0] == values[1]:
+        return [values[0]]
+    if len(values) > 1 and is_cjk(values[0]):
+        # ordering is not reliable
+        return [values[1], values[0]]
+    return values
+
+def _strip_period(title):
+    """Removes a single trailing period from a title, if there is one."""
+    if title.endswith('.'):
+        return title[:-1]
+    return title
+
+class JalcXmlParser():
+    """
+    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity
+
+    NOTE: some JALC DOIs seem to get cross-registered with Crossref
+    """
+
+    def __init__(self):
+        pass
+
+    def parse_file(self, handle):
+        """
+        Parses a JALC XML/RDF document from an open file handle, and prints
+        every record to stdout as a JSON object, one per line.
+        """
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over records, call parse_record on each
+        for record in soup.find_all("Description"):
+            resp = self.parse_record(record)
+            print(json.dumps(resp))
+
+    def parse_record(self, record):
+        """
+        Converts a single record (a "Description" element) to a dict of
+        release fields. Fields not found in the record are set to None.
+
+        In JALC metadata, both English and Japanese records are given for most
+        fields.
+
+        Raises AssertionError if the DOI does not start with "10.", and
+        ValueError if a 10-character date is not in YYYY-MM-DD format.
+        """
+        titles = _all_text(record, "title")
+        title = _strip_period(titles[0]) if titles else None
+        original_title = _strip_period(titles[1]) if len(titles) > 1 else None
+
+        doi = _text(record.doi)
+        if doi:
+            doi = doi.lower()
+            assert doi.startswith('10.')
+
+        contribs = []
+        names = [_text(p.find('name')) for p in record.find_all("Person")]
+        if names and (len(names) % 2 == 0) and is_cjk(names[1]):
+            # both english and japanese names are included for every author
+            for eng, jpn in zip(names[0::2], names[1::2]):
+                if not (eng or jpn):
+                    continue
+                contrib = dict(raw_name=eng or jpn, role='author')
+                if eng and jpn:
+                    contrib['extra'] = dict(original_name=jpn)
+                contribs.append(contrib)
+        else:
+            contribs = [dict(raw_name=n, role='author') for n in names if n]
+
+        release_year = None
+        release_date = None
+        date = _text(record.date)
+        if date and len(date) == 10:
+            release_date = datetime.datetime.strptime(date, DATE_FMT).date()
+            release_year = release_date.year
+            release_date = release_date.isoformat()
+        elif date and len(date) == 4 and date.isdigit():
+            release_year = int(date)
+
+        pages = _text(record.startingPage)
+        end_page = _text(record.endingPage)
+        if pages and end_page:
+            pages = "{}-{}".format(pages, end_page)
+        volume = _text(record.volume)
+        # note: number/issue transform
+        issue = _text(record.number)
+
+        issn = None
+        issn_list = _all_text(record, "issn")
+        if issn_list:
+            # if we wanted the other ISSNs, would also need to uniq the list.
+            # But we only need one to lookup ISSN-L/container
+            # TODO: not used yet; the container lookup is not implemented
+            issn = issn_list[0]
+
+        container = dict()
+        container_extra = dict()
+        pubs = _english_first(_all_text(record, "publicationName"))
+        if pubs:
+            container['name'] = pubs[0]
+            if len(pubs) > 1:
+                container_extra['original_name'] = pubs[1]
+        publisher = None
+        pubs = _english_first(_all_text(record, "publisher"))
+        if pubs:
+            publisher = pubs[0]
+            container['publisher'] = publisher
+            if len(pubs) > 1:
+                container_extra['publisher_alt_name'] = pubs[1]
+        if container_extra:
+            container['extra'] = container_extra
+        if not container:
+            container = None
+
+        # the vast majority of works are in japanese
+        # TODO: any indication when *not* in japanese?
+        lang = "ja"
+
+        # reasonable default for this collection
+        release_type = "article-journal"
+
+        release = dict(
+            work_id=None,
+            title=title,
+            original_title=original_title,
+            release_type=release_type,
+            release_status='submitted', # XXX: source_type?
+            release_date=release_date,
+            release_year=release_year,
+            #arxiv_id
+            doi=doi,
+            #pmid
+            #pmcid
+            #isbn13 # never in Article
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=lang,
+            #license_slug # not in MEDLINE
+
+            # content, mimetype, lang
+            #abstracts=abstracts,
+
+            # raw_name, role, raw_affiliation, extra
+            contribs=contribs,
+
+            # name, type, publisher, issnl
+            # extra: issnp, issne, original_name, languages, country
+            container=container,
+
+            # extra:
+            # withdrawn_date
+            # translation_of
+            # subtitle
+            # aliases
+            # container_name
+            # group-title
+            # pubmed: retraction refs
+            #extra=extra,
+        )
+        return release
+
+if __name__=='__main__':
+    parser = JalcXmlParser()
+    # binary mode: lets the XML parser detect the encoding from the document
+    with open(sys.argv[1], 'rb') as handle:
+        parser.parse_file(handle)
diff --git a/python/tests/files/jalc_lod_sample.xml b/python/tests/files/jalc_lod_sample.xml new file mode 100644 index 00000000..3a9dd770 --- /dev/null +++ b/python/tests/files/jalc_lod_sample.xml @@ -0,0 +1,176 @@ +<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" +xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" +xmlns:foaf="http://xmlns.com/foaf/0.1/" +xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/"> +<rdf:Description rdf:about="http://doi.org/10.2497/jjspm.36.898"> +<prism:doi>10.2497/jjspm.36.898</prism:doi> +<dcterms:title>New carbides in the Ni-Ti-Mo-C system.</dcterms:title> +<dcterms:title>Ni-Ti-Mo-C系に出現する新炭化物相について</dcterms:title> +<dcterms:creator> +<foaf:Person> +<foaf:name>Hashimoto Yasuhiko</foaf:name> +<foaf:familyName>Hashimoto</foaf:familyName> +<foaf:givenName>Yasuhiko</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>橋本 雍彦</foaf:name> +<foaf:familyName>橋本</foaf:familyName> +<foaf:givenName>雍彦</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Koyama Koichiro</foaf:name> +<foaf:familyName>Koyama</foaf:familyName> 
+<foaf:givenName>Koichiro</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>香山 滉一郎</foaf:name> +<foaf:familyName>香山</foaf:familyName> +<foaf:givenName>滉一郎</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Suzuki Kenji</foaf:name> +<foaf:familyName>Suzuki</foaf:familyName> +<foaf:givenName>Kenji</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>鈴木 建次</foaf:name> +<foaf:familyName>鈴木</foaf:familyName> +<foaf:givenName>建次</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Takahashi Teruo</foaf:name> +<foaf:familyName>Takahashi</foaf:familyName> +<foaf:givenName>Teruo</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>高橋 輝男</foaf:name> +<foaf:familyName>高橋</foaf:familyName> +<foaf:givenName>輝男</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dc:creator>Hashimoto Yasuhiko</dc:creator> +<dc:creator>橋本 雍彦</dc:creator> +<dc:creator>Koyama Koichiro</dc:creator> +<dc:creator>香山 滉一郎</dc:creator> +<dc:creator>Suzuki Kenji</dc:creator> +<dc:creator>鈴木 建次</dc:creator> +<dc:creator>Takahashi Teruo</dc:creator> +<dc:creator>高橋 輝男</dc:creator> +<dcterms:publisher>Japan Society of Powder and Powder Metallurgy</dcterms:publisher> +<dcterms:publisher>一般社団法人 粉体粉末冶金協会</dcterms:publisher> +<dcterms:date rdf:resource="http://www.w3.org/2001/XMLSchema#date">1989</dcterms:date> +<prism:volume>36</prism:volume> +<prism:number>8</prism:number> +<prism:startingPage>898</prism:startingPage> +<prism:endingPage>902</prism:endingPage> +<prism:issn>0532-8799</prism:issn> +<prism:issn>0532-8799</prism:issn> +<prism:issn>1880-9014</prism:issn> +<prism:issn>1880-9014</prism:issn> +<dcterms:publicationName> +Journal of the Japan Society of Powder and Powder Metallurgy +</dcterms:publicationName> +<dcterms:publicationName>粉体および粉末冶金</dcterms:publicationName> 
+</rdf:Description> +<rdf:Description rdf:about="http://doi.org/10.2497/jjspm.36.903"> +<prism:doi>10.2497/jjspm.36.903</prism:doi> +<dcterms:title> +Effects of grain size on cutting performance of Al2O3-TiC ceramics tool. +</dcterms:title> +<dcterms:title>Al2O3-TiCセラミックス工具の切削性能におよぼすセラミックス粒度の影響 +</dcterms:title> +<dcterms:creator> +<foaf:Person> +<foaf:name>Katsumura Yuji</foaf:name> +<foaf:familyName>Katsumura</foaf:familyName> +<foaf:givenName>Yuji</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>勝村 祐次</foaf:name> +<foaf:familyName>勝村</foaf:familyName> +<foaf:givenName>祐次</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Sobata Kaoru</foaf:name> +<foaf:familyName>Sobata</foaf:familyName> +<foaf:givenName>Kaoru</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>蕎麦田 薫</foaf:name> +<foaf:familyName>蕎麦田</foaf:familyName> +<foaf:givenName>薫</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Uehara Yoshito</foaf:name> +<foaf:familyName>Uehara</foaf:familyName> +<foaf:givenName>Yoshito</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>上原 好人</foaf:name> +<foaf:familyName>上原</foaf:familyName> +<foaf:givenName>好人</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Suzuki Hisashi</foaf:name> +<foaf:familyName>Suzuki</foaf:familyName> +<foaf:givenName>Hisashi</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>鈴木 寿</foaf:name> +<foaf:familyName>鈴木</foaf:familyName> +<foaf:givenName>寿</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dc:creator>Katsumura Yuji</dc:creator> +<dc:creator>勝村祐次</dc:creator> +<dc:creator>Sobata Kaoru</dc:creator> +<dc:creator>蕎麦田 薫</dc:creator> +<dc:creator>Uehara Yoshito</dc:creator> 
+<dc:creator>上原好人</dc:creator> +<dc:creator>Suzuki Hisashi</dc:creator> +<dc:creator>鈴木寿</dc:creator> +<dcterms:publisher>Japan Society of Powder and Powder Metallurgy</dcterms:publisher> +<dcterms:publisher>一般社団法人 粉体粉末冶金協会</dcterms:publisher> +<dcterms:date rdf:resource="http://www.w3.org/2001/XMLSchema#date">1989</dcterms:date> +<prism:volume>36</prism:volume> +<prism:number>8</prism:number> +<prism:startingPage>903</prism:startingPage> +<prism:endingPage>907</prism:endingPage> +<prism:issn>0532-8799</prism:issn> +<prism:issn>0532-8799</prism:issn> +<prism:issn>1880-9014</prism:issn> +<prism:issn>1880-9014</prism:issn> +<dcterms:publicationName> +Journal of the Japan Society of Powder and Powder Metallurgy +</dcterms:publicationName> +<dcterms:publicationName>粉体および粉末冶金</dcterms:publicationName> +</rdf:Description> +</rdf:RDF> |