diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-11 16:08:26 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 | 
| commit | ad1876a204cd2379ca4ccfb9f174dbe84f373d9a (patch) | |
| tree | 9751daa8dc1ee71d1f131aefbb0ca7cede113466 | |
| parent | 351393f4a1c6e86e3fd48d158e6a173919a80db1 (diff) | |
| download | fatcat-ad1876a204cd2379ca4ccfb9f174dbe84f373d9a.tar.gz fatcat-ad1876a204cd2379ca4ccfb9f174dbe84f373d9a.zip | |
basic JALC XML DOI metadata parser
| -rw-r--r-- | python/parse_jalc_xml.py | 209 | ||||
| -rw-r--r-- | python/tests/files/jalc_lod_sample.xml | 176 | 
2 files changed, 385 insertions, 0 deletions
# diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
# new file mode 100644
# index 00000000..7df79421
# --- /dev/null
# +++ b/python/parse_jalc_xml.py
# @@ -0,0 +1,209 @@
"""Convert JALC DOI metadata (XML/RDF) into fatcat-style release dicts.

Run as a script with the path of an XML file as the only argument; one JSON
object is printed per ``Description`` element found in the input.
"""

import sys
import json
import datetime
import unicodedata
from bs4 import BeautifulSoup
from bs4.element import NavigableString


DATE_FMT = "%Y-%m-%d"


def is_cjk(s):
    """Return True if the first character of *s* has a Unicode name starting with "CJK".

    Empty or ``None`` input returns False. A first character without a
    Unicode name (for example a newline) also returns False rather than
    raising ValueError.
    """
    if not s:
        return False
    # The "" default avoids ValueError for unnamed code points.
    return unicodedata.name(s[0], "").startswith("CJK")


def _strip_trailing_period(text):
    """Strip surrounding whitespace and at most one trailing '.' from *text*."""
    text = text.strip()
    if text.endswith('.'):
        text = text[:-1]
    return text


def _non_cjk_first(names):
    """Normalize a list of alternate names for the same thing.

    If the first two entries are identical, only one is kept; otherwise, if
    the first entry starts with a CJK character, the first two entries are
    swapped (source ordering is not reliable). Any other list is returned
    unchanged.
    """
    if len(names) > 1 and names[0] == names[1]:
        return [names[0]]
    if len(names) > 1 and is_cjk(names[0]):
        return [names[1], names[0]]
    return names


def _parse_date(raw):
    """Return ``(release_date, release_year)`` for the date text *raw*.

    A 10-character value is parsed with DATE_FMT and yields an ISO date
    string plus its year; a 4-character value yields only an integer year;
    anything else yields ``(None, None)``.

    Raises ValueError if a 10- or 4-character value is not a valid
    date/integer.
    """
    if len(raw) == 10:
        parsed = datetime.datetime.strptime(raw, DATE_FMT).date()
        return parsed.isoformat(), parsed.year
    if len(raw) == 4:
        return None, int(raw)
    return None, None


class JalcXmlParser():
    """
    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity

    NOTE: some JALC DOIs seem to get cross-registered with Crossref
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """Parse every ``Description`` element in *handle* and print it as JSON.

        *handle* is passed straight to BeautifulSoup using the "xml" parser.
        One JSON document is written to stdout per record.
        """
        soup = BeautifulSoup(handle, "xml")
        for record in soup.find_all("Description"):
            resp = self.parse_record(record)
            print(json.dumps(resp))

    def parse_record(self, record):
        """Build a release dict from a single ``Description`` element.

        In JALC metadata, both English and Japanese records are given for most
        fields.

        Returns a dict with the keys work_id, title, original_title,
        release_type, release_status, release_date, release_year, doi,
        volume, issue, pages, publisher, language, contribs and container.

        Raises IndexError if the record has no ``title`` element, and
        AssertionError if the DOI does not start with "10." or if
        ``publicationName`` elements are present but all empty.
        """
        titles = record.find_all("title")
        title = _strip_trailing_period(titles[0].string)
        original_title = None
        if len(titles) > 1:
            original_title = _strip_trailing_period(titles[1].string)

        doi = None
        if record.doi:
            doi = record.doi.string.lower().strip()
            assert doi.startswith('10.')

        contribs = []
        people = record.find_all("Person")
        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
            # Persons come in (english, japanese) pairs, one pair per author.
            for eng, jpn in zip(people[0::2], people[1::2]):
                raw_name = eng.find('name')
                orig_name = jpn.find('name')
                if raw_name is None:
                    raw_name = orig_name
                contrib = dict(
                    raw_name=raw_name.string,
                    role='author',
                )
                if orig_name is not None:
                    contrib['extra'] = dict(original_name=orig_name.string)
                contribs.append(contrib)
        elif people:
            for person in people:
                contribs.append(dict(
                    raw_name=person.find('name').string,
                    role='author',
                ))

        release_date = None
        release_year = None
        if record.date:
            # Tolerate surrounding whitespace and an element with no text.
            release_date, release_year = _parse_date(
                (record.date.string or "").strip())

        pages = None
        if record.startingPage:
            pages = record.startingPage.string
            if record.endingPage:
                pages = "{}-{}".format(pages, record.endingPage.string)
        volume = None
        if record.volume:
            volume = record.volume.string
        issue = None
        if record.number:
            # note: number/issue transform
            issue = record.number.string

        # TODO: the ISSN is extracted but not yet included in the output.
        # If we wanted the other ISSNs, would also need to uniq the list;
        # only one is needed to lookup ISSN-L/container.
        issn = None
        issn_list = record.find_all("issn")
        if issn_list:
            issn = issn_list[0].string

        container = dict()
        container_extra = dict()
        if record.publicationName:
            pubs = [p.string.strip() for p in record.find_all("publicationName")]
            pubs = [p for p in pubs if p]
            assert(pubs)
            pubs = _non_cjk_first(pubs)
            container['name'] = pubs[0]
            if len(pubs) > 1:
                container_extra['original_name'] = pubs[1]
        publisher = None
        if record.publisher:
            pubs = [p.string.strip() for p in record.find_all("publisher")]
            pubs = _non_cjk_first([p for p in pubs if p])
            # Skip instead of raising IndexError when every element is blank.
            if pubs:
                publisher = pubs[0]
                container['publisher'] = publisher
                if len(pubs) > 1:
                    container_extra['publisher_alt_name'] = pubs[1]
        if container_extra:
            container['extra'] = container_extra
        if not container:
            container = None

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        # Not populated from this source (yet): arxiv_id, pmid, pmcid,
        # isbn13, license_slug, abstracts, extra.
        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status='submitted', # XXX: source_type?
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            # raw_name, role, extra
            contribs=contribs,
            # name, publisher, extra
            container=container,
        )
        return release


if __name__=='__main__':
    parser = JalcXmlParser()
    # Close the input file once parsing is finished.
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)

# Header of the next file in this patch continues on the following line:
# diff --git
a/python/tests/files/jalc_lod_sample.xml b/python/tests/files/jalc_lod_sample.xml new file mode 100644 index 00000000..3a9dd770 --- /dev/null +++ b/python/tests/files/jalc_lod_sample.xml @@ -0,0 +1,176 @@ +<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" +xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" +xmlns:foaf="http://xmlns.com/foaf/0.1/" +xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/"> +<rdf:Description rdf:about="http://doi.org/10.2497/jjspm.36.898"> +<prism:doi>10.2497/jjspm.36.898</prism:doi> +<dcterms:title>New carbides in the Ni-Ti-Mo-C system.</dcterms:title> +<dcterms:title>Ni-Ti-Mo-C系に出現する新炭化物相について</dcterms:title> +<dcterms:creator> +<foaf:Person> +<foaf:name>Hashimoto Yasuhiko</foaf:name> +<foaf:familyName>Hashimoto</foaf:familyName> +<foaf:givenName>Yasuhiko</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>橋本 雍彦</foaf:name> +<foaf:familyName>橋本</foaf:familyName> +<foaf:givenName>雍彦</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Koyama Koichiro</foaf:name> +<foaf:familyName>Koyama</foaf:familyName> +<foaf:givenName>Koichiro</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>香山 滉一郎</foaf:name> +<foaf:familyName>香山</foaf:familyName> +<foaf:givenName>滉一郎</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Suzuki Kenji</foaf:name> +<foaf:familyName>Suzuki</foaf:familyName> +<foaf:givenName>Kenji</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>鈴木 建次</foaf:name> +<foaf:familyName>鈴木</foaf:familyName> +<foaf:givenName>建次</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Takahashi Teruo</foaf:name> +<foaf:familyName>Takahashi</foaf:familyName> +<foaf:givenName>Teruo</foaf:givenName> +</foaf:Person> 
+</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>高橋 輝男</foaf:name> +<foaf:familyName>高橋</foaf:familyName> +<foaf:givenName>輝男</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dc:creator>Hashimoto Yasuhiko</dc:creator> +<dc:creator>橋本 雍彦</dc:creator> +<dc:creator>Koyama Koichiro</dc:creator> +<dc:creator>香山 滉一郎</dc:creator> +<dc:creator>Suzuki Kenji</dc:creator> +<dc:creator>鈴木 建次</dc:creator> +<dc:creator>Takahashi Teruo</dc:creator> +<dc:creator>高橋 輝男</dc:creator> +<dcterms:publisher>Japan Society of Powder and Powder Metallurgy</dcterms:publisher> +<dcterms:publisher>一般社団法人 粉体粉末冶金協会</dcterms:publisher> +<dcterms:date rdf:resource="http://www.w3.org/2001/XMLSchema#date">1989</dcterms:date> +<prism:volume>36</prism:volume> +<prism:number>8</prism:number> +<prism:startingPage>898</prism:startingPage> +<prism:endingPage>902</prism:endingPage> +<prism:issn>0532-8799</prism:issn> +<prism:issn>0532-8799</prism:issn> +<prism:issn>1880-9014</prism:issn> +<prism:issn>1880-9014</prism:issn> +<dcterms:publicationName> +Journal of the Japan Society of Powder and Powder Metallurgy +</dcterms:publicationName> +<dcterms:publicationName>粉体および粉末冶金</dcterms:publicationName> +</rdf:Description> +<rdf:Description rdf:about="http://doi.org/10.2497/jjspm.36.903"> +<prism:doi>10.2497/jjspm.36.903</prism:doi> +<dcterms:title> +Effects of grain size on cutting performance of Al2O3-TiC ceramics tool. 
+</dcterms:title> +<dcterms:title>Al2O3-TiCセラミックス工具の切削性能におよぼすセラミックス粒度の影響 +</dcterms:title> +<dcterms:creator> +<foaf:Person> +<foaf:name>Katsumura Yuji</foaf:name> +<foaf:familyName>Katsumura</foaf:familyName> +<foaf:givenName>Yuji</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>勝村 祐次</foaf:name> +<foaf:familyName>勝村</foaf:familyName> +<foaf:givenName>祐次</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Sobata Kaoru</foaf:name> +<foaf:familyName>Sobata</foaf:familyName> +<foaf:givenName>Kaoru</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>蕎麦田 薫</foaf:name> +<foaf:familyName>蕎麦田</foaf:familyName> +<foaf:givenName>薫</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Uehara Yoshito</foaf:name> +<foaf:familyName>Uehara</foaf:familyName> +<foaf:givenName>Yoshito</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>上原 好人</foaf:name> +<foaf:familyName>上原</foaf:familyName> +<foaf:givenName>好人</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>Suzuki Hisashi</foaf:name> +<foaf:familyName>Suzuki</foaf:familyName> +<foaf:givenName>Hisashi</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dcterms:creator> +<foaf:Person> +<foaf:name>鈴木 寿</foaf:name> +<foaf:familyName>鈴木</foaf:familyName> +<foaf:givenName>寿</foaf:givenName> +</foaf:Person> +</dcterms:creator> +<dc:creator>Katsumura Yuji</dc:creator> +<dc:creator>勝村祐次</dc:creator> +<dc:creator>Sobata Kaoru</dc:creator> +<dc:creator>蕎麦田 薫</dc:creator> +<dc:creator>Uehara Yoshito</dc:creator> +<dc:creator>上原好人</dc:creator> +<dc:creator>Suzuki Hisashi</dc:creator> +<dc:creator>鈴木寿</dc:creator> +<dcterms:publisher>Japan Society of Powder and Powder Metallurgy</dcterms:publisher> +<dcterms:publisher>一般社団法人 粉体粉末冶金協会</dcterms:publisher> 
+<dcterms:date rdf:resource="http://www.w3.org/2001/XMLSchema#date">1989</dcterms:date> +<prism:volume>36</prism:volume> +<prism:number>8</prism:number> +<prism:startingPage>903</prism:startingPage> +<prism:endingPage>907</prism:endingPage> +<prism:issn>0532-8799</prism:issn> +<prism:issn>0532-8799</prism:issn> +<prism:issn>1880-9014</prism:issn> +<prism:issn>1880-9014</prism:issn> +<dcterms:publicationName> +Journal of the Japan Society of Powder and Powder Metallurgy +</dcterms:publicationName> +<dcterms:publicationName>粉体および粉末冶金</dcterms:publicationName> +</rdf:Description> +</rdf:RDF> | 
