import sys import json import datetime import unicodedata from bs4 import BeautifulSoup from bs4.element import NavigableString DATE_FMT = "%Y-%m-%d" def is_cjk(s): if not s: return False return unicodedata.name(s[0]).startswith("CJK") class JalcXmlParser(): """ Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity NOTE: some JALC DOIs seem to get cross-registered with Crossref """ def __init__(self): pass def parse_file(self, handle): # 1. open with beautiful soup soup = BeautifulSoup(handle, "xml") # 2. iterate over articles, call parse_article on each for record in soup.find_all("Description"): resp = self.parse_record(record) print(json.dumps(resp)) #sys.exit(-1) def parse_record(self, record): """ In JALC metadata, both English and Japanese records are given for most fields. """ #extra = dict() #extra_jalc = dict() titles = record.find_all("title") title = titles[0].string.strip() original_title = None if title.endswith('.'): title = title[:-1] if len(titles) > 1: original_title = titles[1].string.strip() if original_title.endswith('.'): original_title = original_title[:-1] doi = None if record.doi: doi = record.doi.string.lower().strip() assert doi.startswith('10.') contribs = [] people = record.find_all("Person") if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): # both english and japanese names are included for i in range(int(len(people)/2)): # both english and japanese names are included for every author eng = people[i*2] jpn = people[i*2 + 1] raw_name = eng.find('name') orig_name = jpn.find('name') if not raw_name: raw_name = orig_name contrib = dict( raw_name=raw_name.string, role='author', ) if raw_name and orig_name: contrib['extra'] = dict(original_name=orig_name.string) contribs.append(contrib) elif people: for eng in people: raw_name = eng.find('name') contrib = dict( raw_name=eng.find('name').string, role='author', ) contribs.append(contrib) release_year = None release_date = None date = record.date or None if date: date = date.string if len(date) is 10: release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() release_year = release_date.year release_date = release_date.isoformat() elif len(date) is 4: release_year = int(date) pages = None if record.startingPage: pages = record.startingPage.string if record.endingPage: pages = "{}-{}".format(pages, record.endingPage.string) volume = None if record.volume: volume = record.volume.string issue = None if record.number: # note: number/issue transform issue = record.number.string issn = None issn_list = record.find_all("issn") if issn_list: # if we wanted the other ISSNs, would also need to uniq the list. # But we only need one to lookup ISSN-L/container issn = issn_list[0].string container = dict() container_extra = dict() container_name = None if record.publicationName: pubs = [p.string.strip() for p in record.find_all("publicationName")] pubs = [p for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] elif len(pubs) > 1 and is_cjk(pubs[0]): # ordering is not reliable pubs = [pubs[1], pubs[0]] container_name = pubs[0] container['name'] = container_name if len(pubs) > 1: orig_container_name = pubs[1] container_extra['original_name'] = pubs[1] publisher = None if record.publisher: pubs = [p.string.strip() for p in record.find_all("publisher")] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] elif len(pubs) > 1 and is_cjk(pubs[0]): # ordering is not reliable pubs = [pubs[1], pubs[0]] publisher = pubs[0] container['publisher'] = publisher if len(pubs) > 1: container_extra['publisher_alt_name'] = pubs[1] if container_extra: container['extra'] = container_extra if not container: container = None # the vast majority of works are in japanese # TODO: any indication when *not* in japanese? lang = "ja" # reasonable default for this collection release_type = "article-journal" re = dict( work_id=None, title=title, original_title=original_title, release_type="article-journal", release_status='submitted', # XXX: source_type? release_date=release_date, release_year=release_year, #arxiv_id doi=doi, #pmid #pmcid #isbn13 # never in Article volume=volume, issue=issue, pages=pages, publisher=publisher, language=lang, #license_slug # not in MEDLINE # content, mimetype, lang #abstracts=abstracts, # raw_name, role, raw_affiliation, extra contribs=contribs, # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country container=container, # extra: # withdrawn_date # translation_of # subtitle # aliases # container_name # group-title # pubmed: retraction refs #extra=extra, ) return re if __name__=='__main__': parser = JalcXmlParser() parser.parse_file(open(sys.argv[1]))