diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 12:02:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch) | |
tree | 930331468462a74873aebb44b88c051e8b096c4f /python/parse_jalc_xml.py | |
parent | 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff) | |
download | fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip |
initial flesh out of JALC parser
Diffstat (limited to 'python/parse_jalc_xml.py')
-rw-r--r-- | python/parse_jalc_xml.py | 209 |
1 files changed, 0 insertions, 209 deletions
# Reconstructed from the collapsed diff body of python/parse_jalc_xml.py
# (diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py,
# deleted file mode 100644, index d7817df9..00000000).

import sys
import json
import datetime
import unicodedata


DATE_FMT = "%Y-%m-%d"


def is_cjk(s):
    """
    Return True if the first character of `s` has a Unicode name starting
    with "CJK"; return False for empty/None input.

    NOTE: only the first character is inspected, and kana (HIRAGANA/KATAKANA
    names) do not start with "CJK", so they are not detected here.
    """
    if not s:
        return False
    # Without a default, unicodedata.name() raises ValueError for characters
    # that have no name (eg, control characters).
    return unicodedata.name(s[0], "").startswith("CJK")


class JalcXmlParser():
    """
    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity

    NOTE: some JALC DOIs seem to get cross-registered with Crossref
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """
        Parse the XML in `handle` and print one JSON object per line to
        stdout, one for each "Description" element found.
        """
        # Imported here (its only point of use) so that the record-level
        # helpers stay importable when bs4 is not installed.
        from bs4 import BeautifulSoup

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over records, call parse_record on each
        for record in soup.find_all("Description"):
            resp = self.parse_record(record)
            print(json.dumps(resp))

    @staticmethod
    def _tag_string(tag):
        """Return `tag.string`, or None when the tag itself is missing."""
        if tag is None:
            return None
        return tag.string

    @staticmethod
    def _clean_title(tag):
        """Return the tag's stripped text minus one trailing '.', or None."""
        text = JalcXmlParser._tag_string(tag)
        if text is None:
            return None
        text = text.strip()
        if text.endswith('.'):
            text = text[:-1]
        return text

    @staticmethod
    def _parse_release_date(text):
        """
        Turn a date string into a `(release_date, release_year)` tuple.

        A 10-character value is parsed with DATE_FMT and yields
        `(iso_date_string, year)`; a 4-digit value yields `(None, year)`.
        Anything else (missing, empty, unparseable) yields `(None, None)`.
        """
        if not text:
            return (None, None)
        text = text.strip()
        if len(text) == 10:
            try:
                parsed = datetime.datetime.strptime(text, DATE_FMT).date()
            except ValueError:
                # right length, but not actually a YYYY-MM-DD date
                return (None, None)
            return (parsed.isoformat(), parsed.year)
        if len(text) == 4 and text.isdigit():
            return (None, int(text))
        return (None, None)

    @staticmethod
    def _names_primary_first(tags):
        """
        Collect the non-empty, stripped text of `tags` and normalize the
        first two entries: an exact duplicate pair collapses to one name,
        and if the first name is CJK the pair is swapped (the ordering in
        the source metadata is not reliable).
        """
        names = [(tag.string or "").strip() for tag in tags]
        names = [name for name in names if name]
        if len(names) > 1:
            if names[0] == names[1]:
                names = [names[0]]
            elif is_cjk(names[0]):
                names = [names[1], names[0]]
        return names

    def _parse_contribs(self, record):
        """Build the list of author contrib dicts from "Person" elements."""
        people = record.find_all("Person")
        contribs = []
        if not people:
            return contribs

        paired = (len(people) % 2 == 0
                  and is_cjk(self._tag_string(people[1].find('name'))))
        if paired:
            # both english and japanese names are included for every author,
            # as consecutive (english, japanese) elements
            for eng, jpn in zip(people[0::2], people[1::2]):
                raw_name = self._tag_string(eng.find('name'))
                orig_name = self._tag_string(jpn.find('name'))
                if not raw_name:
                    raw_name = orig_name
                contrib = dict(
                    raw_name=raw_name,
                    role='author',
                )
                if raw_name and orig_name:
                    contrib['extra'] = dict(original_name=orig_name)
                contribs.append(contrib)
        else:
            for person in people:
                contribs.append(dict(
                    raw_name=self._tag_string(person.find('name')),
                    role='author',
                ))
        return contribs

    def parse_record(self, record):
        """
        Convert a single parsed "Description" element into a dict of release
        fields.

        In JALC metadata, both English and Japanese records are given for most
        fields.

        `title` is None when the record has no "title" element. Raises
        AssertionError if a DOI is present but does not start with "10.".
        """
        titles = record.find_all("title")
        title = self._clean_title(titles[0]) if titles else None
        original_title = None
        if len(titles) > 1:
            original_title = self._clean_title(titles[1])

        doi = None
        doi_text = self._tag_string(record.doi)
        if doi_text is not None:
            doi = doi_text.lower().strip()
            assert doi.startswith('10.'), "unexpected DOI: {!r}".format(doi)

        contribs = self._parse_contribs(record)

        release_date, release_year = self._parse_release_date(
            self._tag_string(record.date))

        pages = None
        start_page = record.startingPage
        if start_page is not None:
            pages = start_page.string
            end_page = self._tag_string(record.endingPage)
            if end_page is not None:
                pages = "{}-{}".format(pages, end_page)
        volume = self._tag_string(record.volume)
        # note: number/issue transform
        issue = self._tag_string(record.number)

        # TODO: "issn" elements are not used yet. Only one would be needed to
        # look up ISSN-L/container; the full list would need to be uniq'd.

        container = dict()
        container_extra = dict()
        names = self._names_primary_first(record.find_all("publicationName"))
        if names:
            container['name'] = names[0]
            if len(names) > 1:
                container_extra['original_name'] = names[1]
        publisher = None
        names = self._names_primary_first(record.find_all("publisher"))
        if names:
            publisher = names[0]
            container['publisher'] = publisher
            if len(names) > 1:
                container_extra['publisher_alt_name'] = names[1]
        if container_extra:
            container['extra'] = container_extra
        if not container:
            container = None

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        # Not populated yet: abstracts, license_slug, container issnl, and
        # release-level extra (withdrawn_date, translation_of, subtitle,
        # aliases, container_name, group-title).
        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status='submitted',  # XXX: source_type?
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            # raw_name, role, extra
            contribs=contribs,
            # name, publisher
            # extra: original_name, publisher_alt_name
            container=container,
        )
        return release


if __name__ == '__main__':
    parser = JalcXmlParser()
    # close the input file deterministically once parsing is done
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)