aboutsummaryrefslogtreecommitdiffstats
path: root/python/parse_jalc_xml.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-03-11 16:08:26 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-21 11:41:29 -0700
commitad1876a204cd2379ca4ccfb9f174dbe84f373d9a (patch)
tree9751daa8dc1ee71d1f131aefbb0ca7cede113466 /python/parse_jalc_xml.py
parent351393f4a1c6e86e3fd48d158e6a173919a80db1 (diff)
downloadfatcat-ad1876a204cd2379ca4ccfb9f174dbe84f373d9a.tar.gz
fatcat-ad1876a204cd2379ca4ccfb9f174dbe84f373d9a.zip
basic JALC XML DOI metadata parser
Diffstat (limited to 'python/parse_jalc_xml.py')
-rw-r--r--python/parse_jalc_xml.py209
1 files changed, 209 insertions, 0 deletions
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
new file mode 100644
index 00000000..7df79421
--- /dev/null
+++ b/python/parse_jalc_xml.py
@@ -0,0 +1,209 @@
+
+import sys
+import json
+import datetime
+import unicodedata
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+DATE_FMT = "%Y-%m-%d"
+
+def is_cjk(s):
+ if not s:
+ return False
+ return unicodedata.name(s[0]).startswith("CJK")
+
+class JalcXmlParser():
+ """
+ Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity
+
+ NOTE: some JALC DOIs seem to get cross-registered with Crossref
+ """
+
+ def __init__(self):
+ pass
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+ # 2. iterate over articles, call parse_article on each
+ for record in soup.find_all("Description"):
+ resp = self.parse_record(record)
+ print(json.dumps(resp))
+ #sys.exit(-1)
+
+
+ def parse_record(self, record):
+ """
+ In JALC metadata, both English and Japanese records are given for most
+ fields.
+ """
+
+ #extra = dict()
+ #extra_jalc = dict()
+
+ titles = record.find_all("title")
+ title = titles[0].string.strip()
+ original_title = None
+ if title.endswith('.'):
+ title = title[:-1]
+ if len(titles) > 1:
+ original_title = titles[1].string.strip()
+ if original_title.endswith('.'):
+ original_title = original_title[:-1]
+
+ doi = None
+ if record.doi:
+ doi = record.doi.string.lower().strip()
+ assert doi.startswith('10.')
+
+ contribs = []
+ people = record.find_all("Person")
+ if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
+ # both english and japanese names are included
+ for i in range(int(len(people)/2)):
+ # both english and japanese names are included for every author
+ eng = people[i*2]
+ jpn = people[i*2 + 1]
+ raw_name = eng.find('name')
+ orig_name = jpn.find('name')
+ if not raw_name:
+ raw_name = orig_name
+ contrib = dict(
+ raw_name=raw_name.string,
+ role='author',
+ )
+ if raw_name and orig_name:
+ contrib['extra'] = dict(original_name=orig_name.string)
+ contribs.append(contrib)
+ elif people:
+ for eng in people:
+ raw_name = eng.find('name')
+ contrib = dict(
+ raw_name=eng.find('name').string,
+ role='author',
+ )
+ contribs.append(contrib)
+
+ release_year = None
+ release_date = None
+ date = record.date or None
+ if date:
+ date = date.string
+ if len(date) is 10:
+ release_date = datetime.datetime.strptime(state['completed-date'], DATE_FMT).date()
+ release_year = release_date.year
+ release_date = release_date.isoformat()
+ elif len(date) is 4:
+ release_year = int(date)
+
+ pages = None
+ if record.startingPage:
+ pages = record.startingPage.string
+ if record.endingPage:
+ pages = "{}-{}".format(pages, record.endingPage.string)
+ volume = None
+ if record.volume:
+ volume = record.volume.string
+ issue = None
+ if record.number:
+ # note: number/issue transform
+ issue = record.number.string
+
+ issn = None
+ issn_list = record.find_all("issn")
+ if issn_list:
+ # if we wanted the other ISSNs, would also need to uniq the list.
+ # But we only need one to lookup ISSN-L/container
+ issn = issn_list[0].string
+
+ container = dict()
+ container_extra = dict()
+ container_name = None
+ if record.publicationName:
+ pubs = [p.string.strip() for p in record.find_all("publicationName")]
+ pubs = [p for p in pubs if p]
+ assert(pubs)
+ if len(pubs) > 1 and pubs[0] == pubs[1]:
+ pubs = [pubs[0]]
+ elif len(pubs) > 1 and is_cjk(pubs[0]):
+ # ordering is not reliable
+ pubs = [pubs[1], pubs[0]]
+ container_name = pubs[0]
+ container['name'] = container_name
+ if len(pubs) > 1:
+ orig_container_name = pubs[1]
+ container_extra['original_name'] = pubs[1]
+ publisher = None
+ if record.publisher:
+ pubs = [p.string.strip() for p in record.find_all("publisher")]
+ pubs = [p for p in pubs if p]
+ if len(pubs) > 1 and pubs[0] == pubs[1]:
+ pubs = [pubs[0]]
+ elif len(pubs) > 1 and is_cjk(pubs[0]):
+ # ordering is not reliable
+ pubs = [pubs[1], pubs[0]]
+ publisher = pubs[0]
+ container['publisher'] = publisher
+ if len(pubs) > 1:
+ container_extra['publisher_alt_name'] = pubs[1]
+ if container_extra:
+ container['extra'] = container_extra
+ if not container:
+ container = None
+
+ # the vast majority of works are in japanese
+ # TODO: any indication when *not* in japanese?
+ lang = "ja"
+
+ # reasonable default for this collection
+ release_type = "article-journal"
+
+ re = dict(
+ work_id=None,
+ title=title,
+ original_title=original_title,
+ release_type="article-journal",
+ release_status='submitted', # XXX: source_type?
+ release_date=release_date,
+ release_year=release_year,
+ #arxiv_id
+ doi=doi,
+ #pmid
+ #pmcid
+ #isbn13 # never in Article
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ publisher=publisher,
+ language=lang,
+ #license_slug # not in MEDLINE
+
+ # content, mimetype, lang
+ #abstracts=abstracts,
+
+ # raw_name, role, raw_affiliation, extra
+ contribs=contribs,
+
+ # name, type, publisher, issnl
+ # extra: issnp, issne, original_name, languages, country
+ container=container,
+
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # pubmed: retraction refs
+ #extra=extra,
+ )
+ return re
+
+if __name__=='__main__':
+ parser = JalcXmlParser()
+ parser.parse_file(open(sys.argv[1]))