summary refs log tree commit diff stats
path: root/python/parse_jalc_xml.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-15 12:02:55 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch)
tree 930331468462a74873aebb44b88c051e8b096c4f /python/parse_jalc_xml.py
parent 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff)
download fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz
fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip
initial flesh out of JALC parser
Diffstat (limited to 'python/parse_jalc_xml.py')
-rw-r--r-- python/parse_jalc_xml.py 209
1 files changed, 0 insertions, 209 deletions
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
deleted file mode 100644
index d7817df9..00000000
--- a/python/parse_jalc_xml.py
+++ /dev/null
@@ -1,209 +0,0 @@
-
-import sys
-import json
-import datetime
-import unicodedata
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-DATE_FMT = "%Y-%m-%d"
-
def is_cjk(s):
    """
    Guess whether a string is written in a Chinese/Japanese/Korean script.

    The decision is based on the first alphabetic character of the string, so
    leading whitespace, digits or punctuation do not hide the script. Kana and
    Hangul are recognised in addition to CJK ideographs, because Japanese
    names and titles are frequently written in kana only.

    s: string to inspect; may be None or empty

    Returns True if the first alphabetic character belongs to a CJK script,
    False otherwise (including for empty input or input without any
    alphabetic character).
    """
    if not s:
        return False
    script_prefixes = ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
    for char in s:
        if char.isalpha():
            # the default "" avoids the ValueError that unicodedata.name()
            # raises for characters without an assigned name
            return unicodedata.name(char, "").startswith(script_prefixes)
    return False
-
class JalcXmlParser():
    """
    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity

    Each "Description" element of the input document is turned into one plain
    dict (see parse_record) which parse_file prints as a line of JSON.

    NOTE: some JALC DOIs seem to get cross-registered with Crossref
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """
        Parse a whole JALC XML document and print one JSON object per record.

        handle: markup accepted by BeautifulSoup (e.g. an open file object)
        """

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over records, call parse_record on each
        for record in soup.find_all("Description"):
            print(json.dumps(self.parse_record(record)))

    @staticmethod
    def _node_string(node):
        """Return node.string, or None when the node itself is missing."""
        if node is None:
            return None
        return node.string

    @staticmethod
    def _clean_title(node):
        """Return the stripped title text without one trailing period, or None."""
        text = JalcXmlParser._node_string(node)
        if text is None:
            return None
        text = text.strip()
        if text.endswith('.'):
            text = text[:-1]
        return text

    @staticmethod
    def _person_name(person):
        """Return the text of a Person's "name" child, or None if absent."""
        return JalcXmlParser._node_string(person.find('name'))

    @staticmethod
    def _parse_contribs(people):
        """
        Build the contribs list from the "Person" elements of a record.

        When the list looks like alternating English/Japanese entries (even
        length and the second entry has a CJK name), consecutive entries are
        merged into one contributor; otherwise every entry is one contributor.
        Entries without any usable name are skipped.
        """
        contribs = []
        if not people:
            return contribs
        paired = (len(people) % 2 == 0
                  and is_cjk(JalcXmlParser._person_name(people[1])))
        if paired:
            # both english and japanese names are included for every author
            for eng, jpn in zip(people[0::2], people[1::2]):
                raw_tag = eng.find('name')
                orig_tag = jpn.find('name')
                if raw_tag is None:
                    # fall back to the japanese name
                    raw_tag = orig_tag
                if raw_tag is None:
                    continue
                contrib = dict(
                    raw_name=raw_tag.string,
                    role='author',
                )
                if orig_tag is not None:
                    contrib['extra'] = dict(original_name=orig_tag.string)
                contribs.append(contrib)
        else:
            for person in people:
                name_tag = person.find('name')
                if name_tag is None:
                    continue
                contribs.append(dict(
                    raw_name=name_tag.string,
                    role='author',
                ))
        return contribs

    @staticmethod
    def _parse_date(text):
        """
        Interpret a JALC date string.

        Returns a (release_date, release_year) tuple: an ISO date string plus
        year for a 10-character "YYYY-MM-DD" value, (None, year) for a
        4-character year, and (None, None) for anything else.

        Raises ValueError if a value of matching length is not a valid
        date/year.
        """
        if not text:
            return None, None
        text = text.strip()
        if len(text) == 10:
            parsed = datetime.datetime.strptime(text, DATE_FMT).date()
            return parsed.isoformat(), parsed.year
        if len(text) == 4:
            return None, int(text)
        return None, None

    @staticmethod
    def _primary_and_alt(tags):
        """
        Pick a primary and an alternate name from a list of same-named tags.

        Blank entries are dropped and identical first/second entries collapse
        to one. If the first entry is CJK the two are swapped, because the
        ordering in the source data is not reliable. Returns (primary, alt),
        either of which may be None.
        """
        names = []
        for tag in tags:
            text = tag.string
            if text is not None and text.strip():
                names.append(text.strip())
        if not names:
            return None, None
        if len(names) == 1 or names[0] == names[1]:
            return names[0], None
        if is_cjk(names[0]):
            return names[1], names[0]
        return names[0], names[1]

    def parse_record(self, record):
        """
        Convert a single JALC record element into a release dict.

        In JALC metadata, both English and Japanese records are given for most
        fields. The first title/name/publisher is treated as the primary one
        and the second, when present, as the original-language variant.

        record: BeautifulSoup element for one "Description"

        Returns a JSON-serializable dict of release fields; fields that are
        missing from the record are None (contribs is an empty list).
        """

        titles = record.find_all("title")
        title = self._clean_title(titles[0]) if titles else None
        original_title = None
        if len(titles) > 1:
            original_title = self._clean_title(titles[1])

        doi = None
        doi_text = self._node_string(record.doi)
        if doi_text:
            doi = doi_text.lower().strip()
            assert doi.startswith('10.')

        contribs = self._parse_contribs(record.find_all("Person"))

        release_date, release_year = self._parse_date(
            self._node_string(record.date))

        pages = self._node_string(record.startingPage)
        if pages is not None:
            last_page = self._node_string(record.endingPage)
            if last_page is not None:
                pages = "{}-{}".format(pages, last_page)
        volume = self._node_string(record.volume)
        # note: number/issue transform
        issue = self._node_string(record.number)

        # if we wanted the other ISSNs, would also need to uniq the list.
        # But we only need one to lookup ISSN-L/container
        # NOTE(review): extracted but not yet included in the returned dict
        issn_list = record.find_all("issn")
        issn = issn_list[0].string if issn_list else None

        container = dict()
        container_extra = dict()
        container_name, container_alt = self._primary_and_alt(
            record.find_all("publicationName"))
        if container_name:
            container['name'] = container_name
        if container_alt:
            container_extra['original_name'] = container_alt
        publisher, publisher_alt = self._primary_and_alt(
            record.find_all("publisher"))
        if publisher:
            container['publisher'] = publisher
        if publisher_alt:
            container_extra['publisher_alt_name'] = publisher_alt
        if container_extra:
            container['extra'] = container_extra
        if not container:
            container = None

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status='submitted', # XXX: source_type?
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,

            # raw_name, role, raw_affiliation, extra
            contribs=contribs,

            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container=container,
        )
        return release
-
if __name__=='__main__':
    # Command-line entry point: parse the JALC XML file given as the only
    # argument and print one JSON release per record to stdout.
    if len(sys.argv) != 2:
        sys.stderr.write("usage: {} <jalc-xml-file>\n".format(sys.argv[0]))
        sys.exit(1)
    parser = JalcXmlParser()
    # binary mode lets the XML parser honour the encoding declared in the
    # document instead of the locale default; the with-block closes the file
    with open(sys.argv[1], 'rb') as handle:
        parser.parse_file(handle)