initial flesh out of JALC parser

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-15 12:02:55 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch)
tree: 930331468462a74873aebb44b88c051e8b096c4f /python/parse_jalc_xml.py
parent: 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff)
download: fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz
fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip
1 files changed, 0 insertions, 209 deletions
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
deleted file mode 100644
index d7817df9..00000000
--- a/python/parse_jalc_xml.py
+++ /dev/null
@@ -1,209 +0,0 @@
-
-import sys
-import json
-import datetime
-import unicodedata
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-DATE_FMT = "%Y-%m-%d"
-
-def is_cjk(s):
-    if not s:
-        return False
-    return unicodedata.name(s[0]).startswith("CJK")
-
-class JalcXmlParser():
-    """
-    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity
-
-    NOTE: some JALC DOIs seem to get cross-registered with Crossref
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for record in soup.find_all("Description"):
-            resp = self.parse_record(record)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-
-    def parse_record(self, record):
-        """
-        In JALC metadata, both English and Japanese records are given for most
-        fields.
-        """
-
-        #extra = dict()
-        #extra_jalc = dict()
-
-        titles = record.find_all("title")
-        title = titles[0].string.strip()
-        original_title = None
-        if title.endswith('.'):
-            title = title[:-1]
-        if len(titles) > 1:
-            original_title = titles[1].string.strip()
-            if original_title.endswith('.'):
-                original_title = original_title[:-1]
-
-        doi = None
-        if record.doi:
-            doi = record.doi.string.lower().strip()
-            assert doi.startswith('10.')
-
-        contribs = []
-        people = record.find_all("Person")
-        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
-            # both english and japanese names are included
-            for i in range(int(len(people)/2)):
-                # both english and japanese names are included for every author
-                eng = people[i*2]
-                jpn = people[i*2 + 1]
-                raw_name = eng.find('name')
-                orig_name = jpn.find('name')
-                if not raw_name:
-                    raw_name = orig_name
-                contrib = dict(
-                    raw_name=raw_name.string,
-                    role='author',
-                )
-                if raw_name and orig_name:
-                    contrib['extra'] = dict(original_name=orig_name.string)
-                contribs.append(contrib)
-        elif people:
-            for eng in people:
-                raw_name = eng.find('name')
-                contrib = dict(
-                    raw_name=eng.find('name').string,
-                    role='author',
-                )
-                contribs.append(contrib)
-
-        release_year = None
-        release_date = None
-        date = record.date or None
-        if date:
-            date = date.string
-            if len(date) is 10:
-                release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date()
-                release_year = release_date.year
-                release_date = release_date.isoformat()
-            elif len(date) is 4:
-                release_year = int(date)
-
-        pages = None
-        if record.startingPage:
-            pages = record.startingPage.string
-            if record.endingPage:
-                pages = "{}-{}".format(pages, record.endingPage.string)
-        volume = None
-        if record.volume:
-            volume = record.volume.string
-        issue = None
-        if record.number:
-            # note: number/issue transform
-            issue = record.number.string
-
-        issn = None
-        issn_list = record.find_all("issn")
-        if issn_list:
-            # if we wanted the other ISSNs, would also need to uniq the list.
-            # But we only need one to lookup ISSN-L/container
-            issn = issn_list[0].string
-
-        container = dict()
-        container_extra = dict()
-        container_name = None
-        if record.publicationName:
-            pubs = [p.string.strip() for p in record.find_all("publicationName")]
-            pubs = [p for p in pubs if p]
-            assert(pubs)
-            if len(pubs) > 1 and pubs[0] == pubs[1]:
-                pubs = [pubs[0]]
-            elif len(pubs) > 1 and is_cjk(pubs[0]):
-                # ordering is not reliable
-                pubs = [pubs[1], pubs[0]]
-            container_name = pubs[0]
-            container['name'] = container_name
-            if len(pubs) > 1:
-                orig_container_name = pubs[1]
-                container_extra['original_name'] = pubs[1]
-        publisher = None
-        if record.publisher:
-            pubs = [p.string.strip() for p in record.find_all("publisher")]
-            pubs = [p for p in pubs if p]
-            if len(pubs) > 1 and pubs[0] == pubs[1]:
-                pubs = [pubs[0]]
-            elif len(pubs) > 1 and is_cjk(pubs[0]):
-                # ordering is not reliable
-                pubs = [pubs[1], pubs[0]]
-            publisher = pubs[0]
-            container['publisher'] = publisher
-            if len(pubs) > 1:
-                container_extra['publisher_alt_name'] = pubs[1]
-        if container_extra:
-            container['extra'] = container_extra
-        if not container:
-            container = None
-
-        # the vast majority of works are in japanese
-        # TODO: any indication when *not* in japanese?
-        lang = "ja"
-
-        # reasonable default for this collection
-        release_type = "article-journal"
-
-        re = dict(
-            work_id=None,
-            title=title,
-            original_title=original_title,
-            release_type="article-journal",
-            release_status='submitted', # XXX: source_type?
-            release_date=release_date,
-            release_year=release_year,
-            #arxiv_id
-            doi=doi,
-            #pmid
-            #pmcid
-            #isbn13     # never in Article
-            volume=volume,
-            issue=issue,
-            pages=pages,
-            publisher=publisher,
-            language=lang,
-            #license_slug   # not in MEDLINE
-
-            # content, mimetype, lang
-            #abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
-            contribs=contribs,
-
-            #   name, type, publisher, issnl
-            #   extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
-            #extra=extra,
-        )
-        return re
-
-if __name__=='__main__':
-    parser = JalcXmlParser()
-    parser.parse_file(open(sys.argv[1]))
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-15 12:02:55 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch)
tree	930331468462a74873aebb44b88c051e8b096c4f /python/parse_jalc_xml.py
parent	4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff)
download	fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip