basic JSTOR XML parser

author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 18:42:43 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: ee393b537f3164ad25c9337b658db81192b25629 (patch)
tree: c1962719963942d56b8194694567d25b03dc4f5b /python/parse_jstor_xml.py
parent: a987af927686725f7778475f4c383d59c8c494bf (diff)
download: fatcat-ee393b537f3164ad25c9337b658db81192b25629.tar.gz
fatcat-ee393b537f3164ad25c9337b658db81192b25629.zip
1 files changed, 177 insertions, 0 deletions
diff --git a/python/parse_jstor_xml.py b/python/parse_jstor_xml.py
new file mode 100644
index 00000000..dfb899fd
--- /dev/null
+++ b/python/parse_jstor_xml.py
@@ -0,0 +1,177 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+class JstorXmlParser():
+    """
+    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
+    """
+
+    def __init__(self):
+        pass
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("article"):
+            resp = self.parse_article(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+    def parse_article(self, article):
+
+        journal_meta = article.front.find("journal-meta")
+        article_meta = article.front.find("article-meta")
+
+        extra = dict()
+        extra_jstor = dict()
+
+        journal_title = journal_meta.find("journal-title").string
+        publisher = journal_meta.find("publisher-name").string
+        issn = journal_meta.find("issn")
+        if issn:
+            issn = issn.string
+            if len(issn) == 8:
+                issn = "{}-{}".format(issn[0:4], issn[4:8])
+            else:
+                assert len(issn) == 9
+        container = dict(
+            name=journal_title,
+            publisher=publisher,
+            issn=issn,   # TODO: ISSN-L lookup...
+        )
+
+        doi = article_meta.find("article-id", attr={"pub-id-type": "doi"})
+        if doi:
+            doi = doi.string.lower().strip()
+
+        title = article_meta.find("article-title")
+        if title:
+            title = title.string.strip()
+            if title.endswith('.'):
+                title = title[:-1]
+
+        contribs = []
+        cgroup = article_meta.find("contrib-group")
+        if cgroup:
+            for c in cgroup.find_all("contrib"):
+                given = c.find("given-names")
+                surname = c.find("surname")
+                if given and surname:
+                    name = "{} {}".format(given.string, surname.string)
+                elif surname:
+                    name = surname.string
+                else:
+                    name = None
+                contribs.append(dict(
+                    role=c['contrib-type'],   # XXX: types? mapping?
+                    raw_name=name,
+                ))
+
+        release_year = None
+        release_date = None
+        pub_date = article_meta.find('pub-date')
+        if pub_date and pub_date.year:
+            release_year = int(pub_date.year.string)
+            if pub_date.month and pub_date.day:
+                release_date = datetime.date(
+                    release_year,
+                    int(pub_date.month.string),
+                    int(pub_date.day.string))
+        
+        volume = None
+        if article_meta.volume:
+            volume = article_meta.volume.string or None
+
+        issue = None
+        if article_meta.issue:
+            issue = article_meta.issue.string or None
+
+        pages = None
+        if article_meta.find("page-range"):
+            pages = article_meta.find("page-range").string
+        elif article_meta.fpage:
+            pages = article_meta.fpage.string
+
+        language = None
+        cm = article_meta.find("custom-meta")
+        if cm.find("meta-name").string == "lang":
+            language = cm.find("meta-value").string
+
+        release_type = "article-journal"
+        if "[Abstract]" in title:
+            release_type = "abstract"
+        elif "[Editorial" in title:
+            release_type = "editorial"
+        elif "[Letter" in title:
+            release_type = "letter"
+        elif "[Poem" in title or "[Photograph" in title:
+            release_type = None
+        else if title.startswith("[") and title.endswith("]"):
+            # strip brackets if that is all that is there (eg, translation or non-english)
+            title = title[1:-1]
+
+        # everything in JSTOR is published
+        release_status = "published"
+
+        if extra_jstor:
+            extra['jstor'] = extra_jstor
+        if not extra:
+            extra = None
+
+        re = dict(
+            issn=issn, # not an entity field
+            #work_id
+            title=title,
+            #original_title
+            release_type=release_type,
+            release_status=release_status,
+            release_date=release_date.isoformat(),
+            release_year=release_year,
+            doi=doi,
+            #pmid
+            #pmcid
+            #isbn13     # TODO: ?
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=language,
+            #license_slug   # TODO: ?
+
+            # content, mimetype, lang
+            #abstracts=abstracts,
+
+            # raw_name, role, raw_affiliation, extra
+            contribs=contribs,
+
+            # key, year, container_name, title, locator
+            # extra: volume, authors, issue, publisher, identifiers
+            #refs=refs,
+
+            #   name, type, publisher, issnl
+            #   extra: issnp, issne, original_name, languages, country
+            container=container,
+
+            # extra:
+            #   withdrawn_date
+            #   translation_of
+            #   subtitle
+            #   aliases
+            #   container_name
+            #   group-title
+            #   pubmed: retraction refs
+            extra=extra,
+        )
+        return re
+
+if __name__=='__main__':
+    parser = JstorXmlParser()
+    parser.parse_file(open(sys.argv[1]))
author	Bryan Newbold <bnewbold@robocracy.org>	2019-03-05 18:42:43 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	ee393b537f3164ad25c9337b658db81192b25629 (patch)
tree	c1962719963942d56b8194694567d25b03dc4f5b /python/parse_jstor_xml.py
parent	a987af927686725f7778475f4c383d59c8c494bf (diff)
download	fatcat-ee393b537f3164ad25c9337b658db81192b25629.tar.gz fatcat-ee393b537f3164ad25c9337b658db81192b25629.zip