From 9a944bfb6d994fe2f6865c5b9117920ed99cc5f1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 5 Mar 2019 18:42:43 -0800
Subject: basic JSTOR XML parser

---
 python/parse_jstor_xml.py                          | 177 +++++++++++++++++++++
 .../tests/files/jstor-article-10.2307_111039.xml   |  58 +++++++
 2 files changed, 235 insertions(+)
 create mode 100644 python/parse_jstor_xml.py
 create mode 100644 python/tests/files/jstor-article-10.2307_111039.xml

diff --git a/python/parse_jstor_xml.py b/python/parse_jstor_xml.py
new file mode 100644
index 00000000..dfb899fd
--- /dev/null
+++ b/python/parse_jstor_xml.py
@@ -0,0 +1,177 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+class JstorXmlParser():
+    """
+    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
+    into release dicts, printed as one JSON object per <article> element.
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _text(tag):
+        """Return `tag.string`, or None when the tag itself was not found."""
+        return tag.string if tag is not None else None
+
+    def parse_file(self, handle):
+        """
+        Parse every <article> in `handle` and print each result as a JSON line.
+
+        `handle` is handed directly to BeautifulSoup with the "xml" parser.
+        """
+        soup = BeautifulSoup(handle, "xml")
+        for article in soup.find_all("article"):
+            print(json.dumps(self.parse_article(article)))
+
+    def parse_article(self, article):
+        """
+        Convert one <article> tag into a dict of release fields.
+
+        <front>, <journal-meta> and <article-meta> must be present. Other
+        elements are optional: when one is missing, the matching field is
+        None (or an empty list for `contribs`).
+
+        Returns a dict with the keys issn, title, release_type, release_status,
+        release_date (ISO 8601 string or None), release_year, doi, volume,
+        issue, pages, publisher, language, contribs, container and extra.
+
+        Raises ValueError if the ISSN is neither 8 nor 9 characters long, or
+        if the <pub-date> parts do not form a valid integer year or date.
+        """
+        journal_meta = article.front.find("journal-meta")
+        article_meta = article.front.find("article-meta")
+
+        extra = dict()
+        extra_jstor = dict()
+
+        journal_title = self._text(journal_meta.find("journal-title"))
+        publisher = self._text(journal_meta.find("publisher-name"))
+        issn = self._text(journal_meta.find("issn"))
+        if issn:
+            if len(issn) == 8:
+                # bare digits: insert the hyphen (NNNNNNNN -> NNNN-NNNN)
+                issn = "{}-{}".format(issn[0:4], issn[4:8])
+            elif len(issn) != 9:
+                # raise rather than assert, which `python -O` would strip
+                raise ValueError("unexpected ISSN: {!r}".format(issn))
+        container = dict(
+            name=journal_title,
+            publisher=publisher,
+            issn=issn,   # TODO: ISSN-L lookup...
+        )
+
+        # the keyword must be `attrs`; find() treats any other keyword as a
+        # filter on an attribute of that name
+        doi = self._text(
+            article_meta.find("article-id", attrs={"pub-id-type": "doi"}))
+        if doi:
+            doi = doi.lower().strip()
+
+        title = self._text(article_meta.find("article-title"))
+        if title:
+            title = title.strip()
+            if title.endswith('.'):
+                title = title[:-1]
+
+        contribs = []
+        cgroup = article_meta.find("contrib-group")
+        if cgroup is not None:
+            for c in cgroup.find_all("contrib"):
+                given = self._text(c.find("given-names"))
+                surname = self._text(c.find("surname"))
+                if given and surname:
+                    name = "{} {}".format(given, surname)
+                else:
+                    # surname alone, or None when there is no surname
+                    name = surname or None
+                contribs.append(dict(
+                    role=c.get('contrib-type'),   # XXX: types? mapping?
+                    raw_name=name,
+                ))
+
+        release_year = None
+        release_date = None
+        pub_date = article_meta.find('pub-date')
+        if pub_date is not None and self._text(pub_date.year):
+            release_year = int(pub_date.year.string)
+            # a full date needs both month and day; otherwise keep the year only
+            if self._text(pub_date.month) and self._text(pub_date.day):
+                release_date = datetime.date(
+                    release_year,
+                    int(pub_date.month.string),
+                    int(pub_date.day.string))
+
+        volume = self._text(article_meta.volume) or None
+        issue = self._text(article_meta.issue) or None
+
+        # prefer the full range; fall back to the first page
+        pages = (self._text(article_meta.find("page-range"))
+                 or self._text(article_meta.fpage))
+
+        # scan every <custom-meta>, since "lang" need not be the first entry
+        language = None
+        for cm in article_meta.find_all("custom-meta"):
+            if self._text(cm.find("meta-name")) == "lang":
+                language = self._text(cm.find("meta-value"))
+                break
+
+        release_type = "article-journal"
+        marked = title or ""   # title may be None; test an empty string instead
+        if "[Abstract]" in marked:
+            release_type = "abstract"
+        elif "[Editorial" in marked:
+            release_type = "editorial"
+        elif "[Letter" in marked:
+            release_type = "letter"
+        elif "[Poem" in marked or "[Photograph" in marked:
+            release_type = None
+        elif marked.startswith("[") and marked.endswith("]"):
+            # strip brackets if that is all that is there (eg, translation or non-english)
+            title = marked[1:-1]
+
+        # everything in JSTOR is published
+        release_status = "published"
+
+        if extra_jstor:
+            extra['jstor'] = extra_jstor
+        if not extra:
+            extra = None
+
+        release = dict(
+            issn=issn, # not an entity field
+            title=title,
+            release_type=release_type,
+            release_status=release_status,
+            # stays None when the source has no complete date
+            release_date=release_date.isoformat() if release_date else None,
+            release_year=release_year,
+            doi=doi,
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=language,
+            # raw_name, role, raw_affiliation, extra
+            contribs=contribs,
+            #   name, type, publisher, issnl
+            #   extra: issnp, issne, original_name, languages, country
+            container=container,
+            extra=extra,
+        )
+        # TODO: fields not extracted yet: work_id, original_title, pmid, pmcid,
+        # isbn13, license_slug, abstracts (content, mimetype, lang) and refs
+        # (key, year, container_name, title, locator); candidate `extra` keys:
+        # withdrawn_date, translation_of, subtitle, aliases, container_name,
+        # group-title, pubmed retraction refs
+        return release
+
+if __name__=='__main__':
+    with open(sys.argv[1]) as handle:   # close the input even if parsing raises
+        JstorXmlParser().parse_file(handle)
diff --git a/python/tests/files/jstor-article-10.2307_111039.xml b/python/tests/files/jstor-article-10.2307_111039.xml
new file mode 100644
index 00000000..1a4c760d
--- /dev/null
+++ b/python/tests/files/jstor-article-10.2307_111039.xml
@@ -0,0 +1,58 @@
+<article xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xmlns:xlink="http://www.w3.org/1999/xlink"
+         xmlns:mml="http://www.w3.org/1998/Math/MathML"
+         dtd-version="1.0"
+         article-type="research-article">
+   <front>
+      <journal-meta>
+         <journal-id journal-id-type="jstor">abstpapecommroya</journal-id>
+         <journal-id journal-id-type="jstor">j100687</journal-id>
+         <journal-title-group>
+            <journal-title>Abstracts of the Papers Communicated to the Royal Society of London</journal-title>
+         </journal-title-group>
+         <publisher>
+            <publisher-name>The Royal Society</publisher-name>
+         </publisher>
+         <issn pub-type="ppub">03650855</issn>
+      </journal-meta>
+      <article-meta>
+         <article-id pub-id-type="jstor">111039</article-id>
+         <title-group>
+            <article-title>On the Universal Law of Attraction, Including that of Gravitation, as a Particular Case of Approximation Deducible from the Principle that Equal and Similar Particles of Matter Move Similarly, Relatively to Each other. [Abstract]</article-title>
+         </title-group>
+         <contrib-group>
+            <contrib contrib-type="author">
+               <string-name>
+                  <given-names>John Kinnersley</given-names>
+                  <surname>Smythies</surname>
+               </string-name>
+            </contrib>
+         </contrib-group>
+         <pub-date>
+            <day>1</day>
+            <month>1</month>
+            <year>1843</year>
+         
+            <day>1</day>
+            <month>1</month>
+            <year>1850</year>
+         </pub-date>
+         <volume>5</volume>
+         <issue/>
+         <issue-id>i207047</issue-id>
+         <fpage>831</fpage>
+         <lpage>832</lpage>
+         <page-range>831-832</page-range>
+         <permissions>
+            <copyright-statement/>
+         </permissions>
+         <self-uri xlink:href="http://www.jstor.org/stable/111039"/>
+         <custom-meta-group>
+            <custom-meta>
+               <meta-name>lang</meta-name>
+               <meta-value>eng</meta-value>
+            </custom-meta>
+         </custom-meta-group>
+      </article-meta>
+   </front>
+</article>
-- 
cgit v1.2.3