diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 18:42:43 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 | 
| commit | ee393b537f3164ad25c9337b658db81192b25629 (patch) | |
| tree | c1962719963942d56b8194694567d25b03dc4f5b | |
| parent | a987af927686725f7778475f4c383d59c8c494bf (diff) | |
| download | fatcat-ee393b537f3164ad25c9337b658db81192b25629.tar.gz fatcat-ee393b537f3164ad25c9337b658db81192b25629.zip | |
basic JSTOR XML parser
| -rw-r--r-- | python/parse_jstor_xml.py | 177 | ||||
| -rw-r--r-- | python/tests/files/jstor-article-10.2307_111039.xml | 58 | 
2 files changed, 235 insertions, 0 deletions
# python/parse_jstor_xml.py

import sys
import json
import datetime
from bs4 import BeautifulSoup
from bs4.element import NavigableString


def _text(tag):
    """Return the whitespace-stripped text of ``tag``, or None.

    None is returned both when ``tag`` itself is None (element not found)
    and when the element has no text (eg, ``<issue/>``), so callers only
    have one "missing" case to handle.
    """
    if tag is None:
        return None
    return tag.get_text().strip() or None


class JstorXmlParser():
    """
    Converts JSTOR bulk XML metadata (eg, from their Early Journals
    Collection) into one plain dict per <article> element.
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """Parse every <article> in ``handle`` and print it as a JSON line.

        ``handle`` is anything BeautifulSoup accepts as markup (an open
        file or a string). One JSON object is written to stdout per
        article, in document order.
        """
        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_article on each
        for article in soup.find_all("article"):
            resp = self.parse_article(article)
            print(json.dumps(resp))

    def parse_article(self, article):
        """Convert a single <article> element into a release dict.

        ``article`` must contain a <front> element holding <journal-meta>
        and <article-meta>; an AttributeError is raised otherwise. All
        other elements are optional and yield None when absent or empty.

        Returns a JSON-serializable dict with the keys: issn, title,
        release_type, release_status, release_date (ISO string or None),
        release_year, doi, volume, issue, pages, publisher, language,
        contribs, container and extra.

        Raises AssertionError if the ISSN is neither 8 nor 9 characters
        long, and ValueError if the publication year/month/day are not
        integers or do not form a valid calendar date.
        """
        journal_meta = article.front.find("journal-meta")
        article_meta = article.front.find("article-meta")

        extra = dict()
        extra_jstor = dict()

        journal_title = _text(journal_meta.find("journal-title"))
        publisher = _text(journal_meta.find("publisher-name"))
        issn = _text(journal_meta.find("issn"))
        if issn:
            if len(issn) == 8:
                # bare 8-character form; insert the conventional hyphen
                issn = "{}-{}".format(issn[0:4], issn[4:8])
            else:
                assert len(issn) == 9
        container = dict(
            name=journal_title,
            publisher=publisher,
            issn=issn,   # TODO: ISSN-L lookup...
        )

        # NOTE: the keyword is `attrs`, not `attr`
        doi = _text(article_meta.find("article-id", attrs={"pub-id-type": "doi"}))
        if doi:
            doi = doi.lower()

        title = _text(article_meta.find("article-title"))
        if title and title.endswith('.'):
            title = title[:-1]

        contribs = []
        cgroup = article_meta.find("contrib-group")
        if cgroup is not None:
            for c in cgroup.find_all("contrib"):
                given = _text(c.find("given-names"))
                surname = _text(c.find("surname"))
                if given and surname:
                    name = "{} {}".format(given, surname)
                elif surname:
                    name = surname
                else:
                    name = None
                contribs.append(dict(
                    # .get() so a missing attribute gives None, not KeyError
                    role=c.get('contrib-type'),   # XXX: types? mapping?
                    raw_name=name,
                ))

        release_year = None
        release_date = None
        pub_date = article_meta.find('pub-date')
        if pub_date is not None:
            year = _text(pub_date.year)
            month = _text(pub_date.month)
            day = _text(pub_date.day)
            if year:
                release_year = int(year)
                # a full date is only available when month and day are given
                if month and day:
                    release_date = datetime.date(
                        release_year, int(month), int(day))

        volume = _text(article_meta.volume)
        issue = _text(article_meta.issue)

        # prefer the explicit range; fall back to the first page only
        pages = _text(article_meta.find("page-range"))
        if pages is None:
            pages = _text(article_meta.fpage)

        # language is stored as a custom-meta name/value pair; check every
        # pair instead of assuming "lang" comes first (or exists at all)
        language = None
        for cm in article_meta.find_all("custom-meta"):
            if _text(cm.find("meta-name")) == "lang":
                language = _text(cm.find("meta-value"))
                break

        # JSTOR flags non-article content with a bracketed tag in the title
        release_type = "article-journal"
        if title:
            if "[Abstract]" in title:
                release_type = "abstract"
            elif "[Editorial" in title:
                release_type = "editorial"
            elif "[Letter" in title:
                release_type = "letter"
            elif "[Poem" in title or "[Photograph" in title:
                release_type = None
            elif title.startswith("[") and title.endswith("]"):
                # strip brackets if that is all that is there (eg,
                # translation or non-english)
                title = title[1:-1]

        # everything in JSTOR is published
        release_status = "published"

        if extra_jstor:
            extra['jstor'] = extra_jstor
        if not extra:
            extra = None

        release = dict(
            issn=issn, # not an entity field
            #work_id
            title=title,
            #original_title
            release_type=release_type,
            release_status=release_status,
            # date objects are not JSON-serializable; emit ISO text or None
            release_date=release_date.isoformat() if release_date else None,
            release_year=release_year,
            doi=doi,
            #pmid
            #pmcid
            #isbn13     # TODO: ?
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=language,
            #license_slug   # TODO: ?

            # content, mimetype, lang
            #abstracts=abstracts,

            # raw_name, role, raw_affiliation, extra
            contribs=contribs,

            # key, year, container_name, title, locator
            # extra: volume, authors, issue, publisher, identifiers
            #refs=refs,

            #   name, type, publisher, issnl
            #   extra: issnp, issne, original_name, languages, country
            container=container,

            # extra:
            #   withdrawn_date
            #   translation_of
            #   subtitle
            #   aliases
            #   container_name
            #   group-title
            #   pubmed: retraction refs
            extra=extra,
        )
        return release


if __name__=='__main__':
    parser = JstorXmlParser()
    # context manager so the input file is closed when parsing finishes
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)


# ---------------------------------------------------------------------------
# Sample input record, added in the same commit as
# python/tests/files/jstor-article-10.2307_111039.xml and reproduced here as
# a reference for the element names parse_article() reads:
#
# <article xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#          xmlns:xlink="http://www.w3.org/1999/xlink"
#          xmlns:mml="http://www.w3.org/1998/Math/MathML"
#          dtd-version="1.0"
#          article-type="research-article">
#    <front>
#       <journal-meta>
#          <journal-id journal-id-type="jstor">abstpapecommroya</journal-id>
#          <journal-id journal-id-type="jstor">j100687</journal-id>
#          <journal-title-group>
#             <journal-title>Abstracts of the Papers Communicated to the Royal Society of London</journal-title>
#          </journal-title-group>
#          <publisher>
#             <publisher-name>The Royal Society</publisher-name>
#          </publisher>
#          <issn pub-type="ppub">03650855</issn>
#       </journal-meta>
#       <article-meta>
#          <article-id pub-id-type="jstor">111039</article-id>
#          <title-group>
#             <article-title>On the Universal Law of Attraction, Including that of Gravitation, as a Particular Case of Approximation Deducible from the Principle that Equal and Similar Particles of Matter Move Similarly, Relatively to Each other. [Abstract]</article-title>
#          </title-group>
#          <contrib-group>
#             <contrib contrib-type="author">
#                <string-name>
#                   <given-names>John Kinnersley</given-names>
#                   <surname>Smythies</surname>
#                </string-name>
#             </contrib>
#          </contrib-group>
#          <pub-date>
#             <day>1</day>
#             <month>1</month>
#             <year>1843</year>
#
#             <day>1</day>
#             <month>1</month>
#             <year>1850</year>
#          </pub-date>
#          <volume>5</volume>
#          <issue/>
#          <issue-id>i207047</issue-id>
#          <fpage>831</fpage>
#          <lpage>832</lpage>
#          <page-range>831-832</page-range>
#          <permissions>
#             <copyright-statement/>
#          </permissions>
#          <self-uri xlink:href="http://www.jstor.org/stable/111039"/>
#          <custom-meta-group>
#             <custom-meta>
#                <meta-name>lang</meta-name>
#                <meta-value>eng</meta-value>
#             </custom-meta>
#          </custom-meta-group>
#       </article-meta>
#    </front>
# </article>
