diff options
-rw-r--r-- | python/parse_jstor_xml.py | 177 | ||||
-rw-r--r-- | python/tests/files/jstor-article-10.2307_111039.xml | 58 |
2 files changed, 235 insertions, 0 deletions
import sys
import json
import datetime
from bs4 import BeautifulSoup
from bs4.element import NavigableString


def _tag_text(tag):
    """Return the stripped text content of a tag, or None if missing/empty.

    Uses get_text() rather than .string because .string is None whenever the
    tag has more than one child (eg, a title containing inline markup).
    """
    if tag is None:
        return None
    return tag.get_text().strip() or None


def _normalize_issn(raw):
    """Return an ISSN in hyphenated NNNN-NNNN form (None passes through).

    Raises:
        ValueError: if the value is neither 8 characters (un-hyphenated) nor
            9 characters (assumed to be already hyphenated).
    """
    if raw is None:
        return None
    if len(raw) == 8:
        return "{}-{}".format(raw[0:4], raw[4:8])
    if len(raw) != 9:
        raise ValueError("unexpected ISSN format: {!r}".format(raw))
    return raw


def _classify_title(title):
    """Derive (release_type, title) from bracketed markers in the title.

    A title that is entirely wrapped in square brackets (eg, a translated or
    non-english title) has the brackets stripped. A missing title keeps the
    default release type.
    """
    release_type = "article-journal"
    if title is None:
        return release_type, title
    if "[Abstract]" in title:
        release_type = "abstract"
    elif "[Editorial" in title:
        release_type = "editorial"
    elif "[Letter" in title:
        release_type = "letter"
    elif "[Poem" in title or "[Photograph" in title:
        release_type = None
    elif title.startswith("[") and title.endswith("]"):
        # strip brackets if that is all that is there
        title = title[1:-1]
    return release_type, title


class JstorXmlParser():
    """
    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """Parse every <article> in an XML file and print one JSON object each.

        Args:
            handle: open file object (or markup string) passed to
                BeautifulSoup's "xml" parser.
        """
        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_article on each
        for article in soup.find_all("article"):
            resp = self.parse_article(article)
            print(json.dumps(resp))

    def parse_article(self, article):
        """Convert a single <article> element into a JSON-serializable dict.

        Assumes the article has a <front> element containing <journal-meta>
        and <article-meta>; individual fields inside those are optional and
        come back as None when absent.

        Args:
            article: BeautifulSoup tag for one <article> element.

        Returns:
            dict of release metadata, with nested 'contribs' (list of dicts)
            and 'container' (dict) entries. 'release_date' is an ISO date
            string, or None when the pub-date lacks a month or day.

        Raises:
            ValueError: on a malformed ISSN, a non-numeric date component, or
                an impossible calendar date.
        """
        journal_meta = article.front.find("journal-meta")
        article_meta = article.front.find("article-meta")

        extra = dict()
        extra_jstor = dict()

        journal_title = _tag_text(journal_meta.find("journal-title"))
        publisher = _tag_text(journal_meta.find("publisher-name"))
        issn = _normalize_issn(_tag_text(journal_meta.find("issn")))
        container = dict(
            name=journal_title,
            publisher=publisher,
            issn=issn,  # TODO: ISSN-L lookup...
        )

        # NOTE: the keyword is 'attrs'; any other keyword is interpreted as a
        # filter on an XML attribute of that name and would never match.
        doi = _tag_text(
            article_meta.find("article-id", attrs={"pub-id-type": "doi"}))
        if doi:
            doi = doi.lower()

        title = _tag_text(article_meta.find("article-title"))
        if title and title.endswith('.'):
            title = title[:-1]

        contribs = []
        cgroup = article_meta.find("contrib-group")
        if cgroup:
            for c in cgroup.find_all("contrib"):
                given = _tag_text(c.find("given-names"))
                surname = _tag_text(c.find("surname"))
                if given and surname:
                    name = "{} {}".format(given, surname)
                elif surname:
                    name = surname
                else:
                    name = None
                contribs.append(dict(
                    # .get(): a contrib without the attribute yields None
                    # instead of raising KeyError
                    role=c.get('contrib-type'),  # XXX: types? mapping?
                    raw_name=name,
                ))

        release_year = None
        release_date = None
        pub_date = article_meta.find('pub-date')
        if pub_date is not None:
            year = _tag_text(pub_date.find("year"))
            month = _tag_text(pub_date.find("month"))
            day = _tag_text(pub_date.find("day"))
            if year:
                release_year = int(year)
                if month and day:
                    release_date = datetime.date(
                        release_year, int(month), int(day))

        volume = _tag_text(article_meta.find("volume"))
        issue = _tag_text(article_meta.find("issue"))

        pages = _tag_text(article_meta.find("page-range"))
        if pages is None:
            pages = _tag_text(article_meta.find("fpage"))

        # Scan all custom-meta entries; the 'lang' one need not come first,
        # and there may be none at all.
        language = None
        for cm in article_meta.find_all("custom-meta"):
            if _tag_text(cm.find("meta-name")) == "lang":
                language = _tag_text(cm.find("meta-value"))
                break

        release_type, title = _classify_title(title)

        # everything in JSTOR is published
        release_status = "published"

        if extra_jstor:
            extra['jstor'] = extra_jstor
        if not extra:
            extra = None

        release = dict(
            issn=issn,  # not an entity field
            #work_id
            title=title,
            #original_title
            release_type=release_type,
            release_status=release_status,
            # only year-month-day dates are emitted; year-only stays None
            release_date=release_date.isoformat() if release_date else None,
            release_year=release_year,
            doi=doi,
            #pmid
            #pmcid
            #isbn13 # TODO: ?
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=language,
            #license_slug # TODO: ?

            # content, mimetype, lang
            #abstracts=abstracts,

            # raw_name, role, raw_affiliation, extra
            contribs=contribs,

            # key, year, container_name, title, locator
            # extra: volume, authors, issue, publisher, identifiers
            #refs=refs,

            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container=container,

            # extra:
            #   withdrawn_date
            #   translation_of
            #   subtitle
            #   aliases
            #   container_name
            #   group-title
            #   pubmed: retraction refs
            extra=extra,
        )
        return release


if __name__ == '__main__':
    parser = JstorXmlParser()
    # context manager so the input file is closed even if parsing fails
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)
[Abstract]</article-title> + </title-group> + <contrib-group> + <contrib contrib-type="author"> + <string-name> + <given-names>John Kinnersley</given-names> + <surname>Smythies</surname> + </string-name> + </contrib> + </contrib-group> + <pub-date> + <day>1</day> + <month>1</month> + <year>1843</year> + + <day>1</day> + <month>1</month> + <year>1850</year> + </pub-date> + <volume>5</volume> + <issue/> + <issue-id>i207047</issue-id> + <fpage>831</fpage> + <lpage>832</lpage> + <page-range>831-832</page-range> + <permissions> + <copyright-statement/> + </permissions> + <self-uri xlink:href="http://www.jstor.org/stable/111039"/> + <custom-meta-group> + <custom-meta> + <meta-name>lang</meta-name> + <meta-value>eng</meta-value> + </custom-meta> + </custom-meta-group> + </article-meta> + </front> +</article> |