diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 17:35:36 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 3ec275c7d78aa261027f35c26366a382c5dd7a6c (patch) | |
tree | 9356ac9a145a4709c0a8339eb751f4c4d7840936 | |
parent | 6fc6232250241b3bfdddb5e62501a26dfaac6827 (diff) | |
download | fatcat-3ec275c7d78aa261027f35c26366a382c5dd7a6c.tar.gz fatcat-3ec275c7d78aa261027f35c26366a382c5dd7a6c.zip |
basic arxivraw XML parser
-rw-r--r-- | python/parse_arxivraw_xml.py | 197 | ||||
-rw-r--r-- | python/tests/files/arxivraw_1810.09584.xml | 31 |
2 files changed, 228 insertions, 0 deletions
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
new file mode 100644
index 00000000..e2fab510
--- /dev/null
+++ b/python/parse_arxivraw_xml.py
@@ -0,0 +1,197 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+from pylatexenc.latex2text import LatexNodes2Text
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    """Split an arXiv author string ("A, B and C") into LaTeX-decoded names."""
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    last = authors[-1].split(" and ")  # the final two names are joined by " and "
+    if len(last) == 2:
+        authors[-1] = last[0]
+        authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+    """Check comma and " and " splitting plus LaTeX accent decoding."""
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+class ArxivRawXmlParser():
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self):
+        pass
+
+    def parse_file(self, handle):
+        """Parse every <record> in the XML handle and print each result as JSON."""
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over records, call parse_record on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+
+
+    def parse_record(self, record):
+        """Convert one <record> into (base_id, versions): one dict per <version>."""
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [dict(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()  # tag is hyphenated
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            # XXX: convert URL to slug
+            license_slug = metadata.license.string.strip()
+        abstracts = None
+        if metadata.abstract and metadata.abstract.string:
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(dict(content=abst, mime=mime, lang="en"))
+            if orig:
+                abstracts.append(dict(content=orig, mime=mime))
+
+        if extra_arxiv:
+            extra['arxiv'] = extra_arxiv
+        if not extra:
+            extra = None
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z")
+            versions.append(dict(
+                work_id=None,
+                title=title,
+                #original_title
+                release_type=release_type,  # derived above from journal-ref / report-no
+                release_status='submitted', # XXX: source_type?
+                release_date=release_date.isoformat() + "Z",
+                release_year=release_date.year,
+                arxiv_id=arxiv_id,
+                #doi (see below)
+                #pmid
+                #pmcid
+                #isbn13     # never in Article
+                #volume
+                #issue
+                #pages
+                #publisher
+                language=lang,
+                #license_slug   # not in MEDLINE
+
+                # content, mimetype, lang
+                abstracts=abstracts,
+
+                # raw_name, role, raw_affiliation, extra
+                contribs=contribs,
+
+                #   name, type, publisher, issnl
+                #   extra: issnp, issne, original_name, languages, country
+                #container=container,   # very little/none; resolve via DOI?
+
+                # extra:
+                #   withdrawn_date
+                #   translation_of
+                #   subtitle
+                #   aliases
+                #   container_name
+                #   group-title
+                #   pubmed: retraction refs
+                extra=extra,
+            ))
+
+        # only apply DOI to most recent version (HACK)
+        if doi and versions:
+            versions[-1]['doi'] = doi
+            versions[-1]['release_status'] = "published"
+        return base_id, versions
+
+if __name__=='__main__':
+    parser = ArxivRawXmlParser()
+    with open(sys.argv[1]) as handle:
+        parser.parse_file(handle)
diff --git a/python/tests/files/arxivraw_1810.09584.xml b/python/tests/files/arxivraw_1810.09584.xml
new file mode 100644
index 00000000..55ce381f
--- /dev/null
+++ b/python/tests/files/arxivraw_1810.09584.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
+<responseDate>2019-03-05T23:10:23Z</responseDate>
+<request verb="GetRecord" identifier="oai:arXiv.org:1810.09584" metadataPrefix="arXivRaw">http://export.arxiv.org/oai2</request>
+<GetRecord>
+<record>
+<header>
+ <identifier>oai:arXiv.org:1810.09584</identifier>
+ <datestamp>2019-01-15</datestamp>
+ <setSpec>physics:cond-mat</setSpec>
+ <setSpec>physics:physics</setSpec>
+</header>
+<metadata>
+ <arXivRaw xmlns="http://arxiv.org/OAI/arXivRaw/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://arxiv.org/OAI/arXivRaw/ http://arxiv.org/OAI/arXivRaw.xsd">
+ <id>1810.09584</id><submitter>\'Edgar Rold\'an</submitter><version version="v1"><date>Mon, 22 Oct 2018 22:41:50 GMT</date><size>401kb</size><source_type>D</source_type></version><version version="v2"><date>Sun, 13 Jan 2019 11:17:09 GMT</date><size>669kb</size><source_type>D</source_type></version><title>Martingale theory for housekeeping heat</title><authors>Raphael Chetrite, Shamik Gupta, Izaak Neri and \'Edgar Rold\'an</authors><categories>cond-mat.stat-mech physics.bio-ph physics.data-an</categories><comments>7 pages, 2 figures</comments><journal-ref>EPL 124,60006 (2018)</journal-ref><doi>10.1209/0295-5075/124/60006</doi><license>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</license><abstract> The housekeeping heat is the energy exchanged between a system and its
+environment in a nonequilibrium process that results from the violation of
+detailed balance. We describe fluctuations of the housekeeping heat in
+mesoscopic systems using the theory of martingales, a mathematical framework
+widely used in probability theory and finance. We show that the exponentiated
+housekeeping heat (in units of $k_{\rm B}T$, with $k_{\rm B}$ the Boltzmann
+constant and $T$ the temperature) of a Markovian nonequilibrium process under
+arbitrary time-dependent driving is a martingale process. From this result, we
+derive universal equalities and inequalities for the statistics of
+stopping-times and suprema of the housekeeping heat. We test our results with
+numerical simulations of a system driven out of equilibrium and described by
+Langevin dynamics.
+</abstract></arXivRaw>
+</metadata>
+</record>
+</GetRecord>
+</OAI-PMH>