
import sys
import json
import datetime
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from pylatexenc.latex2text import LatexNodes2Text


latex2text = LatexNodes2Text()

def parse_arxiv_authors(raw):
    """Parse an arXivRaw <authors> string into a list of unicode names.

    Author names are comma-separated, with the final pair usually joined
    by " and "; LaTeX escapes (accents etc.) are decoded to plain text.
    Returns an empty list for a missing/empty input.
    """
    if not raw:
        return []
    authors = raw.split(', ')
    if authors:
        # split a trailing "X and Y" into two separate names
        last = authors[-1].split(" and ")
        if len(last) == 2:
            authors[-1] = last[0]
            authors.append(last[1])
    authors = [latex2text.latex_to_text(a).strip() for a in authors]
    return authors

def test_parse_arxiv_authors():

    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
        "Raphael Chetrite",
        "Shamik Gupta",
        "Izaak Neri",
        "Édgar Roldán",
    ]
    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
        "Izaak Neri",
        "Édgar Roldán",
    ]
    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
        "Raphael Chetrite Shamik Gupta",
    ]

class ArxivRawXmlParser():
    """
    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities

    TODO: this will require a special importer that keeps works together
    TODO: arxiv_id lookup in API (rust) with no version specified should select
        the "most recent" version; can be a simple sort?
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """Parse an OAI-PMH XML file handle; print one JSON result per <record>."""

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_record on each
        for article in soup.find_all("record"):
            resp = self.parse_record(article)
            print(json.dumps(resp))

    def parse_record(self, record):
        """Convert a single arXivRaw <record> into (base_id, [version dicts]).

        One dict is emitted per arXiv <version> element; the DOI (if any)
        is only attached to the most recent version (see HACK note below).
        """

        metadata = record.arXivRaw
        extra = dict()
        extra_arxiv = dict()

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = metadata.doi.string.lower().strip()
            assert doi.startswith('10.')
        title = latex2text.latex_to_text(metadata.title.string)
        authors = parse_arxiv_authors(metadata.authors.string)
        contribs = [dict(raw_name=a, role='author') for a in authors]

        lang = "en"     # the vast majority in english
        if metadata.comments and metadata.comments.string:
            comments = metadata.comments.string.strip()
            extra_arxiv['comments'] = comments
            # crude language detection from the free-text comments field
            if 'in french' in comments.lower():
                lang = 'fr'
            elif 'in spanish' in comments.lower():
                lang = 'es'
            elif 'in portuguese' in comments.lower():
                lang = 'pt'
            elif 'in hindi' in comments.lower():
                lang = 'hi'
            elif 'in japanese' in comments.lower():
                lang = 'ja'
            elif 'in german' in comments.lower():
                lang = 'de'
            elif 'simplified chinese' in comments.lower():
                lang = 'zh'
            elif 'in russian' in comments.lower():
                lang = 'ru'
            # more languages?

        release_type = "article-journal"

        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
            journal_ref = metadata.find('journal-ref').string.strip()
            extra_arxiv['journal_ref'] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                release_type = "conference-paper"
        if metadata.find('report-no') and metadata.find('report-no').string:
            extra['number'] = metadata.find('report-no').string.strip()
            release_type = "report"
        if metadata.find('acm-class') and metadata.find('acm-class').string:
            # fix: was metadata.find('acm_class') (underscore), which returns
            # None for arXivRaw records and raised AttributeError on .string
            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
        if metadata.categories and metadata.categories.string:
            extra_arxiv['categories'] = metadata.categories.string.split()
        license_slug = None
        if metadata.license and metadata.license.string:
            # XXX: convert URL to slug
            license_slug = metadata.license.string.strip()
        abstracts = None
        if metadata.abstract:
            abstracts = []
            abst = metadata.abstract.string.strip()
            orig = None
            if '-----' in abst:
                # arXiv convention: translated abstract, then "-----", then original
                both = abst.split('-----')
                abst = both[0].strip()
                orig = both[1].strip()
            # fix: second operand was the undefined name 'abstr' (NameError)
            if '$' in abst or '{' in abst:
                # looks like LaTeX markup; store a plain-text rendering first
                mime = "application/x-latex"
                abst_plain = latex2text.latex_to_text(abst)
                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(dict(content=abst, mime=mime, lang="en"))
            if orig:
                abstracts.append(dict(content=orig, mime=mime))

        if extra_arxiv:
            extra['arxiv'] = extra_arxiv
        if not extra:
            extra = None

        versions = []
        for version in metadata.find_all('version'):
            arxiv_id = base_id + version['version']
            release_date = version.date.string.strip()
            # e.g. "Mon, 22 Oct 2018 22:41:50 GMT"
            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z")
            versions.append(dict(
                work_id=None,
                title=title,
                #original_title
                # fix: was hardcoded "article-journal", discarding the
                # conference-paper/report detection above
                release_type=release_type,
                release_status='submitted', # XXX: source_type?
                release_date=release_date.isoformat() + "Z",
                release_year=release_date.year,
                arxiv_id=arxiv_id,
                #doi (see below)
                #pmid
                #pmcid
                #isbn13 # never in Article
                #volume
                #issue
                #pages
                #publisher
                language=lang,
                license_slug=license_slug,

                # content, mimetype, lang
                abstracts=abstracts,

                # raw_name, role, raw_affiliation, extra
                contribs=contribs,

                # name, type, publisher, issnl
                # extra: issnp, issne, original_name, languages, country
                #container=container, # very little/none; resolve via DOI?

                # extra:
                #   withdrawn_date
                #   translation_of
                #   subtitle
                #   aliases
                #   container_name
                #   group-title
                #   pubmed: retraction refs
                extra=extra,
            ))

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1]['doi'] = doi
            versions[-1]['release_status'] = "published"
        return base_id, versions

if __name__ == '__main__':
    parser = ArxivRawXmlParser()
    parser.parse_file(open(sys.argv[1]))
+ oai:arXiv.org:1810.09584 + 2019-01-15 + physics:cond-mat + physics:physics +
+ + + 1810.09584\'Edgar Rold\'anMon, 22 Oct 2018 22:41:50 GMT401kbDSun, 13 Jan 2019 11:17:09 GMT669kbDMartingale theory for housekeeping heatRaphael Chetrite, Shamik Gupta, Izaak Neri and \'Edgar Rold\'ancond-mat.stat-mech physics.bio-ph physics.data-an7 pages, 2 figuresEPL 124,60006 (2018)10.1209/0295-5075/124/60006http://arxiv.org/licenses/nonexclusive-distrib/1.0/ The housekeeping heat is the energy exchanged between a system and its +environment in a nonequilibrium process that results from the violation of +detailed balance. We describe fluctuations of the housekeeping heat in +mesoscopic systems using the theory of martingales, a mathematical framework +widely used in probability theory and finance. We show that the exponentiated +housekeeping heat (in units of $k_{\rm B}T$, with $k_{\rm B}$ the Boltzmann +constant and $T$ the temperature) of a Markovian nonequilibrium process under +arbitrary time-dependent driving is a martingale process. From this result, we +derive universal equalities and inequalities for the statistics of +stopping-times and suprema of the housekeeping heat. We test our results with +numerical simulations of a system driven out of equilibrium and described by +Langevin dynamics. + + +
+
+
-- cgit v1.2.3