2 files changed, 228 insertions, 0 deletions
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
new file mode 100644
index 00000000..e2fab510
--- /dev/null
+++ b/python/parse_arxivraw_xml.py
@@ -0,0 +1,197 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+from pylatexenc.latex2text import LatexNodes2Text
+
+# module-level LatexNodes2Text instance, reused for every LaTeX-to-text conversion below
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    """Split a raw arXiv author string into a list of plain-text names.
+
+    Splits on ", " and one final " and " (so "A, B, and C" works); decodes LaTeX.
+    """
+    if not raw:
+        return []
+    *head, last = raw.split(', ')
+    last = last[len("and "):] if last.startswith("and ") else last  # after Oxford comma
+    tail = last.split(" and ") if last.count(" and ") == 1 else [last]
+    return [latex2text.latex_to_text(a).strip() for a in head + tail]
+
+def test_parse_arxiv_authors():
+    """Check comma/"and" splitting and LaTeX accent decoding of author strings."""
+    full = "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an"
+    assert parse_arxiv_authors(full) == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    # "and" alone separates a two-author list
+    pair = parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an")
+    assert pair == ["Izaak Neri", "Édgar Roldán"]
+    # no recognised separator: the whole string is kept as a single name
+    unsplit = "Raphael Chetrite Shamik Gupta"
+    assert parse_arxiv_authors(unsplit) == [unsplit]
+
+class ArxivRawXmlParser():
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    # (phrase, language code) pairs matched case-insensitively against the
+    # <comments> text; the first phrase found wins, otherwise "en" is assumed
+    COMMENT_LANGS = (
+        ('in french', 'fr'),
+        ('in spanish', 'es'),
+        ('in portuguese', 'pt'),
+        ('in hindi', 'hi'),
+        ('in japanese', 'ja'),
+        ('in german', 'de'),
+        ('simplified chinese', 'zh'),
+        ('in russian', 'ru'),
+        # more languages?
+    )
+
+    def __init__(self):
+        pass
+
+    def parse_file(self, handle):
+        """
+        Parses an XML document and prints one JSON line per <record> element.
+
+        handle: markup to parse, passed straight to BeautifulSoup
+                (an open file when run as a script)
+        """
+        soup = BeautifulSoup(handle, "xml")
+        for record in soup.find_all("record"):
+            print(json.dumps(self.parse_record(record)))
+
+
+    def parse_record(self, record):
+        """
+        Converts one <record> element into a (base_id, versions) tuple.
+
+        record: BeautifulSoup tag containing an <arXivRaw> element
+
+        Returns the version-less arXiv id and a list with one release dict
+        per <version> element, in document order. Title, contribs, abstracts,
+        language, release_type and extra are shared by every version; only
+        arxiv_id and the release date differ. A DOI, when present, is attached
+        to the last version only, which is then marked "published".
+
+        Raises AssertionError if a DOI is present but does not start with "10."
+        Raises ValueError (from strptime) if a version date cannot be parsed.
+        """
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [dict(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            lowered = comments.lower()
+            for phrase, code in ArxivRawXmlParser.COMMENT_LANGS:
+                if phrase in lowered:
+                    lang = code
+                    break
+
+        release_type = "article-journal"
+        journal_ref = metadata.find('journal-ref')
+        if journal_ref and journal_ref.string:
+            journal_ref = journal_ref.string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        report_no = metadata.find('report-no')
+        if report_no and report_no.string:
+            extra['number'] = report_no.string.strip()
+            release_type = "report"     # wins over "conference-paper"
+        # the XML tag is hyphenated; only the output key uses an underscore
+        acm_class = metadata.find('acm-class')
+        if acm_class and acm_class.string:
+            extra_arxiv['acm_class'] = acm_class.string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        # TODO: <license> is not emitted yet; its URL must first be converted
+        #       to a license_slug
+
+        abstracts = None
+        # .string is None for an empty tag, so check it before calling strip()
+        if metadata.abstract and metadata.abstract.string:
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            # text after a dashed divider becomes a second abstract with no
+            # lang set (presumably the original-language version; confirm)
+            abst, _, orig = abst.partition('-----')
+            abst = abst.strip()
+            orig = orig.lstrip('-').strip()    # divider may be longer than 5 dashes
+            if '$' in abst or '{' in abst:
+                # keep the LaTeX source and also add a plain-text rendering
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(dict(content=abst, mime=mime, lang="en"))
+            if orig:
+                abstracts.append(dict(content=orig, mime=mime))
+
+        if extra_arxiv:
+            extra['arxiv'] = extra_arxiv
+        if not extra:
+            extra = None
+
+        versions = []
+        for version in metadata.find_all('version'):
+            # dates look like "Mon, 22 Oct 2018 22:41:50 GMT"
+            release_date = datetime.datetime.strptime(
+                version.date.string.strip(), "%a, %d %b %Y %H:%M:%S %Z")
+            versions.append(dict(
+                work_id=None,
+                title=title,
+                release_type=release_type,
+                release_status='submitted', # XXX: source_type?
+                # strptime returns a naive datetime here, so "Z" is appended by hand
+                release_date=release_date.isoformat() + "Z",
+                release_year=release_date.year,
+                arxiv_id=base_id + version['version'],
+                language=lang,
+                # content, mime, lang
+                abstracts=abstracts,
+                # raw_name, role
+                contribs=contribs,
+                extra=extra,
+                # not populated from arXivRaw: original_title, pmid, pmcid,
+                # isbn13, volume, issue, pages, publisher, license_slug,
+                # container (very little/none; resolve via DOI?)
+                # doi is added below, to the last version only
+            ))
+
+        # only apply DOI to most recent version (HACK); nothing to attach it to
+        # when the record has no <version> elements
+        if doi and versions:
+            versions[-1]['doi'] = doi
+            versions[-1]['release_status'] = "published"
+        return base_id, versions
+
+if __name__=='__main__':
+    with open(sys.argv[1]) as handle:    # close the input file once parsing finishes
+        ArxivRawXmlParser().parse_file(handle)
diff --git a/python/tests/files/arxivraw_1810.09584.xml b/python/tests/files/arxivraw_1810.09584.xml
new file mode 100644
index 00000000..55ce381f
--- /dev/null
+++ b/python/tests/files/arxivraw_1810.09584.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
+<responseDate>2019-03-05T23:10:23Z</responseDate>
+<request verb="GetRecord" identifier="oai:arXiv.org:1810.09584" metadataPrefix="arXivRaw">http://export.arxiv.org/oai2</request>
+<GetRecord>
+<record>
+<header>
+ <identifier>oai:arXiv.org:1810.09584</identifier>
+ <datestamp>2019-01-15</datestamp>
+ <setSpec>physics:cond-mat</setSpec>
+ <setSpec>physics:physics</setSpec>
+</header>
+<metadata>
+ <arXivRaw xmlns="http://arxiv.org/OAI/arXivRaw/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://arxiv.org/OAI/arXivRaw/ http://arxiv.org/OAI/arXivRaw.xsd">
+ <id>1810.09584</id><submitter>\'Edgar Rold\'an</submitter><version version="v1"><date>Mon, 22 Oct 2018 22:41:50 GMT</date><size>401kb</size><source_type>D</source_type></version><version version="v2"><date>Sun, 13 Jan 2019 11:17:09 GMT</date><size>669kb</size><source_type>D</source_type></version><title>Martingale theory for housekeeping heat</title><authors>Raphael Chetrite, Shamik Gupta, Izaak Neri and \'Edgar Rold\'an</authors><categories>cond-mat.stat-mech physics.bio-ph physics.data-an</categories><comments>7 pages, 2 figures</comments><journal-ref>EPL 124,60006 (2018)</journal-ref><doi>10.1209/0295-5075/124/60006</doi><license>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</license><abstract>  The housekeeping heat is the energy exchanged between a system and its
+environment in a nonequilibrium process that results from the violation of
+detailed balance. We describe fluctuations of the housekeeping heat in
+mesoscopic systems using the theory of martingales, a mathematical framework
+widely used in probability theory and finance. We show that the exponentiated
+housekeeping heat (in units of $k_{\rm B}T$, with $k_{\rm B}$ the Boltzmann
+constant and $T$ the temperature) of a Markovian nonequilibrium process under
+arbitrary time-dependent driving is a martingale process. From this result, we
+derive universal equalities and inequalities for the statistics of
+stopping-times and suprema of the housekeeping heat. We test our results with
+numerical simulations of a system driven out of equilibrium and described by
+Langevin dynamics.
+</abstract></arXivRaw>
+</metadata>
+</record>
+</GetRecord>
+</OAI-PMH>