diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 17:35:36 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 | 
| commit | 3ec275c7d78aa261027f35c26366a382c5dd7a6c (patch) | |
| tree | 9356ac9a145a4709c0a8339eb751f4c4d7840936 | |
| parent | 6fc6232250241b3bfdddb5e62501a26dfaac6827 (diff) | |
| download | fatcat-3ec275c7d78aa261027f35c26366a382c5dd7a6c.tar.gz fatcat-3ec275c7d78aa261027f35c26366a382c5dd7a6c.zip | |
basic arxivraw XML parser
| -rw-r--r-- | python/parse_arxivraw_xml.py | 197 | ||||
| -rw-r--r-- | python/tests/files/arxivraw_1810.09584.xml | 31 | 
2 files changed, 228 insertions, 0 deletions
"""Parse arxiv.org "arXivRaw" OAI-PMH XML into release-shaped dictionaries.

Run as a script with the path of an XML file as the only argument; one JSON
document is printed per ``<record>`` element found in the file.
"""

import sys
import json
import datetime
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from pylatexenc.latex2text import LatexNodes2Text


latex2text = LatexNodes2Text()

# Phrases searched for (case-insensitively) in the free-text "comments" field,
# paired with the language code they imply. Order matters: first match wins.
_COMMENT_LANGUAGES = (
    ('in french', 'fr'),
    ('in spanish', 'es'),
    ('in portuguese', 'pt'),
    ('in hindi', 'hi'),
    ('in japanese', 'ja'),
    ('in german', 'de'),
    ('simplified chinese', 'zh'),
    ('in russian', 'ru'),
    # more languages?
)

# Format of the <date> element inside <version>,
# e.g. "Mon, 22 Oct 2018 22:41:50 GMT"
_VERSION_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"


def _tag_text(parent, name):
    """Return the stripped text of the first `name` tag under `parent`.

    Returns None when the tag is missing, has no single string child, or
    contains only whitespace.
    """
    tag = parent.find(name)
    if tag is None or not tag.string:
        return None
    return tag.string.strip() or None


def _guess_language(comments, default="en"):
    """Guess a language code from a comments string, else return `default`."""
    lowered = comments.lower()
    for phrase, code in _COMMENT_LANGUAGES:
        if phrase in lowered:
            return code
    return default


def parse_arxiv_authors(raw):
    """Split a raw author string into a list of plain-text names.

    Names are separated on ", "; the final pair may instead be joined with
    " and " (optionally preceded by a comma, as in "A, B, and C"). Runs of
    whitespace, including line breaks, are collapsed before splitting so that
    a wrapped author list is still split correctly. Each name is converted
    from LaTeX markup to plain text.

    Returns an empty list when `raw` is None or empty.
    """
    if not raw:
        return []
    authors = " ".join(raw.split()).split(', ')
    last = authors.pop()
    if last.startswith("and "):
        # "A, B, and C": the comma split already isolated the final name
        authors.append(last[len("and "):])
    else:
        pieces = last.split(" and ")
        if len(pieces) == 2:
            authors.extend(pieces)
        else:
            authors.append(last)
    names = [latex2text.latex_to_text(a).strip() for a in authors]
    return [name for name in names if name]


def test_parse_arxiv_authors():

    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
        "Raphael Chetrite",
        "Shamik Gupta",
        "Izaak Neri",
        "Édgar Roldán",
    ]
    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
        "Izaak Neri",
        "Édgar Roldán",
    ]
    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
        "Raphael Chetrite Shamik Gupta",
    ]
    assert parse_arxiv_authors("Alice Smith,\n  Bob Jones, and Carol White") == [
        "Alice Smith",
        "Bob Jones",
        "Carol White",
    ]
    assert parse_arxiv_authors("") == []
    assert parse_arxiv_authors(None) == []


class ArxivRawXmlParser:
    """
    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities

    TODO: this will require a special importer that keeps works together
    TODO: arxiv_id lookup in API (rust) with no version specified should select
          the "most recent" version; can be a simple sort?
    """

    def parse_file(self, handle):
        """Parse an XML document and print one JSON line per record.

        `handle` is anything BeautifulSoup accepts as markup (an open file or
        a string). Each line printed is the JSON form of what `parse_record`
        returns: a two-element array of the base arXiv id and the versions.
        """
        soup = BeautifulSoup(handle, "xml")
        for record in soup.find_all("record"):
            print(json.dumps(self.parse_record(record)))

    def parse_record(self, record):
        """Convert one ``<record>`` element into per-version release dicts.

        Returns a tuple ``(base_id, versions)``. `versions` holds one dict
        per ``<version>`` element, in document order. All of them share the
        same title, contributors, abstracts, language, release type and
        extra metadata; they differ in `arxiv_id` and release date. When a
        DOI is present it is attached to the last version only, and that
        version is marked "published".

        Raises AssertionError if a DOI is present but does not start with
        "10.".
        """
        metadata = record.arXivRaw
        extra = dict()
        extra_arxiv = dict()

        base_id = metadata.id.string
        doi = _tag_text(metadata, 'doi')
        if doi:
            doi = doi.lower()
            assert doi.startswith('10.'), "unexpected DOI: {}".format(doi)
        title = latex2text.latex_to_text(metadata.title.string)
        authors = parse_arxiv_authors(metadata.authors.string)
        contribs = [dict(raw_name=a, role='author') for a in authors]

        lang = "en"     # the vast majority in english
        comments = _tag_text(metadata, 'comments')
        if comments:
            extra_arxiv['comments'] = comments
            lang = _guess_language(comments, default=lang)

        release_type = "article-journal"
        journal_ref = _tag_text(metadata, 'journal-ref')
        if journal_ref:
            extra_arxiv['journal_ref'] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                release_type = "conference-paper"
        report_no = _tag_text(metadata, 'report-no')
        if report_no:
            # a report number takes precedence over the journal-ref guess
            extra['number'] = report_no
            release_type = "report"
        acm_class = _tag_text(metadata, 'acm-class')
        if acm_class:
            extra_arxiv['acm_class'] = acm_class
        categories = _tag_text(metadata, 'categories')
        if categories:
            extra_arxiv['categories'] = categories.split()
        # TODO: the <license> element holds a license URL; convert it to a
        # slug and add it to each version as license_slug

        abstracts = None
        abst = _tag_text(metadata, 'abstract')
        if abst:
            abstracts = []
            orig = None
            if '-----' in abst:
                # a run of dashes separates the abstract from a second text
                # block (kept as `orig`, with no language recorded)
                both = abst.split('-----')
                abst = both[0].strip()
                orig = both[1].strip()
            if '$' in abst or '{' in abst:
                # looks like LaTeX: emit a plain-text rendering first, then
                # the original markup
                mime = "application/x-latex"
                abst_plain = latex2text.latex_to_text(abst)
                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(dict(content=abst, mime=mime, lang="en"))
            if orig:
                abstracts.append(dict(content=orig, mime=mime))

        if extra_arxiv:
            extra['arxiv'] = extra_arxiv
        if not extra:
            extra = None

        versions = []
        for version in metadata.find_all('version'):
            arxiv_id = base_id + version['version']
            release_date = datetime.datetime.strptime(
                version.date.string.strip(), _VERSION_DATE_FORMAT)
            versions.append(dict(
                work_id=None,
                title=title,
                release_type=release_type,
                release_status='submitted', # XXX: source_type?
                release_date=release_date.isoformat() + "Z",
                release_year=release_date.year,
                arxiv_id=arxiv_id,
                # doi is set below, on the most recent version only
                language=lang,
                # content, mime, lang
                abstracts=abstracts,
                # raw_name, role
                contribs=contribs,
                #container=container,   # very little/none; resolve via DOI?
                extra=extra,
            ))

        # only apply DOI to most recent version (HACK)
        if doi and versions:
            versions[-1]['doi'] = doi
            versions[-1]['release_status'] = "published"
        return base_id, versions


if __name__ == '__main__':
    parser = ArxivRawXmlParser()
    # binary mode lets the XML parser honour the document's own encoding
    # declaration instead of the locale default
    with open(sys.argv[1], "rb") as xml_file:
        parser.parse_file(xml_file)
GMT</date><size>669kb</size><source_type>D</source_type></version><title>Martingale theory for housekeeping heat</title><authors>Raphael Chetrite, Shamik Gupta, Izaak Neri and \'Edgar Rold\'an</authors><categories>cond-mat.stat-mech physics.bio-ph physics.data-an</categories><comments>7 pages, 2 figures</comments><journal-ref>EPL 124,60006 (2018)</journal-ref><doi>10.1209/0295-5075/124/60006</doi><license>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</license><abstract>  The housekeeping heat is the energy exchanged between a system and its +environment in a nonequilibrium process that results from the violation of +detailed balance. We describe fluctuations of the housekeeping heat in +mesoscopic systems using the theory of martingales, a mathematical framework +widely used in probability theory and finance. We show that the exponentiated +housekeeping heat (in units of $k_{\rm B}T$, with $k_{\rm B}$ the Boltzmann +constant and $T$ the temperature) of a Markovian nonequilibrium process under +arbitrary time-dependent driving is a martingale process. From this result, we +derive universal equalities and inequalities for the statistics of +stopping-times and suprema of the housekeeping heat. We test our results with +numerical simulations of a system driven out of equilibrium and described by +Langevin dynamics. +</abstract></arXivRaw> +</metadata> +</record> +</GetRecord> +</OAI-PMH> | 
