| Field | Value | Timestamp |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 17:11:52 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
| commit | 91879651d7aa8a18a5fbd2b57dd60c171d6c8fba (patch) | |
| tree | df8d1fd330e41de9e0c9b0a7dcfa97a7dbe6cf02 /python | |
| parent | 1b592132fe1a127368189e07bdbf9a16a807a284 (diff) | |
initial arxivraw importer (from parser)
Diffstat (limited to 'python')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 298 |
| -rw-r--r-- | python/parse_arxivraw_xml.py | 198 |
| -rw-r--r-- | python/tests/import_arxiv.py | 96 |

4 files changed, 395 insertions(+), 198 deletions(-)
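For context before reading the diff: the new importer consumes arXivRaw OAI-PMH records. Below is an illustrative sketch (not part of this commit, and not the actual `arxivraw_1810.09584.xml` test fixture): the element names follow the arXivRaw format, the values are trimmed and approximate, and the BeautifulSoup access pattern mirrors what the new `parse_record()` does.

```python
# Illustrative only: a trimmed arXivRaw-style record and the element access
# pattern used by ArxivRawImporter.parse_record() in the diff below.
from bs4 import BeautifulSoup  # the "xml" parser requires lxml

RECORD = """
<record>
  <metadata>
    <arXivRaw>
      <id>1810.09584</id>
      <version version="v1">
        <date>Mon, 22 Oct 2018 00:00:00 GMT</date>
      </version>
      <version version="v2">
        <date>Sun, 13 Jan 2019 00:00:00 GMT</date>
      </version>
      <title>Martingale theory for housekeeping heat</title>
      <authors>Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an</authors>
      <comments>7 pages, 2 figures</comments>
      <journal-ref>EPL 124, 60006 (2019)</journal-ref>
      <doi>10.1209/0295-5075/124/60006</doi>
      <abstract>The housekeeping heat is the energy exchanged ...</abstract>
    </arXivRaw>
  </metadata>
</record>
"""

soup = BeautifulSoup(RECORD, "xml")
metadata = soup.find("record").arXivRaw
print(metadata.id.string)                   # base arxiv_id, without version suffix
print(metadata.find("journal-ref").string)  # hyphenated tag names need find()
for version in metadata.find_all("version"):
    # each <version> element becomes one fatcat release, e.g. "1810.09584v1"
    print(metadata.id.string + version["version"], version.date.string)
```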
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 497946ea..8ec219f8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -16,6 +16,7 @@ from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, Sqlit
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
 from .jstor import JstorImporter
+from .arxiv import ArxivRawImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
new file mode 100644
index 00000000..c53e47f1
--- /dev/null
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -0,0 +1,298 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+import fatcat_client
+from .common import EntityImporter, clean
+from .crossref import lookup_license_slug
+
+
+latex2text = LatexNodes2Text()
+
+def parse_arxiv_authors(raw):
+    if not raw:
+        return []
+    authors = raw.split(', ')
+    if authors:
+        last = authors[-1].split(" and ")
+        if len(last) == 2:
+            authors[-1] = last[0]
+            authors.append(last[1])
+    authors = [latex2text.latex_to_text(a).strip() for a in authors]
+    return authors
+
+def test_parse_arxiv_authors():
+
+    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Raphael Chetrite",
+        "Shamik Gupta",
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
+        "Raphael Chetrite Shamik Gupta",
+    ]
+
+
+class ArxivRawImporter(EntityImporter):
+    """
+    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
+
+    TODO: this will require a special importer that keeps works together
+    TODO: arxiv_id lookup in API (rust) with no version specified should select
+          the "most recent" version; can be a simple sort?
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+        # lower batch size, because multiple versions per entry (guessing 2-3 on average?)
+        batch_size = kwargs.get('edit_batch_size', 50)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            batch_size=batch_size,
+            **kwargs)
+        self._test_override = False
+
+
+    def parse_record(self, record):
+
+        metadata = record.arXivRaw
+        extra = dict()
+        extra_arxiv = dict()
+
+        base_id = metadata.id.string
+        doi = None
+        if metadata.doi and metadata.doi.string:
+            doi = metadata.doi.string.lower().strip()
+            assert doi.startswith('10.')
+        title = latex2text.latex_to_text(metadata.title.string)
+        authors = parse_arxiv_authors(metadata.authors.string)
+        contribs = [fatcat_client.ReleaseContrib(raw_name=a, role='author') for a in authors]
+
+        lang = "en"     # the vast majority in english
+        if metadata.comments and metadata.comments.string:
+            comments = metadata.comments.string.strip()
+            extra_arxiv['comments'] = comments
+            if 'in french' in comments.lower():
+                lang = 'fr'
+            elif 'in spanish' in comments.lower():
+                lang = 'es'
+            elif 'in portuguese' in comments.lower():
+                lang = 'pt'
+            elif 'in hindi' in comments.lower():
+                lang = 'hi'
+            elif 'in japanese' in comments.lower():
+                lang = 'ja'
+            elif 'in german' in comments.lower():
+                lang = 'de'
+            elif 'simplified chinese' in comments.lower():
+                lang = 'zh'
+            elif 'in russian' in comments.lower():
+                lang = 'ru'
+            # more languages?
+
+        release_type = "article-journal"
+
+        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
+            journal_ref = metadata.find('journal-ref').string.strip()
+            extra_arxiv['journal_ref'] = journal_ref
+            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
+                release_type = "conference-paper"
+        if metadata.find('report-no') and metadata.find('report-no').string:
+            extra['number'] = metadata.find('report-no').string.strip()
+            release_type = "report"
+        if metadata.find('acm-class') and metadata.find('acm-class').string:
+            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+        if metadata.categories and metadata.categories.string:
+            extra_arxiv['categories'] = metadata.categories.string.split()
+        license_slug = None
+        if metadata.license and metadata.license.string:
+            license_slug = lookup_license_slug(metadata.license.string)
+        abstracts = None
+        if metadata.abstract:
+            # TODO: test for this multi-abstract code path
+            abstracts = []
+            abst = metadata.abstract.string.strip()
+            orig = None
+            if '-----' in abst:
+                both = abst.split('-----')
+                abst = both[0].strip()
+                orig = both[1].strip()
+            if '$' in abst or '{' in abst:
+                mime = "application/x-latex"
+                abst_plain = latex2text.latex_to_text(abst)
+                abstracts.append(fatcat_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+            else:
+                mime = "text/plain"
+            abstracts.append(fatcat_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+            if orig:
+                abstracts.append(fatcat_client.ReleaseAbstract(content=orig, mimetype=mime))
+                # indicates that fulltext probably isn't english either
+                if lang == 'en':
+                    lang = None
+
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   arxiv: comments, categories, etc
+        extra_arxiv['base_id'] = base_id
+        extra['arxiv'] = extra_arxiv
+
+        versions = []
+        for version in metadata.find_all('version'):
+            arxiv_id = base_id + version['version']
+            release_date = version.date.string.strip()
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+            # XXX: source_type?
+            versions.append(fatcat_client.ReleaseEntity(
+                work_id=None,
+                title=title,
+                #original_title
+                version=version['version'],
+                release_type=release_type,
+                release_stage='submitted',
+                release_date=release_date.isoformat(),
+                release_year=release_date.year,
+                ext_ids=fatcat_client.ReleaseExtIds(
+                    arxiv=arxiv_id,
+                ),
+                language=lang,
+                license_slug=license_slug,
+                abstracts=abstracts,
+                contribs=contribs,
+                extra=extra,
+            ))
+        # TODO: assert that versions are actually in order
+        assert versions
+
+        # only apply DOI to most recent version (HACK)
+        if doi:
+            versions[-1].ext_ids.doi = doi
+            versions[-1].release_stage = "published"
+        return versions
+
+    def try_update(self, versions):
+        """
+        This is pretty complex! There is no batch/bezerk mode for arxiv importer.
+
+        For each version, do a lookup by full arxiv_id, and store work/release
+        id results.
+
+        If a version has a DOI, also do a doi lookup and store that result. If
+        there is an existing release with both matching, set that as the
+        existing work. If they don't match, use the full arxiv_id match and
+        move on (maybe log or at least count the error?). If it's a
+        one/or/other case, update the existing release (and mark version as
+        existing).
+
+        If there was any existing release, take its work_id.
+
+        Iterate back through versions. If it didn't already exist, insert it
+        with any existing work_id. If there wasn't an existing work_id, lookup
+        the new release (by rev from edit?) and use that for the rest.
+
+        Do not pass any versions on for batch insert.
+        """
+
+        # first do lookups
+        any_work_id = None
+        for v in versions:
+            v._existing_work_id = None
+            v._updated = False
+            existing = None
+            existing_doi = None
+            try:
+                existing = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if v.ext_ids.doi:
+                try:
+                    existing_doi = self.api.lookup_release(doi=v.ext_ids.doi)
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+            if existing_doi:
+                if existing and existing.ident == existing_doi.ident:
+                    # great, they match and have idents, nothing to do
+                    pass
+                elif existing and existing.ident != existing_doi.ident:
+                    # could be bad, or could be that a new arxiv version was
+                    # created (update?)
+                    # stick with arxiv_id match as existing; don't update anything
+                    pass
+                else:
+                    assert not existing
+                    if not existing_doi.ext_ids.arxiv:
+                        # update the existing DOI-based record with our full arxiv_id
+                        existing_doi.ext_ids.arxiv = v.ext_ids.arxiv
+                        self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
+                        self.counts['update'] += 1
+                        # as a flag to not count below
+                        v._updated = True
+                    existing = existing_doi
+
+            v._existing_work_id = existing.work_id
+            any_work_id = existing.work_id
+
+        last_edit = None
+        for v in versions:
+            if v._existing_work_id:
+                if not v._updated:
+                    self.counts['exists'] += 1
+                continue
+            if not any_work_id and last_edit:
+                # fetch the last inserted release from this group
+                r = self.api.get_release_rev(last_edit.rev)
+                assert r.work_id
+                any_work_id = r.work_id
+            v.work_id = any_work_id
+            last_edit = self.api.insert_release(self.get_editgroup_id(), v)
+            self.counts['insert'] += 1
+
+        return False
+
+    def insert_batch(self, batch_batch):
+        # there is no batch/bezerk mode for arxiv importer, except for testing
+        if self._test_override:
+            for batch in batch_batch:
+                self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+                    editgroup=fatcat_client.Editgroup(
+                        description=self.editgroup_description,
+                        extra=self.editgroup_extra),
+                    entity_list=batch))
+                self.counts['insert'] += len(batch) - 1
+        else:
+            raise NotImplementedError()
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("record"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = ArxivRawImporter()
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
deleted file mode 100644
index 9b9f28c9..00000000
--- a/python/parse_arxivraw_xml.py
+++ /dev/null
@@ -1,198 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-from pylatexenc.latex2text import LatexNodes2Text
-
-
-latex2text = LatexNodes2Text()
-
-def parse_arxiv_authors(raw):
-    if not raw:
-        return []
-    authors = raw.split(', ')
-    if authors:
-        last = authors[-1].split(" and ")
-        if len(last) == 2:
-            authors[-1] = last[0]
-            authors.append(last[1])
-    authors = [latex2text.latex_to_text(a).strip() for a in authors]
-    return authors
-
-def test_parse_arxiv_authors():
-
-    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Raphael Chetrite",
-        "Shamik Gupta",
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Izaak Neri and \\'Edgar Rold\\'an") == [
-        "Izaak Neri",
-        "Édgar Roldán",
-    ]
-    assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
-        "Raphael Chetrite Shamik Gupta",
-    ]
-
-class ArxivRawXmlParser():
-    """
-    Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
-
-    TODO: this will require a special importer that keeps works together
-    TODO: arxiv_id lookup in API (rust) with no version specified should select
-          the "most recent" version; can be a simple sort?
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("record"):
-            resp = self.parse_record(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-
-    def parse_record(self, record):
-
-        metadata = record.arXivRaw
-        extra = dict()
-        extra_arxiv = dict()
-
-        base_id = metadata.id.string
-        doi = None
-        if metadata.doi and metadata.doi.string:
-            doi = metadata.doi.string.lower().strip()
-            assert doi.startswith('10.')
-        title = latex2text.latex_to_text(metadata.title.string)
-        authors = parse_arxiv_authors(metadata.authors.string)
-        contribs = [dict(raw_name=a, role='author') for a in authors]
-
-        lang = "en"     # the vast majority in english
-        if metadata.comments and metadata.comments.string:
-            comments = metadata.comments.string.strip()
-            extra_arxiv['comments'] = comments
-            if 'in french' in comments.lower():
-                lang = 'fr'
-            elif 'in spanish' in comments.lower():
-                lang = 'es'
-            elif 'in portuguese' in comments.lower():
-                lang = 'pt'
-            elif 'in hindi' in comments.lower():
-                lang = 'hi'
-            elif 'in japanese' in comments.lower():
-                lang = 'ja'
-            elif 'in german' in comments.lower():
-                lang = 'de'
-            elif 'simplified chinese' in comments.lower():
-                lang = 'zh'
-            elif 'in russian' in comments.lower():
-                lang = 'ru'
-            # more languages?
-
-        release_type = "article-journal"
-
-        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
-            journal_ref = metadata.find('journal-ref').string.strip()
-            extra_arxiv['journal_ref'] = journal_ref
-            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
-                release_type = "conference-paper"
-        if metadata.find('report-no') and metadata.find('report-no').string:
-            extra['number'] = metadata.find('report-no').string.strip()
-            release_type = "report"
-        if metadata.find('acm-class') and metadata.find('acm-class').string:
-            extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
-        if metadata.categories and metadata.categories.string:
-            extra_arxiv['categories'] = metadata.categories.string.split()
-        license_slug = None
-        if metadata.license and metadata.license.string:
-            # XXX: convert URL to slug
-            license_slug = metadata.license.string.strip()
-        abstracts = None
-        if metadata.abstract:
-            # TODO: test for this multi-abstract code path
-            abstracts = []
-            abst = metadata.abstract.string.strip()
-            orig = None
-            if '-----' in abst:
-                both = abst.split('-----')
-                abst = both[0].strip()
-                orig = both[1].strip()
-            if '$' in abst or '{' in abst:
-                mime = "application/x-latex"
-                abst_plain = latex2text.latex_to_text(abst)
-                abstracts.append(dict(content=abst_plain, mime="text/plain", lang="en"))
-            else:
-                mime = "text/plain"
-            abstracts.append(dict(content=abst, mime=mime, lang="en"))
-            if orig:
-                abstracts.append(dict(content=orig, mime=mime))
-
-        if extra_arxiv:
-            extra['arxiv'] = extra_arxiv
-        if not extra:
-            extra = None
-
-        versions = []
-        for version in metadata.find_all('version'):
-            arxiv_id = base_id + version['version']
-            release_date = version.date.string.strip()
-            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
-            versions.append(dict(
-                work_id=None,
-                title=title,
-                #original_title
-                release_type="article-journal",
-                release_status='submitted', # XXX: source_type?
-                release_date=release_date.isoformat(),
-                release_year=release_date.year,
-                arxiv_id=arxiv_id,
-                #doi (see below)
-                #pmid
-                #pmcid
-                #isbn13     # never in Article
-                #volume
-                #issue
-                #pages
-                #publisher
-                language=lang,
-                #license_slug   # not in MEDLINE
-
-                # content, mimetype, lang
-                abstracts=abstracts,
-
-                # raw_name, role, raw_affiliation, extra
-                contribs=contribs,
-
-                #   name, type, publisher, issnl
-                #   extra: issnp, issne, original_name, languages, country
-                #container=container,   # very little/none; resolve via DOI?
-
-                # extra:
-                #   withdrawn_date
-                #   translation_of
-                #   subtitle
-                #   aliases
-                #   container_name
-                #   group-title
-                #   pubmed: retraction refs
-                extra=extra,
-            ))
-
-        # only apply DOI to most recent version (HACK)
-        if doi:
-            versions[-1]['doi'] = doi
-            versions[-1]['release_status'] = "published"
-        return base_id, versions
-
-if __name__=='__main__':
-    parser = ArxivRawXmlParser()
-    parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
new file mode 100644
index 00000000..726bafc5
--- /dev/null
+++ b/python/tests/import_arxiv.py
@@ -0,0 +1,96 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def arxiv_importer(api):
+    ari = ArxivRawImporter(api, bezerk_mode=True)
+    ari._test_override = True
+    return ari
+
+def test_arxiv_importer(arxiv_importer):
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = True
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 2
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = arxiv_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "arxiv" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.ArxivRawImporter" in eg.extra['agent']
+
+    last_index = arxiv_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        arxiv_importer.bezerk_mode = False
+        arxiv_importer.reset()
+        counts = Bs4XmlFilePusher(arxiv_importer, f, "record").run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 2
+    assert counts['skip'] == 0
+    assert last_index == arxiv_importer.api.get_changelog(limit=1)[0].index
+
+def test_arxiv_xml_parse(arxiv_importer):
+    with open('tests/files/arxivraw_1810.09584.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r = arxiv_importer.parse_record(soup.find_all("record")[0])
+
+    r1 = r[0]
+    r2 = r[1]
+    print(r1.extra)
+    print(r2.extra)
+    assert r1.work_id == r2.work_id
+    assert r1.title == "Martingale theory for housekeeping heat"
+    assert r1.subtitle == None
+    assert r1.original_title == None
+    assert r1.release_type == "article-journal"
+    assert r1.release_stage == "submitted"
+    assert r2.release_stage == "published"
+    assert r1.license_slug == "ARXIV-NED-1.0"
+    assert r1.version == "v1"
+    assert r2.version == "v2"
+    assert r1.ext_ids.arxiv == "1810.09584v1"
+    assert r2.ext_ids.arxiv == "1810.09584v2"
+    assert r1.ext_ids.doi == None
+    assert r2.ext_ids.doi == "10.1209/0295-5075/124/60006"
+    assert r1.release_year == 2018
+    assert str(r1.release_date) == "2018-10-22"
+    assert r2.release_year == 2019
+    assert str(r2.release_date) == "2019-01-13"
+    # matched by ISSN, so shouldn't be in there?
+    #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+    assert len(r1.contribs) == 4
+    # XXX: extra['arxiv'] stuff
+
+    assert r1.contribs[0].raw_name == "Raphael Chetrite"
+    assert r1.contribs[0].role == "author"
+    assert r1.contribs[1].raw_name == "Shamik Gupta"
+    assert r1.contribs[2].raw_name == "Izaak Neri"
+    assert r1.contribs[3].raw_name == "Édgar Roldán"
+    assert r1.contribs[3].role == "author"
+
+    assert len(r1.contribs) == 4
+    assert r1.contribs == r2.contribs
+
+    assert r1.abstracts[0].content.startswith("The housekeeping heat is the energy exchanged")
+    # order isn't deterministic
+    assert "application/x-latex" in [a.mimetype for a in r1.abstracts]
+    assert "text/plain" in [a.mimetype for a in r1.abstracts]
+
+    assert r1.abstracts == r2.abstracts
+
+    assert r1.extra['arxiv']['comments'] == "7 pages, 2 figures"
+    assert r1.extra['arxiv']['categories'] == ["cond-mat.stat-mech", "physics.bio-ph", "physics.data-an"]
+
+    assert r1.extra == r2.extra
+
+    assert not r1.refs
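Not part of this commit, but for orientation: a minimal sketch of how the new importer could be driven over a local arXivRaw XML dump, reusing the Bs4XmlFilePusher pattern from the test above. The authenticated_api() helper and the localhost API endpoint are assumptions borrowed from how other importers are invoked via fatcat_import.py; the actual CLI wiring for ArxivRawImporter is not added in this diff.

```python
# Hypothetical driver script, mirroring tests/import_arxiv.py above.
# Assumes authenticated_api() and the default local API endpoint; adjust to
# however the importer eventually gets wired into fatcat_import.py.
import sys

from fatcat_tools import authenticated_api
from fatcat_tools.importers import ArxivRawImporter, Bs4XmlFilePusher

def main():
    api = authenticated_api("http://localhost:9411/v0")  # assumed dev endpoint
    importer = ArxivRawImporter(api)
    with open(sys.argv[1], "r") as xml_file:
        # push one <record> element at a time, exactly like the test does
        counts = Bs4XmlFilePusher(importer, xml_file, "record").run()
    print(counts, file=sys.stderr)

if __name__ == "__main__":
    main()
```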
