arxiv importer polish

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:34:21 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: b9f6e139ba672d430c0918062fd3dd4f942fd812 (patch)
tree: eb42b0a5fe092379a0923c019171dc777b6a6031 /python
parent: 9688cedac61729bc417a3cb31096f52bdb6f16db (diff)
download: fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.tar.gz fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.zip
2 files changed, 6 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 0d0179cd..03ef10d6 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -45,7 +45,6 @@ class ArxivRawImporter(EntityImporter):
     """
     Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities
 
-    TODO: this will require a special importer that keeps works together
     TODO: arxiv_id lookup in API (rust) with no version specified should select
           the "most recent" version; can be a simple sort?
     """
@@ -105,6 +104,7 @@ class ArxivRawImporter(EntityImporter):
 
         # don't know!
         release_type = "article"
+        number = None
 
         if metadata.find('journal-ref') and metadata.find('journal-ref').string:
             journal_ref = metadata.find('journal-ref').string.strip()
@@ -112,7 +112,7 @@ class ArxivRawImporter(EntityImporter):
             if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                 release_type = "conference-paper"
         if metadata.find('report-no') and metadata.find('report-no').string:
-            extra['number'] = metadata.find('report-no').string.strip()
+            number = metadata.find('report-no').string.strip()
             release_type = "report"
         if metadata.find('acm-class') and metadata.find('acm-class').string:
             extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip()
@@ -161,7 +161,7 @@ class ArxivRawImporter(EntityImporter):
             arxiv_id = base_id + version['version']
             release_date = version.date.string.strip()
             release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
-            # XXX: source_type?
+            # TODO: source_type?
             versions.append(fatcat_client.ReleaseEntity(
                 work_id=None,
                 title=title,
@@ -174,6 +174,7 @@ class ArxivRawImporter(EntityImporter):
                 ext_ids=fatcat_client.ReleaseExtIds(
                     arxiv=arxiv_id,
                 ),
+                number=number,
                 language=lang,
                 license_slug=license_slug,
                 abstracts=abstracts,
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
index 8d91be10..bc4fca02 100644
--- a/python/tests/import_arxiv.py
+++ b/python/tests/import_arxiv.py
@@ -69,7 +69,8 @@ def test_arxiv_xml_parse(arxiv_importer):
     # matched by ISSN, so shouldn't be in there?
     #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
     assert len(r1.contribs) == 4
-    # XXX: extra['arxiv'] stuff
+    assert r1.extra['arxiv']['categories'] == ['cond-mat.stat-mech', 'physics.bio-ph', 'physics.data-an']
+    assert r1.extra['arxiv']['base_id'] == '1810.09584'
 
     assert r1.contribs[0].raw_name == "Raphael Chetrite"
     assert r1.contribs[0].role == "author"
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:34:21 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	b9f6e139ba672d430c0918062fd3dd4f942fd812 (patch)
tree	eb42b0a5fe092379a0923c019171dc777b6a6031 /python
parent	9688cedac61729bc417a3cb31096f52bdb6f16db (diff)
download	fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.tar.gz fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.zip