From b9f6e139ba672d430c0918062fd3dd4f942fd812 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 May 2019 11:34:21 -0700 Subject: arxiv importer polish --- python/fatcat_tools/importers/arxiv.py | 7 ++++--- python/tests/import_arxiv.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 0d0179cd..03ef10d6 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -45,7 +45,6 @@ class ArxivRawImporter(EntityImporter): """ Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities - TODO: this will require a special importer that keeps works together TODO: arxiv_id lookup in API (rust) with no version specified should select the "most recent" version; can be a simple sort? """ @@ -105,6 +104,7 @@ class ArxivRawImporter(EntityImporter): # don't know! release_type = "article" + number = None if metadata.find('journal-ref') and metadata.find('journal-ref').string: journal_ref = metadata.find('journal-ref').string.strip() @@ -112,7 +112,7 @@ class ArxivRawImporter(EntityImporter): if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "conference-paper" if metadata.find('report-no') and metadata.find('report-no').string: - extra['number'] = metadata.find('report-no').string.strip() + number = metadata.find('report-no').string.strip() release_type = "report" if metadata.find('acm-class') and metadata.find('acm-class').string: extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip() @@ -161,7 +161,7 @@ class ArxivRawImporter(EntityImporter): arxiv_id = base_id + version['version'] release_date = version.date.string.strip() release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() - # XXX: source_type? + # TODO: source_type? versions.append(fatcat_client.ReleaseEntity( work_id=None, title=title, @@ -174,6 +174,7 @@ class ArxivRawImporter(EntityImporter): ext_ids=fatcat_client.ReleaseExtIds( arxiv=arxiv_id, ), + number=number, language=lang, license_slug=license_slug, abstracts=abstracts, diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py index 8d91be10..bc4fca02 100644 --- a/python/tests/import_arxiv.py +++ b/python/tests/import_arxiv.py @@ -69,7 +69,8 @@ def test_arxiv_xml_parse(arxiv_importer): # matched by ISSN, so shouldn't be in there? #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London" assert len(r1.contribs) == 4 - # XXX: extra['arxiv'] stuff + assert r1.extra['arxiv']['categories'] == ['cond-mat.stat-mech', 'physics.bio-ph', 'physics.data-an'] + assert r1.extra['arxiv']['base_id'] == '1810.09584' assert r1.contribs[0].raw_name == "Raphael Chetrite" assert r1.contribs[0].role == "author" -- cgit v1.2.3