diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:34:21 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | b9f6e139ba672d430c0918062fd3dd4f942fd812 (patch) | |
tree | eb42b0a5fe092379a0923c019171dc777b6a6031 /python | |
parent | 9688cedac61729bc417a3cb31096f52bdb6f16db (diff) | |
download | fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.tar.gz fatcat-b9f6e139ba672d430c0918062fd3dd4f942fd812.zip |
arxiv importer polish
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 7 | ||||
-rw-r--r-- | python/tests/import_arxiv.py | 3 |
2 files changed, 6 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 0d0179cd..03ef10d6 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -45,7 +45,6 @@ class ArxivRawImporter(EntityImporter): """ Converts arxiv.org "arXivRaw" OAI-PMH XML records to fatcat release entities - TODO: this will require a special importer that keeps works together TODO: arxiv_id lookup in API (rust) with no version specified should select the "most recent" version; can be a simple sort? """ @@ -105,6 +104,7 @@ class ArxivRawImporter(EntityImporter): # don't know! release_type = "article" + number = None if metadata.find('journal-ref') and metadata.find('journal-ref').string: journal_ref = metadata.find('journal-ref').string.strip() @@ -112,7 +112,7 @@ class ArxivRawImporter(EntityImporter): if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "conference-paper" if metadata.find('report-no') and metadata.find('report-no').string: - extra['number'] = metadata.find('report-no').string.strip() + number = metadata.find('report-no').string.strip() release_type = "report" if metadata.find('acm-class') and metadata.find('acm-class').string: extra_arxiv['acm_class'] = metadata.find('acm_class').string.strip() @@ -161,7 +161,7 @@ class ArxivRawImporter(EntityImporter): arxiv_id = base_id + version['version'] release_date = version.date.string.strip() release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() - # XXX: source_type? + # TODO: source_type? versions.append(fatcat_client.ReleaseEntity( work_id=None, title=title, @@ -174,6 +174,7 @@ class ArxivRawImporter(EntityImporter): ext_ids=fatcat_client.ReleaseExtIds( arxiv=arxiv_id, ), + number=number, language=lang, license_slug=license_slug, abstracts=abstracts, diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py index 8d91be10..bc4fca02 100644 --- a/python/tests/import_arxiv.py +++ b/python/tests/import_arxiv.py @@ -69,7 +69,8 @@ def test_arxiv_xml_parse(arxiv_importer): # matched by ISSN, so shouldn't be in there? #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London" assert len(r1.contribs) == 4 - # XXX: extra['arxiv'] stuff + assert r1.extra['arxiv']['categories'] == ['cond-mat.stat-mech', 'physics.bio-ph', 'physics.data-an'] + assert r1.extra['arxiv']['base_id'] == '1810.09584' assert r1.contribs[0].raw_name == "Raphael Chetrite" assert r1.contribs[0].role == "author" |