aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/arxiv.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 19:57:55 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 19:57:55 -0700
commit f00c0a6c8217fe9a3872c3744cfdfa9a91285ab6 (patch)
tree a32745597660311739966b693266842d9a4fe44b /python/fatcat_tools/importers/arxiv.py
parent aa5c9f1300088d35393e92dc21dcd62b949984cd (diff)
download fatcat-f00c0a6c8217fe9a3872c3744cfdfa9a91285ab6.tar.gz
fatcat-f00c0a6c8217fe9a3872c3744cfdfa9a91285ab6.zip
more arxiv polish
Diffstat (limited to 'python/fatcat_tools/importers/arxiv.py')
-rw-r--r-- python/fatcat_tools/importers/arxiv.py 51
1 files changed, 29 insertions, 22 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 5a33bff1..cbe66d8c 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -81,6 +81,9 @@ class ArxivRawImporter(EntityImporter):
extra = dict()
extra_arxiv = dict()
+ # don't know!
+ release_type = "article"
+
base_id = metadata.id.string
doi = None
if metadata.doi and metadata.doi.string:
@@ -112,10 +115,7 @@ class ArxivRawImporter(EntityImporter):
lang = 'ru'
# more languages?
- # don't know!
- release_type = "article"
number = None
-
if metadata.find('journal-ref') and metadata.find('journal-ref').string:
journal_ref = metadata.find('journal-ref').string.strip()
extra_arxiv['journal_ref'] = journal_ref
@@ -123,7 +123,12 @@ class ArxivRawImporter(EntityImporter):
release_type = "conference-paper"
if metadata.find('report-no') and metadata.find('report-no').string:
number = metadata.find('report-no').string.strip()
- release_type = "report"
+ # at least some people plop extra metadata in here. hrmf!
+ if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2:
+ extra_arxiv['report-no'] = number
+ number = None
+ else:
+ release_type = "report"
if metadata.find('acm-class') and metadata.find('acm-class').string:
extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
if metadata.categories and metadata.categories.string:
@@ -177,7 +182,7 @@ class ArxivRawImporter(EntityImporter):
title=title,
#original_title
version=version['version'],
- release_type="article",
+ release_type=release_type,
release_stage='submitted',
release_date=release_date.isoformat(),
release_year=release_date.year,
@@ -191,13 +196,14 @@ class ArxivRawImporter(EntityImporter):
contribs=contribs,
extra=extra,
))
- # TODO: assert that versions are actually in order
+ # TODO: assert that versions are actually in order?
assert versions
# only apply DOI to most recent version (HACK)
if doi:
versions[-1].ext_ids.doi = doi
- versions[-1].release_stage = "published"
+ if len(versions) > 1:
+ versions[-1].release_stage = "accepted"
return versions
def try_update(self, versions):
@@ -235,6 +241,11 @@ class ArxivRawImporter(EntityImporter):
except fatcat_client.rest.ApiException as err:
if err.status != 404:
raise err
+
+ if existing:
+ v._existing_work_id = existing.work_id
+ any_work_id = existing.work_id
+
if v.ext_ids.doi:
try:
existing_doi = self.api.lookup_release(arxiv=v.ext_ids.arxiv)
@@ -246,24 +257,20 @@ class ArxivRawImporter(EntityImporter):
# great, they match and have idents, nothing to do
pass
elif existing and existing.ident != existing_doi.ident:
- # could be bad, or could be that a new arxiv version was
- # created (update?)
- # stick with arxiv_id match as existing; don't update anything
+ # could be that a new arxiv version was created (update?),
+ # or that VOR has no arxiv version (or catalog is borked or
+ # something else)
+ # stick with arxiv_id match as existing, but don't set DOI;
+ # don't update anything
+ v.ext_ids.doi = None
pass
else:
assert not existing
- if not existing_doi.ext_ids.arxiv_id:
- # update the existing DOI-based record with our full arxiv_id
- existing_doi.ext_ids.arxiv_id = v.ext_ids.arxiv_id
- self.api.update_release(self.get_editgroup_id(), existing_doi.ident, existing_doi)
- self.counts['update'] += 1
- # as a flag to not count below
- v._updated = True
- existing = existing_doi
-
- if existing:
- v._existing_work_id = existing.work_id
- any_work_id = existing.work_id
+ # there's a pre-existing DOI release we should group under,
+ # but we don't know if we're the version-of-record or what,
+ # so just group but don't update existing DOI release
+ v.ext_ids.doi = None
+ any_work_id = any_work_id or existing_doi.work_id
last_edit = None
for v in versions: