summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-23 16:34:08 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-23 16:34:08 -0700
commit537db73cce0d0a71fafeeb20423491c0cb75ea7a (patch)
tree5d83b11ae8aedf7a4f54df5ae38f66e13051dc51
parent104f1fd882216dc79bd7515e0b66834a779c1bf7 (diff)
parent9fbab0defca5016c0eb5b851ff73e03efac4fac8 (diff)
downloadfatcat-537db73cce0d0a71fafeeb20423491c0cb75ea7a.tar.gz
fatcat-537db73cce0d0a71fafeeb20423491c0cb75ea7a.zip
Merge branch 'bnewbold-import-tweaks'
-rw-r--r--extra/elasticsearch/release_schema.json1
-rw-r--r--guide/src/entity_release.md10
-rw-r--r--python/fatcat_tools/importers/arxiv.py5
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py1
-rw-r--r--python/tests/import_arxiv.py3
5 files changed, 13 insertions, 7 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 57ff4eb9..85026060 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -78,6 +78,7 @@
"in_ia": { "type": "boolean" },
"in_ia_sim": { "type": "boolean" },
"in_shadows": { "type": "boolean" },
+ "is_superceded": { "type": "boolean" },
"author": { "type": "alias", "path": "contrib_names" },
"journal": { "type": "alias", "path": "container_name" },
diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md
index d9943d2a..872708a2 100644
--- a/guide/src/entity_release.md
+++ b/guide/src/entity_release.md
@@ -163,15 +163,13 @@ complete or correct in more obscure cases.
- `aliases` (array of strings) for additional titles this release might be
known by
- `container_name` (string) if not matched to a container entity
-- `subtitle` (string)
- `group-title` (string) for releases within a collection/group
- `translation_of` (release identifier) if this release is a translation of
another (usually under the same work)
-- `withdrawn_date` (string, ISO date format): if this release has been
- retracted (post-publication) or withdrawn (pre- or post-publication), this is
- the datetime of that event. Retractions also result in a `retraction` release
- under the same `work` entity. This is intended to migrate from "extra" to a
- full release entity field.
+- `superceded` (boolean) if there is another release under the same work that
+ should be referenced/indicated instead. Intended as a temporary hint until
proper work-based search is implemented. For example, all arXiv release
versions except the most recent have this flag set.
#### `release_type` Vocabulary
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 182d0471..71b2d134 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -198,6 +198,7 @@ class ArxivRawImporter(EntityImporter):
# group-title
# arxiv: comments, categories, etc
extra_arxiv['base_id'] = base_id
+ extra['superceded'] = True
extra['arxiv'] = extra_arxiv
versions = []
@@ -223,11 +224,13 @@ class ArxivRawImporter(EntityImporter):
license_slug=license_slug,
abstracts=abstracts,
contribs=contribs,
- extra=extra,
+ extra=extra.copy(),
))
# TODO: assert that versions are actually in order?
assert versions
+ versions[-1].extra.pop('superceded')
+
# only apply DOI to most recent version (HACK)
if doi:
versions[-1].ext_ids.doi = doi
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f287fe10..8589d364 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -292,6 +292,7 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('ia'):
if extra['ia'].get('sim'):
any_ia_sim = True
+ t['is_superceded'] = bool(extra.get('superceded'))
t['in_doaj'] = bool(in_doaj)
t['in_road'] = bool(in_road)
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
index bbad8fa6..1e649616 100644
--- a/python/tests/import_arxiv.py
+++ b/python/tests/import_arxiv.py
@@ -71,6 +71,7 @@ def test_arxiv_xml_parse(arxiv_importer):
assert len(r1.contribs) == 4
assert r1.extra['arxiv']['categories'] == ['cond-mat.stat-mech', 'physics.bio-ph', 'physics.data-an']
assert r1.extra['arxiv']['base_id'] == '1810.09584'
+ assert r1.extra['superceded'] == True
assert r1.contribs[0].raw_name == "Raphael Chetrite"
assert r1.contribs[0].role == "author"
@@ -92,6 +93,8 @@ def test_arxiv_xml_parse(arxiv_importer):
assert r1.extra['arxiv']['comments'] == "7 pages, 2 figures"
assert r1.extra['arxiv']['categories'] == ["cond-mat.stat-mech", "physics.bio-ph", "physics.data-an"]
+ assert not r2.extra.get('superceded')
+ r2.extra['superceded'] = True
assert r1.extra == r2.extra
assert not r1.refs