aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-01-24 16:16:22 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-01-24 16:16:30 -0800
commita85de0b6e82c65d932c84a41b284083b4879934a (patch)
tree231688205efb16670471c48ca719d3e636771f3d
parent6b1b131ba5e899a069fe280663d331932a8cbae5 (diff)
downloadfatcat-a85de0b6e82c65d932c84a41b284083b4879934a.tar.gz
fatcat-a85de0b6e82c65d932c84a41b284083b4879934a.zip
tweak crossref import, and update tests
-rw-r--r--guide/src/entity_fields.md18
-rw-r--r--python/fatcat_tools/importers/crossref.py38
-rw-r--r--python/fatcat_web/templates/release_view.html6
-rw-r--r--python/tests/files/crossref-works.single.json24
-rw-r--r--python/tests/import_crossref.py35
5 files changed, 89 insertions, 32 deletions
diff --git a/guide/src/entity_fields.md b/guide/src/entity_fields.md
index f8fcf082..939ec084 100644
--- a/guide/src/entity_fields.md
+++ b/guide/src/entity_fields.md
@@ -106,6 +106,8 @@ guide.
Wikidata entities should be associated with at most a single `release`. In
the future it may be possible to associate Wikidata entities with `work`
entities instead. See the "External Identifiers" section of style guide.
+- `arxiv_id` (string) external identifier to a (version-specific) [arxiv.org][]
+ work
- `volume` (string): optionally, stores the specific volume of a serial
publication this release was published in.
type: string
@@ -262,11 +264,21 @@ Controlled vocabulary for `role` field on `contribs`:
Current "extra" fields, flags, and content:
- `crossref` (object), for extra crossref-specific metadata
-- `is_retracted` (boolean flag) if this work has been retracted
+ - `subject` (array of strings) for subject/category of content
+ - `type` (string) raw/original Crossref type
+ - `alternative-id` (array of strings)
+ - `archive` (array of strings), indicating preservation services deposited
+ - `funder` (object/dictionary)
+- `aliases` (array of strings) for additional titles this release might be
+ known by
+- `container_name` (string) if not matched to a container entity
+- `subtitle` (string)
+- `group-title` (string) for releases within a collection/group
+- `is_retracted` (boolean flag) if this work has been retracted (in addition to
+ `release_status` getting updated)
- `translation_of` (release identifier) if this release is a translation of
another (usually under the same work)
-- `arxiv_id` (string) external identifier to a (version-specific) [arxiv.org]()
- work
+
[arxiv.org]: https://arxiv.org
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 4a0322e7..fbf30a32 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -152,8 +152,8 @@ class CrossrefImporter(EntityImporter):
elif am.get('family'):
raw_name = am['family']
else:
- # TODO: defaults back to a pseudo-null value
- raw_name = am.get('given', '<blank>')
+ # TODO: can end up empty
+ raw_name = am.get('given')
extra = dict()
if ctype == "author":
index = i
@@ -244,9 +244,7 @@ class CrossrefImporter(EntityImporter):
'collection_title', 'chapter_number'):
if clean(rm.get(k)):
extra[k] = clean(rm[k])
- if extra:
- extra = dict(crossref=extra)
- else:
+ if not extra:
extra = None
refs.append(fatcat_client.ReleaseRef(
index=i,
@@ -269,20 +267,31 @@ class CrossrefImporter(EntityImporter):
# extra fields
extra = dict()
- for key in ('subject', 'type', 'alternative-id', 'container-title',
- 'subtitle', 'archive', 'funder', 'group-title'):
- # TODO: unpack "container-title" array?
+ extra_crossref = dict()
+ # top-level extra keys
+ if not container_id:
+ if obj.get('container-title'):
+ extra['container_name'] = clean(obj['container-title'][0])
+ for key in ('group-title', 'subtitle'):
val = obj.get(key)
if val:
if type(val) == str:
extra[key] = clean(val)
else:
extra[key] = val
+ # crossref-nested extra keys
+ for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
+ val = obj.get(key)
+ if val:
+ if type(val) == str:
+ extra_crossref[key] = clean(val)
+ else:
+ extra_crossref[key] = val
if license_extra:
- extra['license'] = license_extra
+ extra_crossref['license'] = license_extra
if len(obj['title']) > 1:
- extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
+ extra['aliases'] = [clean(t) for t in obj['title'][1:]]
# ISBN
isbn13 = None
@@ -325,11 +334,18 @@ class CrossrefImporter(EntityImporter):
release_year = raw_date[0]
release_date = None
+
original_title = None
if obj.get('original-title'):
original_title = clean(obj.get('original-title')[0], force_xml=True)
if obj.get('title'):
title = clean(obj.get('title')[0], force_xml=True)
+
+ if extra_crossref:
+ extra['crossref'] = extra_crossref
+ if not extra:
+ extra = None
+
re = fatcat_client.ReleaseEntity(
work_id=None,
container_id=container_id,
@@ -353,7 +369,7 @@ class CrossrefImporter(EntityImporter):
pages=clean(obj.get('page')),
language=None, # crossref doesn't supply language info
license_slug=license_slug,
- extra=dict(crossref=extra),
+ extra=extra,
abstracts=abstracts,
contribs=contribs,
refs=refs,
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 4e24b281..c81bf478 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -154,9 +154,9 @@ This release citing other releases.
{% if ref.container_name %}{{ ref.container_name }}.{% endif %}
{% if ref.year %}{{ ref.year }}{% endif %}
{% if ref.locator %}{{ ref.locator }}{% endif %}
- {% elif ref.extra and ref.extra.crossref %}
- {% if ref.extra.crossref.get('author') %}{{ ref.extra.crossref['author'] }}.{% endif %}
- {% if ref.extra.crossref.get('article-title') %}{{ ref.extra.crossref['article-title'] }}.{% endif %}
+ {% elif ref.extra %}
+ {% if ref.extra.get('author') %}{{ ref.extra['author'] }}.{% endif %}
+ {% if ref.extra.get('article-title') %}{{ ref.extra['article-title'] }}.{% endif %}
{% if ref.container_name %}{{ ref.container_name }}.{% endif %}
{% if ref.year %}{{ ref.year }}.{% endif %}
{% elif ref.extra and ref.extra.unstructured %}
diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json
index e3d2e05c..f00c2142 100644
--- a/python/tests/files/crossref-works.single.json
+++ b/python/tests/files/crossref-works.single.json
@@ -36,6 +36,10 @@
},
"delay-in-days": 6452,
"content-version": "tdm"
+ },
+ {
+ "URL": "http://creativecommons.org/licenses/by-nc-nd/3.0/",
+ "content-version": "vor"
}
],
"content-domain": {
@@ -71,7 +75,11 @@
"source": "Crossref",
"is-referenced-by-count": 5,
"title": [
- "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
+ "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators",
+ "some other title"
+ ],
+ "original-title": [
+ "Renormalized perturbation theory auf deutsch"
],
"prefix": "10.1002",
"volume": "66",
@@ -79,7 +87,8 @@
{
"given": "Marcelo D.",
"family": "Radicioni",
- "affiliation": []
+ "affiliation": [],
+ "sequence": "first"
},
{
"given": "Carlos G.",
@@ -90,6 +99,11 @@
"given": "Francisco M.",
"family": "Fern�ndez",
"affiliation": []
+ },
+ {
+ "given": "",
+ "family": "",
+ "affiliation": []
}
],
"editor": [
@@ -108,6 +122,7 @@
],
"member": "311",
"ISBN": ["85-359-0277-5", "978-3-16-148410-0"],
+ "archive": ["Portico", "LOCKSS"],
"reference": [
{
"key": "10.1002/(SICI)1097-461X(1998)66:4<261::AID-QUA1>3.0.CO;2-T-BIB1",
@@ -115,6 +130,7 @@
"volume": "57",
"first-page": "1734",
"year": "1972",
+ "medium": "DVD",
"journal-title": "J. Chem. Phys.",
"DOI": "10.1063/1.1678462",
"doi-asserted-by": "crossref"
@@ -131,11 +147,11 @@
},
{
"key": "10.1002/(SICI)1097-461X(1998)66:4<261::AID-QUA1>3.0.CO;2-T-BIB3",
- "author": "Fernández",
+ "author": "",
"volume": "43",
"year": "1987",
"unstructured": "and Hypervirial Theorems, Lecture Notes in Chemistry, Vol. 43, (Springer, Berlin, 1987).",
- "volume-title": "Hypervirial Theorems, Lecture Notes in Chemistry",
+ "volume-title": "Hypervirial Theorem&#x2019;s, Lecture Notes in Chemistry &lt;3",
"DOI": "10.1007/978-3-642-93349-3",
"doi-asserted-by": "crossref"
},
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 6e7f72c5..cb2143b3 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -70,34 +70,47 @@ def test_crossref_dict_parse(crossref_importer):
# not a single line
raw = json.loads(f.read())
r = crossref_importer.parse_record(raw)
- extra = r.extra['crossref']
+ # ensure the API server is ok with format
+ JsonLinePusher(crossref_importer, [json.dumps(raw)]).run()
+
+ print(r.extra)
assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t"
assert r.publisher == "Wiley-Blackwell"
- print(extra)
- assert extra['container-title'] == ["International Journal of Quantum Chemistry"]
assert r.release_type == "article-journal"
assert r.release_status == "published"
+ assert r.license_slug == "CC-BY-NC-ND"
+ assert r.original_title == "Renormalized perturbation theory auf deutsch"
assert r.isbn13 == "978-3-16-148410-0"
- assert 'subtitle' not in extra
- assert 'archive' not in extra
- assert 'funder' not in extra
- assert len(r.contribs) == 5
+ assert 'subtitle' not in r.extra
+ assert 'subtitle' not in r.extra['crossref']
+ assert 'funder' not in r.extra
+ assert 'funder' not in r.extra['crossref']
+ # matched by ISSN, so shouldn't be in there
+ #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+ assert r.extra['aliases'] == ["some other title"]
+ assert r.extra['crossref']['archive'] == ['Portico', 'LOCKSS']
+ assert len(r.contribs) == 6
assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
assert r.contribs[0].index == 0
+ assert r.contribs[0].extra['seq'] == "first"
assert r.contribs[1].raw_affiliation == "Some University"
assert r.contribs[1].extra['more_affiliations'] == ["Some Department"]
assert r.contribs[1].role == "author"
- assert r.contribs[3].role == "editor"
- assert r.contribs[3].index is None
- assert r.contribs[4].role == "translator"
+ assert r.contribs[4].role == "editor"
assert r.contribs[4].index is None
+ assert r.contribs[4].extra is None
+ assert r.contribs[5].role == "translator"
+ assert r.contribs[5].index is None
assert len(r.refs) == 25
assert r.refs[0].key == "BIB1"
assert r.refs[0].year == 1972
assert r.refs[0].locator == "1734"
assert r.refs[0].container_name == "J. Chem. Phys."
- assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"}
+ assert r.refs[0].extra == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462", "medium": "DVD"}
+ assert r.refs[2].key == 'BIB3'
+ assert r.refs[2].extra.get('author') is None
+ assert r.refs[2].container_name == "Hypervirial Theorem's, Lecture Notes in Chemistry <3"
assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry"
def test_stateful_checking(crossref_importer_existing):