From a85de0b6e82c65d932c84a41b284083b4879934a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 24 Jan 2019 16:16:22 -0800 Subject: tweak crossref import, and update tests --- guide/src/entity_fields.md | 18 ++++++++++--- python/fatcat_tools/importers/crossref.py | 38 +++++++++++++++++++-------- python/fatcat_web/templates/release_view.html | 6 ++--- python/tests/files/crossref-works.single.json | 24 ++++++++++++++--- python/tests/import_crossref.py | 35 ++++++++++++++++-------- 5 files changed, 89 insertions(+), 32 deletions(-) diff --git a/guide/src/entity_fields.md b/guide/src/entity_fields.md index f8fcf082..939ec084 100644 --- a/guide/src/entity_fields.md +++ b/guide/src/entity_fields.md @@ -106,6 +106,8 @@ guide. Wikidata entities should be associated with at most a single `release`. In the future it may be possible to associate Wikidata entities with `work` entities instead. See the "External Identifiers" section of style guide. +- `arxiv_id` (string) external identifier to a (version-specific) [arxiv.org]() + work - `volume` (string): optionally, stores the specific volume of a serial publication this release was published in. 
type: string @@ -262,11 +264,21 @@ Controlled vocabulary for `role` field on `contribs`: Current "extra" fields, flags, and content: - `crossref` (object), for extra crossref-specific metadata -- `is_retracted` (boolean flag) if this work has been retracted + - `subject` (array of strings) for subject/category of content + - `type` (string) raw/original Crossref type + - `alternative-id` (array of strings) + - `archive` (array of strings), indicating preservation services deposited + - `funder` (object/dictionary) +- `aliases` (array of strings) for additional titles this release might be + known by +- `container_name` (string) if not matched to a container entity +- `subtitle` (string) +- `group-title` (string) for releases within a collection/group +- `is_retracted` (boolean flag) if this work has been retracted (in addition to + `release_status` getting updated) - `translation_of` (release identifier) if this release is a translation of another (usually under the same work) -- `arxiv_id` (string) external identifier to a (version-specific) [arxiv.org]() - work + [arxiv.org]: https://arxiv.org diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 4a0322e7..fbf30a32 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -152,8 +152,8 @@ class CrossrefImporter(EntityImporter): elif am.get('family'): raw_name = am['family'] else: - # TODO: defaults back to a pseudo-null value - raw_name = am.get('given', '') + # TODO: can end up empty + raw_name = am.get('given') extra = dict() if ctype == "author": index = i @@ -244,9 +244,7 @@ class CrossrefImporter(EntityImporter): 'collection_title', 'chapter_number'): if clean(rm.get(k)): extra[k] = clean(rm[k]) - if extra: - extra = dict(crossref=extra) - else: + if not extra: extra = None refs.append(fatcat_client.ReleaseRef( index=i, @@ -269,20 +267,31 @@ class CrossrefImporter(EntityImporter): # extra fields extra = dict() 
- for key in ('subject', 'type', 'alternative-id', 'container-title', - 'subtitle', 'archive', 'funder', 'group-title'): - # TODO: unpack "container-title" array? + extra_crossref = dict() + # top-level extra keys + if not container_id: + if obj.get('container-title'): + extra['container_name'] = clean(obj['container-title'][0]) + for key in ('group-title', 'subtitle'): val = obj.get(key) if val: if type(val) == str: extra[key] = clean(val) else: extra[key] = val + # crossref-nested extra keys + for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): + val = obj.get(key) + if val: + if type(val) == str: + extra_crossref[key] = clean(val) + else: + extra_crossref[key] = val if license_extra: - extra['license'] = license_extra + extra_crossref['license'] = license_extra if len(obj['title']) > 1: - extra['other-titles'] = [clean(t) for t in obj['title'][1:]] + extra['aliases'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -325,11 +334,18 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None + original_title = None if obj.get('original-title'): original_title = clean(obj.get('original-title')[0], force_xml=True) if obj.get('title'): title = clean(obj.get('title')[0], force_xml=True) + + if extra_crossref: + extra['crossref'] = extra_crossref + if not extra: + extra = None + re = fatcat_client.ReleaseEntity( work_id=None, container_id=container_id, @@ -353,7 +369,7 @@ class CrossrefImporter(EntityImporter): pages=clean(obj.get('page')), language=None, # crossref doesn't supply language info license_slug=license_slug, - extra=dict(crossref=extra), + extra=extra, abstracts=abstracts, contribs=contribs, refs=refs, diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 4e24b281..c81bf478 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -154,9 +154,9 @@ This release citing other 
releases. {% if ref.container_name %}{{ ref.container_name }}.{% endif %} {% if ref.year %}{{ ref.year }}{% endif %} {% if ref.locator %}{{ ref.locator }}{% endif %} - {% elif ref.extra and ref.extra.crossref %} - {% if ref.extra.crossref.get('author') %}{{ ref.extra.crossref['author'] }}.{% endif %} - {% if ref.extra.crossref.get('article-title') %}{{ ref.extra.crossref['article-title'] }}.{% endif %} + {% elif ref.extra %} + {% if ref.extra.get('author') %}{{ ref.extra['author'] }}.{% endif %} + {% if ref.extra.get('article-title') %}{{ ref.extra['article-title'] }}.{% endif %} {% if ref.container_name %}{{ ref.container_name }}.{% endif %} {% if ref.year %}{{ ref.year }}.{% endif %} {% elif ref.extra and ref.extra.unstructured %} diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json index e3d2e05c..f00c2142 100644 --- a/python/tests/files/crossref-works.single.json +++ b/python/tests/files/crossref-works.single.json @@ -36,6 +36,10 @@ }, "delay-in-days": 6452, "content-version": "tdm" + }, + { + "URL": "http://creativecommons.org/licenses/by-nc-nd/3.0/", + "content-version": "vor" } ], "content-domain": { @@ -71,7 +75,11 @@ "source": "Crossref", "is-referenced-by-count": 5, "title": [ - "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" + "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators", + "some other title" + ], + "original-title": [ + "Renormalized perturbation theory auf deutsch" ], "prefix": "10.1002", "volume": "66", @@ -79,7 +87,8 @@ { "given": "Marcelo D.", "family": "Radicioni", - "affiliation": [] + "affiliation": [], + "sequence": "first" }, { "given": "Carlos G.", @@ -90,6 +99,11 @@ "given": "Francisco M.", "family": "Fern�ndez", "affiliation": [] + }, + { + "given": "", + "family": "", + "affiliation": [] } ], "editor": [ @@ -108,6 +122,7 @@ ], "member": "311", "ISBN": ["85-359-0277-5", 
"978-3-16-148410-0"], + "archive": ["Portico", "LOCKSS"], "reference": [ { "key": "10.1002/(SICI)1097-461X(1998)66:4<261::AID-QUA1>3.0.CO;2-T-BIB1", @@ -115,6 +130,7 @@ "volume": "57", "first-page": "1734", "year": "1972", + "medium": "DVD", "journal-title": "J. Chem. Phys.", "DOI": "10.1063/1.1678462", "doi-asserted-by": "crossref" @@ -131,11 +147,11 @@ }, { "key": "10.1002/(SICI)1097-461X(1998)66:4<261::AID-QUA1>3.0.CO;2-T-BIB3", - "author": "Fernández", + "author": "", "volume": "43", "year": "1987", "unstructured": "and Hypervirial Theorems, Lecture Notes in Chemistry, Vol. 43, (Springer, Berlin, 1987).", - "volume-title": "Hypervirial Theorems, Lecture Notes in Chemistry", + "volume-title": "Hypervirial Theorem’s, Lecture Notes in Chemistry <3", "DOI": "10.1007/978-3-642-93349-3", "doi-asserted-by": "crossref" }, diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 6e7f72c5..cb2143b3 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -70,34 +70,47 @@ def test_crossref_dict_parse(crossref_importer): # not a single line raw = json.loads(f.read()) r = crossref_importer.parse_record(raw) - extra = r.extra['crossref'] + # ensure the API server is ok with format + JsonLinePusher(crossref_importer, [json.dumps(raw)]).run() + + print(r.extra) assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" assert r.publisher == "Wiley-Blackwell" - print(extra) - assert extra['container-title'] == ["International Journal of Quantum Chemistry"] assert r.release_type == "article-journal" assert r.release_status == "published" + assert r.license_slug == "CC-BY-NC-ND" + assert r.original_title == "Renormalized perturbation theory auf deutsch" assert r.isbn13 == "978-3-16-148410-0" - assert 'subtitle' not in extra - assert 'archive' not in extra - assert 'funder' not in extra - 
assert len(r.contribs) == 5 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['crossref'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['crossref'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['aliases'] == ["some other title"] + assert r.extra['crossref']['archive'] == ['Portico', 'LOCKSS'] + assert len(r.contribs) == 6 assert r.contribs[0].raw_name == "Marcelo D. Radicioni" assert r.contribs[0].index == 0 + assert r.contribs[0].extra['seq'] == "first" assert r.contribs[1].raw_affiliation == "Some University" assert r.contribs[1].extra['more_affiliations'] == ["Some Department"] assert r.contribs[1].role == "author" - assert r.contribs[3].role == "editor" - assert r.contribs[3].index is None - assert r.contribs[4].role == "translator" + assert r.contribs[4].role == "editor" assert r.contribs[4].index is None + assert r.contribs[4].extra is None + assert r.contribs[5].role == "translator" + assert r.contribs[5].index is None assert len(r.refs) == 25 assert r.refs[0].key == "BIB1" assert r.refs[0].year == 1972 assert r.refs[0].locator == "1734" assert r.refs[0].container_name == "J. Chem. Phys." - assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"} + assert r.refs[0].extra == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462", "medium": "DVD"} + assert r.refs[2].key == 'BIB3' + assert r.refs[2].extra.get('author') is None + assert r.refs[2].container_name == "Hypervirial Theorem's, Lecture Notes in Chemistry <3" assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry" def test_stateful_checking(crossref_importer_existing): -- cgit v1.2.3