From 403b1a2d4591d878145a021a7c1e15e2d60c47d8 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 18 Dec 2019 20:21:49 +0100 Subject: improve datacite field mapping and import The current version succeeded in importing a random sample of 100000 records (0.5%) from datacite. The --debug (write JSON to stdout) and --insert-log-file (log batch before committing to db) flags are temporarily added to help with debugging. Add a few unit tests. Some edge cases: a) Existing keys without a value require a slightly awkward idiom: ``` titles = attributes.get('titles', []) or [] ``` b) There can be 0, 1, or more (first one wins) titles. c) Date handling is probably not ideal. Datacite has a potentially fine-grained list of dates. The test case (tests/files/datacite_sample.jsonl) refers to https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has the date (main descriptor) 1986. The datacite record contains: 2017 (publicationYear, probably the year of record creation with reference system), 1978-06-03 (collected, e.g. experimental sample), 1986 ("Accepted"). The online version of the resource even knows one more date (2019-06-05 10:14:43 by WIEWS update). --- python/tests/import_datacite.py | 108 +++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 17 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 0bbaba2e..9c542fc6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -1,25 +1,99 @@ """ Test datacite importer. +""" -Datacite is a aggregator, hence inputs are quite varied. 
+import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fixtures import api +import json -Here is small sample of ID types taken from a sample: - 497344 "DOI" - 65013 "URL" - 22210 "CCDC" - 17853 "GBIF" - 17635 "Other" - 11474 "uri" - 9170 "Publisher ID" - 7775 "URN" - 6196 "DUCHAS" - 5624 "Handle" - 5056 "publisherId" +@pytest.fixture(scope="function") +def datacite_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=True) -A nice tool, not yet existing tool (maybe named indigo) would do the following: +@pytest.fixture(scope="function") +def datacite_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=False) - $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md -TODO(martin): Write tests. 
-""" +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 3 + + +def test_datacite_importer(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "datacite" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = False + datacite_importer.reset() + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): + with open('tests/files/datacite_sample.jsonl', 'r') as f: + raw = json.load(f) + r = datacite_importer.parse_record(raw) + # ensure the API server is ok with format + JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + + print(r.extra) + assert r.title == "Triticum 
turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert r.release_type == "article" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.ext_ids.doi == "10.18730/8dym9" + assert r.ext_ids.isbn13 == None + assert r.language == "enc" + assert r.subtitle == None + assert r.release_date == None + assert r.release_year == 1986 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['datacite'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['datacite'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' + assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + assert len(r.abstracts) == 1 + assert len(r.abstracts[0].content) == 421 + assert len(r.contribs) == 1 + assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" + assert r.contribs[0].given_name == None + assert r.contribs[0].surname == None + assert len(r.refs) == 0 -- cgit v1.2.3