From 4a82a0763bf927248f22e47ab5187af4beff83ee Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 9 Dec 2019 01:03:43 +0100 Subject: datacite: importer skeleton * contributors, title, date, publisher, container, license Field and value analysis via https://github.com/miku/indigo. --- python/tests/import_datacite.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 python/tests/import_datacite.py (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py new file mode 100644 index 00000000..0bbaba2e --- /dev/null +++ b/python/tests/import_datacite.py @@ -0,0 +1,25 @@ +""" +Test datacite importer. + +Datacite is a aggregator, hence inputs are quite varied. + +Here is small sample of ID types taken from a sample: + + 497344 "DOI" + 65013 "URL" + 22210 "CCDC" + 17853 "GBIF" + 17635 "Other" + 11474 "uri" + 9170 "Publisher ID" + 7775 "URN" + 6196 "DUCHAS" + 5624 "Handle" + 5056 "publisherId" + +A nice tool, not yet existing tool (maybe named indigo) would do the following: + + $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md + +TODO(martin): Write tests. +""" -- cgit v1.2.3 From 403b1a2d4591d878145a021a7c1e15e2d60c47d8 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 18 Dec 2019 20:21:49 +0100 Subject: improve datacite field mapping and import The current version succeeded in importing a random sample of 100000 records (0.5%) from datacite. The --debug (write JSON to stdout) and --insert-log-file (log batch before committing to db) flags are temporarily added to help debugging. Add a few unit tests. Some edge cases: a) Existing keys without a value require a slightly awkward: ``` titles = attributes.get('titles', []) or [] ``` b) There can be 0, 1, or more titles (the first one wins). c) Date handling is probably not ideal. Datacite has a potentially fine-grained list of dates. The test case (tests/files/datacite_sample.jsonl) refers to https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has the date (main descriptor) 1986. The datacite record contains: 2017 (publicationYear, probably the year of record creation in the reference system), 1978-06-03 (collected, e.g. an experimental sample), 1986 ("Accepted"). The online version of the resource lists one more date (2019-06-05 10:14:43, by WIEWS update).
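For edge case (a), a small illustrative snippet (not part of the patch itself) of why the plain `.get()` default is not enough when a key is present but set to null in the source JSON:

```
attributes = {'titles': None}  # key exists, but the value is null/None

# .get() only falls back to the default when the key is missing entirely,
# so this still yields None and would break iteration:
assert attributes.get('titles', []) is None

# coercing falsy values back to an empty list avoids that:
titles = attributes.get('titles', []) or []
assert titles == []
```

The same `or []` guard is used below for dates, descriptions and relatedIdentifiers.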
--- python/fatcat_import.py | 15 +- python/fatcat_tools/importers/datacite.py | 180 ++++++++++++++++++------ python/tests/files/datacite_1k_records.jsonl.gz | Bin 0 -> 684605 bytes python/tests/files/datacite_sample.jsonl | 1 + python/tests/import_datacite.py | 108 +++++++++++--- 5 files changed, 245 insertions(+), 59 deletions(-) create mode 100644 python/tests/files/datacite_1k_records.jsonl.gz create mode 100644 python/tests/files/datacite_sample.jsonl (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index d7651792..90bb01a1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -170,7 +170,10 @@ def run_datacite(args): dci = DataciteImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + debug=args.debug, + lang_detect=args.lang_detect, + insert_log_file=args.insert_log_file) if args.kafka_mode: KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", "fatcat-import", consume_batch_size=args.batch_size).run() @@ -464,6 +467,16 @@ def main(): sub_datacite.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") + sub_datacite.add_argument('--debug', + action='store_true', + help="write converted JSON to stdout") + sub_datacite.add_argument('--lang-detect', + action='store_true', + help="try to detect language (slow)") + sub_datacite.add_argument('--insert-log-file', + default='', + type=str, + help="write inserted documents into file (for debugging)") sub_datacite.set_defaults( func=run_datacite, auth_var="FATCAT_API_AUTH_TOKEN", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4e117dde..9774e334 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8 from .common import EntityImporter import dateparser -import langcodes import datetime -import langdetect import fatcat_openapi_client +import hashlib import json +import langcodes +import langdetect +import sqlite3 import sys -import hashlib # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { @@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = { class DataciteImporter(EntityImporter): """ - Importer for datacite records. TODO(martin): Do we need issn_map_file? + Importer for datacite records. 
""" - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api, issn_map_file, debug=False, lang_detect=False, + insert_log_file=None, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of Datacite DOI metadata, harvested from REST API") @@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter): **kwargs) self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.lang_detect = lang_detect + self.insert_log_file = insert_log_file + + print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers refering to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def parse_record(self, obj): """ @@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter): attributes = obj['attributes'] - # Contributors. Many nameIdentifierSchemes, we do not use yet: - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ - # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS", - # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ], + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. contribs = [] for i, c in enumerate(attributes['creators']): - if not c.get('nameType') == 'Personal': + if 'nameType' in c and not c.get('nameType') == 'Personal': continue creator_id = None for nid in c.get('nameIdentifiers', []): @@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter): if not orcid: continue creator_id = self.lookup_orcid(orcid) - # If creator_id is None, should we create creators? + # TODO(martin): If creator_id is None, should we create creators? contribs.append(fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, @@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter): # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" title, subtitle = None, None - for entry in attributes.get('titles', []): - if not title and 'titleType' not in entry: - title = entry.get('title').strip() - if entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title').strip() + titles = attributes.get('titles', []) or [] + if len(titles) == 0: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + elif len(titles) == 1: + # We do not care about the type then. 
+ title = titles[0].get('title', '') or '' + title = title.strip() + else: + for entry in titles: + if not title and ('titleType' not in entry or not entry.get('titleType')): + title = entry.get('title').strip() + if entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + + if not title: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in @@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter): # "Updated", "Valid". release_year, release_date = None, None + # Ignore: Collected, Issued. date_type_prio = ( 'Valid', - 'Issued', 'Available', 'Accepted', 'Submitted', 'Copyrighted', - 'Collected', 'Created', 'Updated', ) @@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter): for item in dates: if not item.get('dateType') == prio: continue - try: - result = dateparser.parse(item.get('date')) - except TypeError as err: - print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr) - continue + + # Parse out date, use common patterns first, fallback to dateparser. + result, value, year_only = None, item.get('date', ''), False + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S') + + for pattern in common_patterns: + try: + result = datetime.datetime.strptime(value, pattern) + except ValueError: + continue + else: + if pattern == '%Y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + try: + result = dateparser.parse(value) + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + continue + if result is None: # Unparsable date. continue - release_date = result + if not year_only: + release_date = result.date() release_year = result.year if 1000 < release_year < datetime.date.today().year + 5: # Skip possibly bogus dates. @@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter): container_id = self.lookup_issnl(issnl) if container_id is None and container.get('title'): + container_title = container.get('title') + if isinstance(container_title, list): + if len(container_title) > 0: + print('too many container titles: {}'.format(len(container_title))) + container_title = container_title[0] + assert isinstance(container_title, str) ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, container_type=container_type, - name=container.get('title'), + name=container_title, ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter): # closest, but not always supplied. for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'): value = attributes.get('types', {}).get(typeType) - release_type = DATACITE_TYPE_MAP.get(value) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break if release_type is None: - print("datacite unmapped type: {}".format(release_type), file=sys.stderr) + print("no mapped type: {}".format(value), file=sys.stderr) # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). 
Try to crush it with langcodes: "It may sound to you @@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter): try: language = langcodes.get(value).language except langcodes.tag_parser.LanguageTagError: - print('could not determine language: {}'.format(value), file=sys.stderr) + pass # Abstracts appear in "attributes.descriptions[].descriptionType", some # of the observed values: "Methods", "TechnicalInfo", @@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - - for desc in attributes.get('descriptions', []): + descs = attributes.get('descriptions', []) or [] + for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue if len(desc.get('description', '')) < 10: @@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter): text = desc.get('description') sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest() lang = None - try: - lang = langdetect.detect(text) - except langdetect.lang_detect_exception.LangDetectException: - pass + if self.lang_detect: + try: + lang = langdetect.detect(text) + except langdetect.lang_detect_exception.LangDetectException as err: + print('language detection failed: {}'.format(err), file=sys.stderr) abstracts.append(fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", content=text, @@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. refs, ref_index = [], 0 - for rel in attributes.get('relatedIdentifiers', []): + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: if not rel.get('relationType') == 'References': continue ref_extra = dict() @@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter): if extra_datacite: extra['datacite'] = extra_datacite + doi = attributes.get('doi', '').lower() + extids = self.lookup_ext_ids(doi=doi) + # Assemble release. re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter): release_date=release_date, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=attributes.get('doi'), + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], ), contribs=contribs, volume=volume, @@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter): ) return re - def try_update(self, re, debug=True): + def try_update(self, re): """ - When debug is true, write the RE to stdout. + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. 
""" - if debug is True: + if self.debug is True: print(json.dumps(re.to_dict(), default=extended_json_encoder)) return False @@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter): return True def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(doc.to_dict(), f, default=extended_json_encoder) + f.write('\n') self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), + description=self.editgroup_description, + extra=self.editgroup_extra), entity_list=batch)) def extended_json_encoder(value): @@ -491,6 +588,7 @@ def extended_json_encoder(value): return value.isoformat() if isinstance(value, set): return list(value) + raise TypeError('cannot encode type: {}'.format(type(value))) def lookup_license_slug(raw): """ diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz new file mode 100644 index 00000000..28ea6e37 Binary files /dev/null and b/python/tests/files/datacite_1k_records.jsonl.gz differ diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl new file mode 100644 index 00000000..dba3e267 --- /dev/null +++ b/python/tests/files/datacite_sample.jsonl @@ -0,0 +1 @@ +{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.
Taxonomy: Triticum turgidum L. subsp. durum (Desf.) Husn.
Common name(s): Wheat
Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon
Local sample unique identifier: 97090
Method of creation: Acquisition
Date: 1986
Biological status: Traditional cultivar/landrace
Other identifiers: ICDW 20791
MLS status: Included
Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 0bbaba2e..9c542fc6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -1,25 +1,99 @@ """ Test datacite importer. +""" -Datacite is a aggregator, hence inputs are quite varied. +import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fixtures import api +import json -Here is small sample of ID types taken from a sample: - 497344 "DOI" - 65013 "URL" - 22210 "CCDC" - 17853 "GBIF" - 17635 "Other" - 11474 "uri" - 9170 "Publisher ID" - 7775 "URN" - 6196 "DUCHAS" - 5624 "Handle" - 5056 "publisherId" +@pytest.fixture(scope="function") +def datacite_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=True) -A nice tool, not yet existing tool (maybe named indigo) would do the following: +@pytest.fixture(scope="function") +def datacite_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=False) - $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md -TODO(martin): Write tests. 
-""" +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 3 + + +def test_datacite_importer(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "datacite" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = False + datacite_importer.reset() + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): + with open('tests/files/datacite_sample.jsonl', 'r') as f: + raw = json.load(f) + r = datacite_importer.parse_record(raw) + # ensure the API server is ok with format + JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + + print(r.extra) + assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert r.release_type == "article" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090" + assert r.ext_ids.doi == "10.18730/8dym9" + assert r.ext_ids.isbn13 == None + assert r.language == "enc" + assert r.subtitle == None + assert r.release_date == None + assert r.release_year == 1986 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['datacite'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['datacite'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' + assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + assert len(r.abstracts) == 1 + assert len(r.abstracts[0].content) == 421 + assert len(r.contribs) == 1 + assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" + assert r.contribs[0].given_name == None + assert r.contribs[0].surname == None + assert len(r.refs) == 0 -- cgit v1.2.3 From a196435a0e88f85785742cdd089344f97401b43a Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 21 Dec 2019 23:30:56 +0100 Subject: address first round of MR14 comments * add missing langdetect * use entity_to_dict for json debug output * factor out code for fields in function and add table driven tests * update citeproc types * add author as default role * add raw_affiliation * include relations from datacite * remove url (covered by doi already) Using yapf for python formatting. --- python/Pipfile | 1 + python/Pipfile.lock | 7 + python/fatcat_tools/importers/datacite.py | 467 ++++++++++++++++++++---------- python/tests/import_datacite.py | 178 +++++++++++- 4 files changed, 503 insertions(+), 150 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/Pipfile b/python/Pipfile index dfb87514..6325c180 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -49,6 +49,7 @@ elasticsearch-dsl = ">=6.0.0,<7.0.0" elasticsearch = ">=6.0.0,<7.0.0" langcodes = ">=1.4" dateparser = ">=0.7" +langdetect = "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index b6e066b5..f0f60aa8 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -306,6 +306,13 @@ "index": "pypi", "version": "==1.4.1" }, + "langdetect": { + "hashes": [ + "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30" + ], + "index": "pypi", + "version": "==1.0.7" + }, "loginpass": { "hashes": [ "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 77ce1012..19b89edf 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -14,6 +14,7 @@ import langcodes import langdetect import sqlite3 import sys +from fatcat_tools.transforms import entity_to_dict # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { @@ -55,16 +56,42 @@ DATACITE_TYPE_MAP = { 'Thesis': 'thesis', }, 'citeproc': { - 'dataset': 'dataset', - 'chapter': 'chapter', - 'article-journal': 'article-journal', - 'song': 'song', 'article': 'article', - 'report': 'report', + 'article-journal': 'article-journal', + 'article-magazine': 'article-magazine', + 'article-newspaper': 'article-newspaper', + 'bill': 'bill', + 'book': 'book', + 'broadcast': 'broadcast', + 'chapter': 'chapter', + 'dataset': 'dataset', + 'entry-dictionary': 'entry-dictionary', + 'entry-encyclopedia': 
'entry-encyclopedia', + 'entry': 'entry', + 'figure': 'figure', 'graphic': 'graphic', + 'interview': 'interview', + 'legal_case': 'legal_case', + 'legislation': 'legislation', + 'manuscript': 'manuscript', + 'map': 'map', + 'motion_picture': 'motion_picture', + 'musical_score': 'musical_score', + 'pamphlet': 'pamphlet', + 'paper-conference': 'paper-conference', + 'patent': 'patent', + 'personal_communication': 'personal_communication', + 'post': 'post', + 'post-weblog': 'post-weblog', + 'report': 'report', + 'review-book': 'review-book', + 'review': 'review', + 'song': 'song', + 'speech': 'speech', 'thesis': 'thesis', - 'book': 'book', - }, + 'treaty': 'treaty', + 'webpage': 'webpage', + }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types 'bibtex': { 'phdthesis': 'thesis', 'inbook': 'chapter', @@ -88,7 +115,6 @@ DATACITE_TYPE_MAP = { } } - # TODO(martin): merge this with other maps, maybe. LICENSE_SLUG_MAP = { "//creativecommons.org/licenses/by/2.0/": "CC-BY", @@ -124,7 +150,8 @@ LICENSE_SLUG_MAP = { "//www.karger.com/Services/SiteLicenses": "KARGER", "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0", "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause", - "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2 + "//www.opensource.org/licenses/EUPL-1.1": + "EUPL-1.1", # redirects to EUPL-1.2 "//www.opensource.org/licenses/MIT": "MIT", # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/ # "http://rsc.li/journals-terms-of-use": "RSC", @@ -146,23 +173,31 @@ LICENSE_SLUG_MAP = { # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice). } + class DataciteImporter(EntityImporter): """ Importer for datacite records. """ - - def __init__(self, api, issn_map_file, debug=False, lang_detect=False, - insert_log_file=None, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of Datacite DOI metadata, harvested from REST API") + def __init__(self, + api, + issn_map_file, + debug=False, + lang_detect=False, + insert_log_file=None, + **kwargs): + + eg_desc = kwargs.get( + 'editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API" + ) eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter') + eg_extra['agent'] = eg_extra.get('agent', + 'fatcat_tools.DataciteImporter') super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) self.create_containers = kwargs.get('create_containers', True) extid_map_file = kwargs.get('extid_map_file') @@ -179,18 +214,31 @@ class DataciteImporter(EntityImporter): self.lang_detect = lang_detect self.insert_log_file = insert_log_file - print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr) + print('datacite with debug={}, lang_detect={}'.format( + self.debug, self.lang_detect), + file=sys.stderr) def lookup_ext_ids(self, doi): """ Return dictionary of identifiers refering to the same things as the given DOI. """ if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], @@ -206,6 +254,8 @@ class DataciteImporter(EntityImporter): """ Mapping datacite JSON to ReleaseEntity. """ + if not obj or not isinstance(obj, dict): + return None if 'attributes' not in obj: return None @@ -218,43 +268,54 @@ class DataciteImporter(EntityImporter): contribs = [] for i, c in enumerate(attributes['creators']): - if 'nameType' in c and not c.get('nameType') == 'Personal': - continue - creator_id = None - for nid in c.get('nameIdentifiers', []): - if not nid.get('nameIdentifierScheme').lower() == "orcid": + nameType = c.get('nameType', '') or '' + if nameType == 'Personal': + creator_id = None + for nid in c.get('nameIdentifiers', []): + if not nid.get('nameIdentifierScheme').lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', + '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # TODO(martin): If creator_id is None, should we create creators? + + # If there are multiple affiliation strings, use the first one. + affiliations = c.get('affiliation', []) or [] + raw_affiliation = None + if len(affiliations) == 0: + raw_affiliation = None + else: + raw_affiliation = affiliations[0] + + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=c.get('name'), + given_name=c.get('givenName'), + surname=c.get('familyName'), + role='author', + raw_affiliation=raw_affiliation, + )) + elif nameType == 'Organizational': + name = c.get('name', '') or '' + if name == 'NN': continue - orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') - if not orcid: + if len(name) < 3: continue - creator_id = self.lookup_orcid(orcid) - # TODO(martin): If creator_id is None, should we create creators? - contribs.append(fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=i, - raw_name=c.get('name'), - given_name=c.get('givenName'), - surname=c.get('familyName'), - )) + extra = {'organization': name} + contribs.append(fatcat_openapi_client.ReleaseContrib( + index=i, extra=extra)) + else: + print('unknown name type: {}'.format(nameType), file=sys.stderr) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" - title, subtitle = None, None - titles = attributes.get('titles', []) or [] - if len(titles) == 0: - print('skipping record w/o title: {}'.format(obj), file=sys.stderr) - return False - elif len(titles) == 1: - # We do not care about the type then. 
- title = titles[0].get('title', '') or '' - title = title.strip() - else: - for entry in titles: - if not title and ('titleType' not in entry or not entry.get('titleType')): - title = entry.get('title').strip() - if entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title', '').strip() + title, original_language_title, subtitle = parse_datacite_titles( + titles) if not title: print('skipping record w/o title: {}'.format(obj), file=sys.stderr) @@ -268,67 +329,14 @@ class DataciteImporter(EntityImporter): # "attributes.dates[].dateType", values: "Accepted", "Available" # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". - release_year, release_date = None, None - - # Ignore: Collected, Issued. - date_type_prio = ( - 'Valid', - 'Available', - 'Accepted', - 'Submitted', - 'Copyrighted', - 'Created', - 'Updated', - ) - - # Before using (expensive) dateparser, try a few common patterns. - common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y') - - for prio in date_type_prio: - dates = attributes.get('dates', []) or [] # Never be None. - for item in dates: - if not item.get('dateType') == prio: - continue - - # Parse out date, use common patterns first, fallback to dateparser. - result, value, year_only = None, item.get('date', ''), False - - for pattern in common_patterns: - try: - result = datetime.datetime.strptime(value, pattern) - except ValueError: - continue - else: - if pattern == '%Y': - year_only = True - break - - if result is None: - print('fallback for {}'.format(value), file=sys.stderr) - try: - result = dateparser.parse(value) - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) - continue - - if result is None: - # Unparsable date. - continue - if not year_only: - release_date = result.date() - release_year = result.year - if 1000 < release_year < datetime.date.today().year + 5: - # Skip possibly bogus dates. - continue - break - else: - continue - break + release_date, release_year = parse_datacite_dates( + attributes.get('dates', [])) # Publisher. A few NA values. A few bogus values. publisher = attributes.get('publisher') - if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'): + if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', + '(:none)'): publisher = None if publisher is not None and len(publisher) > 80: # Arbitrary magic value max length. TODO(martin): better heuristic, @@ -345,7 +353,8 @@ class DataciteImporter(EntityImporter): container = attributes.get('container', {}) or {} if container.get('type') in CONTAINER_TYPE_MAP.keys(): container_type = CONTAINER_TYPE_MAP.get(container['type']) - if container.get('identifier') and container.get('identifierType') == 'ISSN': + if container.get('identifier') and container.get( + 'identifierType') == 'ISSN': issn = container.get('identifier') if len(issn) == 8: issn = issn[:4] + "-" + issn[4:] @@ -357,7 +366,8 @@ class DataciteImporter(EntityImporter): container_title = container.get('title') if isinstance(container_title, list): if len(container_title) > 0: - print('too many container titles: {}'.format(len(container_title))) + print('too many container titles: {}'.format( + len(container_title))) container_title = container_title[0] assert isinstance(container_title, str) ce = fatcat_openapi_client.ContainerEntity( @@ -404,7 +414,8 @@ class DataciteImporter(EntityImporter): # types supplied in datacite. 
The "attributes.types.resourceType" # contains too many (176 in sample) things for now; citeproc may be the # closest, but not always supplied. - for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'): + for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', + 'bibtex', 'ris'): value = attributes.get('types', {}).get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: @@ -442,19 +453,19 @@ class DataciteImporter(EntityImporter): if len(desc.get('description', '')) < 10: continue text = desc.get('description') - sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest() lang = None if self.lang_detect: try: lang = langdetect.detect(text) except langdetect.lang_detect_exception.LangDetectException as err: - print('language detection failed: {}'.format(err), file=sys.stderr) - abstracts.append(fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", - content=text, - sha1=sha1, - lang=lang, - )) + print('language detection failed: {}'.format(err), + file=sys.stderr) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=text, + lang=lang, + )) # References and relations. Datacite include many relation types in # "attributes.relatedIdentifiers[].relationType", e.g. @@ -476,17 +487,19 @@ class DataciteImporter(EntityImporter): ref_extra['doi'] = rel.get('relatedIdentifier') if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - index=ref_index, - extra=ref_extra, - )) + refs.append( + fatcat_openapi_client.ReleaseRef( + index=ref_index, + extra=ref_extra, + )) ref_index += 1 # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". release_stage = None - if attributes.get('state') == 'findable' or attributes.get('isActive') is True: + if attributes.get( + 'state') == 'findable' or attributes.get('isActive') is True: release_stage = 'published' # Extra information. @@ -496,8 +509,22 @@ class DataciteImporter(EntityImporter): extra_datacite['license'] = license_extra if attributes.get('subjects'): extra_datacite['subjects'] = attributes['subjects'] - if attributes.get('url'): - extra_datacite['url'] = attributes['url'] + + # Include certain relations from relatedIdentifiers. Keeping the + # original structure of data here, which is a list of dicts, with + # relation type, identifer and identifier type (mostly). + relations = [] + for rel in relIds: + if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', + 'IsVariantFormOf', 'IsSupplementTo', + 'HasVersion', 'IsMetadataFor', + 'IsNewVersionOf', 'IsIdenticalTo', + 'IsVersionOf', 'IsDerivedFrom', + 'IsSourceOf'): + relations.append(rel) + + if relations: + extra_datacite['relations'] = relations extra = dict() @@ -515,7 +542,7 @@ class DataciteImporter(EntityImporter): release_stage=release_stage, title=title, subtitle=subtitle, - original_title=title, + original_title=original_language_title, release_year=release_year, release_date=release_date, publisher=publisher, @@ -546,7 +573,7 @@ class DataciteImporter(EntityImporter): hide schema mismatch bugs. 
""" if self.debug is True: - print(json.dumps(re.to_dict(), default=extended_json_encoder)) + print(json.dumps(entity_to_dict(re, api_client=None))) return False # lookup existing DOI (don't need to try other ext idents for crossref) @@ -572,24 +599,15 @@ class DataciteImporter(EntityImporter): if self.insert_log_file: with open(self.insert_log_file, 'a') as f: for doc in batch: - json.dump(doc.to_dict(), f, default=extended_json_encoder) + json.dump(entity_to_dict(re, api_client=None), f) f.write('\n') - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) -def extended_json_encoder(value): - """ - Can be used with json.dumps(value, default=extended_json_encoder) to serialize - value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage - """ - if isinstance(value, (datetime.datetime, datetime.date)): - return value.isoformat() - if isinstance(value, set): - return list(value) - raise TypeError('cannot encode type: {}'.format(type(value))) def lookup_license_slug(raw): """ @@ -604,3 +622,156 @@ def lookup_license_slug(raw): if not raw.endswith('/'): raw = raw + '/' return LICENSE_SLUG_MAP.get(raw) + + +def find_original_language_title(item, min_length=4, max_questionmarks=3): + """ + Perform a few checks before returning a potential original language title. + """ + if not 'original_language_title' in item: + return None + title = item.get('title') + if not title: + return None + original_language_title = item.get('original_language_title') + if isinstance(original_language_title, + str) and title != original_language_title: + if len(original_language_title) < min_length: + return None + if original_language_title.count('?') > max_questionmarks: + return None + return original_language_title + if isinstance(original_language_title, dict): + content = original_language_title.get('__content__', '') or '' + if content and content != title and not content.count( + '?') > max_questionmarks: + return content + return None + + +def parse_datacite_titles(titles): + """ + Given a list of title items from datacite, return 3-tuple (title, + original_language_title, subtitle). 
+ + Example input: + + [ + { + "title": "Meeting Heterogeneity in Consumer Demand" + } + ] + """ + title, original_language_title, subtitle = None, None, None + + if titles is None: + return title, original_language_title, subtitle + if len(titles) == 0: + return title, original_language_title, subtitle + elif len(titles) == 1: + original_language_title = find_original_language_title(titles[0]) + title = titles[0].get('title', '') or '' + title = title.strip() + if not title: + title = None + return title, original_language_title, subtitle + else: + for entry in titles: + if not title and ('titleType' not in entry + or not entry.get('titleType')): + title = entry.get('title').strip() + if not subtitle and entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + if not original_language_title: + original_language_title = find_original_language_title(entry) + + return title, original_language_title, subtitle + + +def parse_datacite_dates(dates): + """ + Given a list of date fields (under .dates), return tuple, (release_date, + release_year). + """ + release_date, release_year = None, None + + if not dates: + return release_date, release_year + + if not isinstance(dates, list): + raise ValueError('expected a list of date items') + + # Ignored: Collected, Issued. + date_type_prio = ( + 'Valid', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Created', + 'Updated', + ) + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S', '%Y') + + def parse_item(item): + result, value, year_only = None, item.get('date', ''), False + release_date, release_year = None, None + + for pattern in common_patterns: + try: + result = datetime.datetime.strptime(value, pattern) + except ValueError: + continue + else: + if pattern == '%Y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + try: + result = dateparser.parse(value) + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), + file=sys.stderr) + return result_date, result_year + + if result is None: + # Unparsable date. + return release_date, release_year + + if not year_only: + release_date = result.date() + release_year = result.year + + return release_date, release_year + + for prio in date_type_prio: + for item in dates: + if not item.get('dateType') == prio: + continue + + release_date, release_year = parse_item(item) + if release_date is None and release_year is None: + continue + + if release_year < 1000 or release_year > datetime.date.today( + ).year + 5: + # Skip possibly bogus dates. + release_year = None + continue + break + else: + continue + break + + if release_date is None and release_year is None: + for item in dates: + release_date, release_year = parse_item(item) + if release_year or release_date: + break + + return release_date, release_year diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 9c542fc6..ab67a310 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -2,10 +2,12 @@ Test datacite importer. 
""" +import collections import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates from fixtures import api import json @@ -22,7 +24,6 @@ def datacite_importer_existing(api): yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) - @pytest.mark.skip(reason="larger datacite import slows tests down") def test_datacite_importer_huge(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index @@ -35,6 +36,179 @@ def test_datacite_importer_huge(datacite_importer): assert len(release.contribs) == 3 +def test_find_original_language_title(): + """ + Original language might be included, in various ways. + """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('defaults to None', {}, None), + Case('ignore unknown keys', {'broken': 'kv'}, None), + Case('just a title', {'title': 'Noise Reduction'}, None), + Case('same title should be ignored', { + 'title': 'Noise Reduction', + 'original_language_title': 'Noise Reduction' + }, None), + Case('empty subdict is ignored', { + 'title': 'Noise Reduction', + 'original_language_title': {}, + }, None), + Case('unknown subdict keys are ignored', { + 'title': 'Noise Reduction', + 'original_language_title': {'broken': 'kv'}, + }, None), + Case('original string', { + 'title': 'Noise Reduction', + 'original_language_title': 'Подавление шума', + }, 'Подавление шума'), + Case('language tag is ignored, since its broken', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': 'Noise Reduction' + }, + }, None), + Case('do not care about language', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': 'Rauschunterdrückung', + }, + }, 'Rauschunterdrückung'), + Case('ignore excessive questionmarks', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': '???? However', + }, + }, None), + ] + + for case in cases: + result = find_original_language_title(case.input) + assert result == case.result + +def test_parse_datacite_titles(): + """ + Given a list of titles, find title, original_language_title and subtitle. + Result is a 3-tuple of title, original_language_title, subtitle. 
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('handle None', None, (None, None, None)), + Case('empty list', [], (None, None, None)), + Case('empty item', [{}], (None, None, None)), + Case('broken keys', [{'broken': 'kv'}], (None, None, None)), + Case('title only', [{'title': 'Total carbon dioxide'}], + ('Total carbon dioxide', None, None), + ), + Case('title and subtitle', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, subtitle order does not matter', [ + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Total carbon dioxide'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, first wins', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + ], + ('Total carbon dioxide', None, None), + ), + Case('multiple titles, plus sub', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, multiple subs', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + Case('title, original same as title, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Total carbon dioxide', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original dict, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Всего углекислого газа', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + ] + + for case in cases: + result = parse_datacite_titles(case.input) + assert result == case.result, case.about + +def test_parse_datacite_dates(): + """ + Test datacite date parsing. 
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('None is None', None, (None, None)), + Case('empty list is None', [], (None, None)), + Case('empty item is None', [{}], (None, None)), + Case('empty item is None', [{'date': '2019'}], (None, 2019)), + Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)), + Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)), + Case('first with type', [ + {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} + ], (None, 2019)), + Case('full date', [ + {'date': '2019-12-01', 'dateType': 'Valid'}, + ], (datetime.date(2019, 12, 1), 2019)), + Case('date type prio', [ + {'date': '2000-12-01', 'dateType': 'Valid'}, + {'date': '2010-01-01', 'dateType': 'Updated'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('date type prio, Available > Updated', [ + {'date': '2010-01-01', 'dateType': 'Updated'}, + {'date': '2000-12-01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('allow fuzzy date formats, Available > Updated', [ + {'date': '2010', 'dateType': 'Updated'}, + {'date': '2000 Dec 01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 2000)), + Case('ignore broken date', [ + {'date': 'Febrrr 45', 'dateType': 'Updated'}, + ], (None, None)), + ] + for case in cases: + result = parse_datacite_dates(case.input) + assert result == case.result, case.about + def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index with open('tests/files/datacite_sample.jsonl', 'r') as f: @@ -75,7 +249,7 @@ def test_datacite_dict_parse(datacite_importer): assert r.release_type == "article" assert r.release_stage == "published" assert r.license_slug == None - assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090" + assert r.original_title == None assert r.ext_ids.doi == "10.18730/8dym9" assert r.ext_ids.isbn13 == None assert r.language == "enc" -- cgit v1.2.3 From 9a2a7e35948e350aaf40b07d4d4427d288970d3f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 26 Dec 2019 23:52:40 +0100 Subject: datacite: adjust tests --- python/tests/import_datacite.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index ab67a310..bc47a185 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -252,7 +252,7 @@ def test_datacite_dict_parse(datacite_importer): assert r.original_title == None assert r.ext_ids.doi == "10.18730/8dym9" assert r.ext_ids.isbn13 == None - assert r.language == "enc" + assert r.language == "en" assert r.subtitle == None assert r.release_date == None assert r.release_year == 1986 @@ -262,7 +262,6 @@ def test_datacite_dict_parse(datacite_importer): assert 'funder' not in r.extra['datacite'] # matched by ISSN, so shouldn't be in there #assert extra['container_name'] == "International Journal of Quantum Chemistry" - assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 -- cgit v1.2.3 From 96e38edde79735b4080ec08d57e9f54759e97b61 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 2 Jan 2020 17:35:54 +0100 Subject: datacite: add conversion fixtures The `test_datacite_conversions` function will compare an input (datacite) document to an expected output (release entity as JSON). This way, it should not be too hard to add more cases by adding: input, output - and by increasing the counter in the range loop within the test. 
To view input and result side by side with vim, change into the test directory and run: tests/files/datacite $ ./caseview.sh 18 --- python/tests/files/datacite/caseview.sh | 17 + python/tests/files/datacite/datacite_doc_00.json | 140 +++++ python/tests/files/datacite/datacite_doc_01.json | 81 +++ python/tests/files/datacite/datacite_doc_02.json | 85 +++ python/tests/files/datacite/datacite_doc_03.json | 70 +++ python/tests/files/datacite/datacite_doc_04.json | 80 +++ python/tests/files/datacite/datacite_doc_05.json | 598 +++++++++++++++++++++ python/tests/files/datacite/datacite_doc_06.json | 83 +++ python/tests/files/datacite/datacite_doc_07.json | 120 +++++ python/tests/files/datacite/datacite_doc_08.json | 105 ++++ python/tests/files/datacite/datacite_doc_09.json | 130 +++++ python/tests/files/datacite/datacite_doc_10.json | 83 +++ python/tests/files/datacite/datacite_doc_11.json | 86 +++ python/tests/files/datacite/datacite_doc_12.json | 103 ++++ python/tests/files/datacite/datacite_doc_13.json | 86 +++ python/tests/files/datacite/datacite_doc_14.json | 166 ++++++ python/tests/files/datacite/datacite_doc_15.json | 79 +++ python/tests/files/datacite/datacite_doc_16.json | 80 +++ python/tests/files/datacite/datacite_doc_17.json | 72 +++ python/tests/files/datacite/datacite_doc_18.json | 79 +++ python/tests/files/datacite/datacite_doc_19.json | 79 +++ python/tests/files/datacite/datacite_doc_20.json | 42 ++ python/tests/files/datacite/datacite_doc_21.json | 42 ++ python/tests/files/datacite/datacite_doc_22.json | 44 ++ python/tests/files/datacite/datacite_doc_23.json | 44 ++ .../tests/files/datacite/datacite_result_00.json | 87 +++ .../tests/files/datacite/datacite_result_01.json | 32 ++ .../tests/files/datacite/datacite_result_02.json | 36 ++ .../tests/files/datacite/datacite_result_03.json | 19 + .../tests/files/datacite/datacite_result_04.json | 28 + .../tests/files/datacite/datacite_result_05.json | 530 ++++++++++++++++++ .../tests/files/datacite/datacite_result_06.json | 26 + .../tests/files/datacite/datacite_result_07.json | 73 +++ .../tests/files/datacite/datacite_result_08.json | 53 ++ .../tests/files/datacite/datacite_result_09.json | 35 ++ .../tests/files/datacite/datacite_result_10.json | 32 ++ .../tests/files/datacite/datacite_result_11.json | 21 + .../tests/files/datacite/datacite_result_12.json | 44 ++ .../tests/files/datacite/datacite_result_13.json | 28 + .../tests/files/datacite/datacite_result_14.json | 110 ++++ .../tests/files/datacite/datacite_result_15.json | 22 + .../tests/files/datacite/datacite_result_16.json | 31 ++ .../tests/files/datacite/datacite_result_17.json | 20 + .../tests/files/datacite/datacite_result_18.json | 15 + .../tests/files/datacite/datacite_result_19.json | 15 + .../tests/files/datacite/datacite_result_20.json | 14 + .../tests/files/datacite/datacite_result_21.json | 15 + .../tests/files/datacite/datacite_result_22.json | 22 + .../tests/files/datacite/datacite_result_23.json | 22 + python/tests/import_datacite.py | 26 +- 50 files changed, 3949 insertions(+), 1 deletion(-) create mode 100755 python/tests/files/datacite/caseview.sh create mode 100644 python/tests/files/datacite/datacite_doc_00.json create mode 100644 python/tests/files/datacite/datacite_doc_01.json create mode 100644 python/tests/files/datacite/datacite_doc_02.json create mode 100644 python/tests/files/datacite/datacite_doc_03.json create mode 100644 python/tests/files/datacite/datacite_doc_04.json create mode 100644 python/tests/files/datacite/datacite_doc_05.json create mode 
100644 python/tests/files/datacite/datacite_doc_06.json create mode 100644 python/tests/files/datacite/datacite_doc_07.json create mode 100644 python/tests/files/datacite/datacite_doc_08.json create mode 100644 python/tests/files/datacite/datacite_doc_09.json create mode 100644 python/tests/files/datacite/datacite_doc_10.json create mode 100644 python/tests/files/datacite/datacite_doc_11.json create mode 100644 python/tests/files/datacite/datacite_doc_12.json create mode 100644 python/tests/files/datacite/datacite_doc_13.json create mode 100644 python/tests/files/datacite/datacite_doc_14.json create mode 100644 python/tests/files/datacite/datacite_doc_15.json create mode 100644 python/tests/files/datacite/datacite_doc_16.json create mode 100644 python/tests/files/datacite/datacite_doc_17.json create mode 100644 python/tests/files/datacite/datacite_doc_18.json create mode 100644 python/tests/files/datacite/datacite_doc_19.json create mode 100644 python/tests/files/datacite/datacite_doc_20.json create mode 100644 python/tests/files/datacite/datacite_doc_21.json create mode 100644 python/tests/files/datacite/datacite_doc_22.json create mode 100644 python/tests/files/datacite/datacite_doc_23.json create mode 100644 python/tests/files/datacite/datacite_result_00.json create mode 100644 python/tests/files/datacite/datacite_result_01.json create mode 100644 python/tests/files/datacite/datacite_result_02.json create mode 100644 python/tests/files/datacite/datacite_result_03.json create mode 100644 python/tests/files/datacite/datacite_result_04.json create mode 100644 python/tests/files/datacite/datacite_result_05.json create mode 100644 python/tests/files/datacite/datacite_result_06.json create mode 100644 python/tests/files/datacite/datacite_result_07.json create mode 100644 python/tests/files/datacite/datacite_result_08.json create mode 100644 python/tests/files/datacite/datacite_result_09.json create mode 100644 python/tests/files/datacite/datacite_result_10.json create mode 100644 python/tests/files/datacite/datacite_result_11.json create mode 100644 python/tests/files/datacite/datacite_result_12.json create mode 100644 python/tests/files/datacite/datacite_result_13.json create mode 100644 python/tests/files/datacite/datacite_result_14.json create mode 100644 python/tests/files/datacite/datacite_result_15.json create mode 100644 python/tests/files/datacite/datacite_result_16.json create mode 100644 python/tests/files/datacite/datacite_result_17.json create mode 100644 python/tests/files/datacite/datacite_result_18.json create mode 100644 python/tests/files/datacite/datacite_result_19.json create mode 100644 python/tests/files/datacite/datacite_result_20.json create mode 100644 python/tests/files/datacite/datacite_result_21.json create mode 100644 python/tests/files/datacite/datacite_result_22.json create mode 100644 python/tests/files/datacite/datacite_result_23.json (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/files/datacite/caseview.sh b/python/tests/files/datacite/caseview.sh new file mode 100755 index 00000000..d1e98c04 --- /dev/null +++ b/python/tests/files/datacite/caseview.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Open input and output in vertical vim split. 
+# +# $ caseview 13 +# +view() { + if [ -z "$1" ]; then + echo usage: "$0" CASE-NUMBER + exit 1 + else + padded=$(printf "%02d\n" "$1") + vim -O "datacite_doc_$padded.json" "datacite_result_$padded.json" + fi +} + +view "$@" diff --git a/python/tests/files/datacite/datacite_doc_00.json b/python/tests/files/datacite/datacite_doc_00.json new file mode 100644 index 00000000..248f525f --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_00.json @@ -0,0 +1,140 @@ +{ + "id": "10.1007/s10870-008-9413-z", + "type": "dois", + "attributes": { + "doi": "10.1007/s10870-008-9413-z", + "identifiers": [ + { + "identifier": "https://doi.org/10.1007/s10870-008-9413-z", + "identifierType": "DOI" + }, + { + "identifier": "s10870-008-9413-z", + "identifierType": "Publisher ID" + } + ], + "creators": [ + { + "name": "Li, Qian-Jin", + "nameType": "Personal", + "givenName": "Qian-Jin", + "familyName": "Li", + "affiliation": [] + }, + { + "name": "Yang, Chun-Long", + "nameType": "Personal", + "givenName": "Chun-Long", + "familyName": "Yang", + "affiliation": [] + } + ], + "titles": [ + { + "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea" + } + ], + "publisher": "Springer Science and Business Media LLC", + "container": { + "type": "Journal", + "issue": "12", + "title": "Journal of Chemical Crystallography", + "volume": "38", + "lastPage": "930", + "firstPage": "927", + "identifier": "1074-1542", + "identifierType": "ISSN" + }, + "publicationYear": 2008, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2008-05-30", + "dateType": "Issued" + }, + { + "date": "2019-05-31T04:04:23Z", + "dateType": "Updated" + } + ], + "language": null, + "types": { + "ris": "JOUR", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "JournalArticle", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [ + { + "relationType": "IsPartOf", + "relatedIdentifier": "1074-1542", + "resourceTypeGeneral": "Collection", + "relatedIdentifierType": "ISSN" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1016/j.bmcl.2005.09.033", + "relatedIdentifierType": "DOI" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1016/s0022-1139(02)00330-5", + "relatedIdentifierType": "DOI" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1016/s0010-8545(01)00337-x", + "relatedIdentifierType": "DOI" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1016/j.tetlet.2005.06.135", + "relatedIdentifierType": "DOI" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1039/p298700000s1", + "relatedIdentifierType": "DOI" + }, + { + "relationType": "References", + "relatedIdentifier": "10.1002/anie.199515551", + "relatedIdentifierType": "DOI" + } + ], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [ + { + "rightsUri": "http://www.springer.com/tdm" + } + ], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "http://link.springer.com/10.1007/s10870-008-9413-z", + "contentUrl": null, + "metadataVersion": 1, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": "levriero", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-06-18T14:52:19.000Z", + "registered": null, + "published": "2008", + "updated": "2019-08-03T00:03:40.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "crossref.citations", + "type": 
"clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_01.json b/python/tests/files/datacite/datacite_doc_01.json new file mode 100644 index 00000000..c4ef6e45 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_01.json @@ -0,0 +1,81 @@ +{ + "id": "10.11588/diglit.25558.39", + "type": "dois", + "attributes": { + "doi": "10.11588/diglit.25558.39", + "identifiers": [ + { + "identifier": "https://doi.org/10.11588/diglit.25558.39", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Dargenty, G.", + "nameType": "Personal", + "givenName": "G.", + "familyName": "Dargenty", + "affiliation": [] + } + ], + "titles": [ + { + "lang": "de", + "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887" + } + ], + "publisher": "University Library Heidelberg", + "container": {}, + "publicationYear": 1887, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1887", + "dateType": "Issued" + } + ], + "language": "fre", + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [ + { + "lang": "de", + "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html" + } + ], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "http://digi.ub.uni-heidelberg.de/diglit/art1887_1/0172", + "contentUrl": null, + "metadataVersion": 4, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2016-12-08T07:43:15.000Z", + "registered": "2016-12-08T07:43:15.000Z", + "published": "1887", + "updated": "2019-08-02T14:27:33.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "gesis.ubhd", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_02.json b/python/tests/files/datacite/datacite_doc_02.json new file mode 100644 index 00000000..8b9a594e --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_02.json @@ -0,0 +1,85 @@ +{ + "id": "10.11588/diglit.37715.57", + "type": "dois", + "attributes": { + "doi": "10.11588/diglit.37715.57", + "identifiers": [ + { + "identifier": "https://doi.org/10.11588/diglit.37715.57", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Weyersberg, Albert", + "nameType": "Personal", + "givenName": "Albert", + "familyName": "Weyersberg", + "affiliation": [] + } + ], + "titles": [ + { + "lang": "de", + "title": "Solinger Schwertschmiede-Familien, [4]" + } + ], + "publisher": "University Library Heidelberg", + "container": {}, + "publicationYear": 1897, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1897", + "dateType": "Issued" + } + ], + "language": "ger", + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [ + { + "lang": "de", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/" + }, + { + "lang": "en", + "rights": "Creative 
Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/" + } + ], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://digi.ub.uni-heidelberg.de/diglit/zhwk1897_1899/0131", + "contentUrl": null, + "metadataVersion": 2, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2018-11-29T12:04:12.000Z", + "registered": "2018-11-29T12:04:13.000Z", + "published": "1897", + "updated": "2019-08-02T21:31:04.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "gesis.ubhd", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_03.json b/python/tests/files/datacite/datacite_doc_03.json new file mode 100644 index 00000000..e77a359c --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_03.json @@ -0,0 +1,70 @@ +{ + "id": "10.13140/rg.2.2.30434.53446", + "type": "dois", + "attributes": { + "doi": "10.13140/rg.2.2.30434.53446", + "identifiers": [ + { + "identifier": "https://doi.org/10.13140/rg.2.2.30434.53446", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Mastura Yahya", + "affiliation": [] + } + ], + "titles": [ + { + "title": "midterm ah30903" + } + ], + "publisher": "Unpublished", + "container": {}, + "publicationYear": 2016, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2016", + "dateType": "Issued" + } + ], + "language": "ms", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "http://rgdoi.net/10.13140/RG.2.2.30434.53446", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2016-11-03T09:07:08.000Z", + "registered": "2016-11-03T09:07:09.000Z", + "published": "2016", + "updated": "2019-08-02T12:51:15.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "rg.rg", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_04.json b/python/tests/files/datacite/datacite_doc_04.json new file mode 100644 index 00000000..8655a26a --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_04.json @@ -0,0 +1,80 @@ +{ + "id": "10.14288/1.0080520", + "type": "dois", + "attributes": { + "doi": "10.14288/1.0080520", + "identifiers": [ + { + "identifier": "https://doi.org/10.14288/1.0080520", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Nicollerat, Marc Andre", + "nameType": "Personal", + "givenName": "Marc Andre", + "familyName": "Nicollerat", + "affiliation": [] + } + ], + "titles": [ + { + "title": "On chain maps inducing isomorphisms in homology" + } + ], + "publisher": "University of British Columbia", + "container": {}, + "publicationYear": 1973, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1973", + "dateType": "Issued" + } + ], + "language": "en", + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "Text", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + 
"description": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor J : K(A) → K(I) and a natural transformation [formula omitted] (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://doi.library.ubc.ca/10.14288/1.0080520", + "contentUrl": null, + "metadataVersion": 5, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2015-11-11T11:12:34.000Z", + "registered": "2015-11-11T11:12:35.000Z", + "published": "1973", + "updated": "2019-08-02T09:43:14.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "cisti.ubc", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_05.json b/python/tests/files/datacite/datacite_doc_05.json new file mode 100644 index 00000000..75e68e9d --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_05.json @@ -0,0 +1,598 @@ +{ + "id": "10.15156/bio/sh409843.07fu", + "type": "dois", + "attributes": { + "doi": "10.15156/bio/sh409843.07fu", + "identifiers": [ + { + "identifier": "https://doi.org/10.15156/bio/sh409843.07fu", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Kõljalg, Urmas", + "nameType": "Personal", + "givenName": "Urmas", + "familyName": "Kõljalg", + "affiliation": [] + }, + { + "name": "Abarenkov, Kessy", + "nameType": "Personal", + "givenName": "Kessy", + "familyName": "Abarenkov", + "affiliation": [] + }, + { + "name": "Nilsson, R. Henrik", + "nameType": "Personal", + "givenName": "R. 
Henrik", + "familyName": "Nilsson", + "affiliation": [] + }, + { + "name": "Larsson, Karl-Henrik", + "nameType": "Personal", + "givenName": "Karl-Henrik", + "familyName": "Larsson", + "affiliation": [] + }, + { + "name": "Aas, Anders Bjørnsgard", + "nameType": "Personal", + "givenName": "Anders Bjørnsgard", + "familyName": "Aas", + "affiliation": [] + }, + { + "name": "Adams, Rachel", + "nameType": "Personal", + "givenName": "Rachel", + "familyName": "Adams", + "affiliation": [] + }, + { + "name": "Alves, Artur", + "nameType": "Personal", + "givenName": "Artur", + "familyName": "Alves", + "affiliation": [] + }, + { + "name": "Ammirati, Joseph F.", + "nameType": "Personal", + "givenName": "Joseph F.", + "familyName": "Ammirati", + "affiliation": [] + }, + { + "name": "Arnold, A. Elizabeth", + "nameType": "Personal", + "givenName": "A. Elizabeth", + "familyName": "Arnold", + "affiliation": [] + }, + { + "name": "Bahram, Mohammad", + "nameType": "Personal", + "givenName": "Mohammad", + "familyName": "Bahram", + "affiliation": [] + }, + { + "name": "Bengtsson-Palme, Johan", + "nameType": "Personal", + "givenName": "Johan", + "familyName": "Bengtsson-Palme", + "affiliation": [] + }, + { + "name": "Berlin, Anna", + "nameType": "Personal", + "givenName": "Anna", + "familyName": "Berlin", + "affiliation": [] + }, + { + "name": "Botnen, Synnøve", + "nameType": "Personal", + "givenName": "Synnøve", + "familyName": "Botnen", + "affiliation": [] + }, + { + "name": "Bourlat, Sarah", + "nameType": "Personal", + "givenName": "Sarah", + "familyName": "Bourlat", + "affiliation": [] + }, + { + "name": "Cheeke, Tanya", + "nameType": "Personal", + "givenName": "Tanya", + "familyName": "Cheeke", + "affiliation": [] + }, + { + "name": "Dima, Bálint", + "nameType": "Personal", + "givenName": "Bálint", + "familyName": "Dima", + "affiliation": [] + }, + { + "name": "Drenkhan, Rein", + "nameType": "Personal", + "givenName": "Rein", + "familyName": "Drenkhan", + "affiliation": [] + }, + { + "name": "Duarte, Camila", + "nameType": "Personal", + "givenName": "Camila", + "familyName": "Duarte", + "affiliation": [] + }, + { + "name": "Dueñas, Margarita", + "nameType": "Personal", + "givenName": "Margarita", + "familyName": "Dueñas", + "affiliation": [] + }, + { + "name": "Eberhardt, Ursula", + "nameType": "Personal", + "givenName": "Ursula", + "familyName": "Eberhardt", + "affiliation": [] + }, + { + "name": "Friberg, Hanna", + "nameType": "Personal", + "givenName": "Hanna", + "familyName": "Friberg", + "affiliation": [] + }, + { + "name": "Frøslev, Tobias G.", + "nameType": "Personal", + "givenName": "Tobias G.", + "familyName": "Frøslev", + "affiliation": [] + }, + { + "name": "Garnica, Sigisfredo", + "nameType": "Personal", + "givenName": "Sigisfredo", + "familyName": "Garnica", + "affiliation": [] + }, + { + "name": "Geml, József", + "nameType": "Personal", + "givenName": "József", + "familyName": "Geml", + "affiliation": [] + }, + { + "name": "Ghobad-Nejhad, Masoomeh", + "nameType": "Personal", + "givenName": "Masoomeh", + "familyName": "Ghobad-Nejhad", + "affiliation": [] + }, + { + "name": "Grebenc, Tine", + "nameType": "Personal", + "givenName": "Tine", + "familyName": "Grebenc", + "affiliation": [] + }, + { + "name": "Griffith, Gareth W.", + "nameType": "Personal", + "givenName": "Gareth W.", + "familyName": "Griffith", + "affiliation": [] + }, + { + "name": "Hampe, Felix", + "nameType": "Personal", + "givenName": "Felix", + "familyName": "Hampe", + "affiliation": [] + }, + { + "name": "Kennedy, Peter", + 
"nameType": "Personal", + "givenName": "Peter", + "familyName": "Kennedy", + "affiliation": [] + }, + { + "name": "Khomich, Maryia", + "nameType": "Personal", + "givenName": "Maryia", + "familyName": "Khomich", + "affiliation": [] + }, + { + "name": "Kohout, Petr", + "nameType": "Personal", + "givenName": "Petr", + "familyName": "Kohout", + "affiliation": [] + }, + { + "name": "Kollom, Anu", + "nameType": "Personal", + "givenName": "Anu", + "familyName": "Kollom", + "affiliation": [] + }, + { + "name": "Larsson, Ellen", + "nameType": "Personal", + "givenName": "Ellen", + "familyName": "Larsson", + "affiliation": [] + }, + { + "name": "Laszlo, Irinyi", + "nameType": "Personal", + "givenName": "Irinyi", + "familyName": "Laszlo", + "affiliation": [] + }, + { + "name": "Leavitt, Steven", + "nameType": "Personal", + "givenName": "Steven", + "familyName": "Leavitt", + "affiliation": [] + }, + { + "name": "Liimatainen, Kare", + "nameType": "Personal", + "givenName": "Kare", + "familyName": "Liimatainen", + "affiliation": [] + }, + { + "name": "Lindahl, Björn", + "nameType": "Personal", + "givenName": "Björn", + "familyName": "Lindahl", + "affiliation": [] + }, + { + "name": "Lodge, Deborah J.", + "nameType": "Personal", + "givenName": "Deborah J.", + "familyName": "Lodge", + "affiliation": [] + }, + { + "name": "Lumbsch, Helge Thorsten", + "nameType": "Personal", + "givenName": "Helge Thorsten", + "familyName": "Lumbsch", + "affiliation": [] + }, + { + "name": "Martín Esteban, María Paz", + "nameType": "Personal", + "givenName": "María Paz", + "familyName": "Martín Esteban", + "affiliation": [] + }, + { + "name": "Meyer, Wieland", + "nameType": "Personal", + "givenName": "Wieland", + "familyName": "Meyer", + "affiliation": [] + }, + { + "name": "Miettinen, Otto", + "nameType": "Personal", + "givenName": "Otto", + "familyName": "Miettinen", + "affiliation": [] + }, + { + "name": "Nguyen, Nhu", + "nameType": "Personal", + "givenName": "Nhu", + "familyName": "Nguyen", + "affiliation": [] + }, + { + "name": "Niskanen, Tuula", + "nameType": "Personal", + "givenName": "Tuula", + "familyName": "Niskanen", + "affiliation": [] + }, + { + "name": "Oono, Ryoko", + "nameType": "Personal", + "givenName": "Ryoko", + "familyName": "Oono", + "affiliation": [] + }, + { + "name": "Öpik, Maarja", + "nameType": "Personal", + "givenName": "Maarja", + "familyName": "Öpik", + "affiliation": [] + }, + { + "name": "Ordynets, Alexander", + "nameType": "Personal", + "givenName": "Alexander", + "familyName": "Ordynets", + "affiliation": [] + }, + { + "name": "Pawłowska, Julia", + "nameType": "Personal", + "givenName": "Julia", + "familyName": "Pawłowska", + "affiliation": [] + }, + { + "name": "Peintner, Ursula", + "nameType": "Personal", + "givenName": "Ursula", + "familyName": "Peintner", + "affiliation": [] + }, + { + "name": "Pereira, Olinto Liparini", + "nameType": "Personal", + "givenName": "Olinto Liparini", + "familyName": "Pereira", + "affiliation": [] + }, + { + "name": "Pinho, Danilo Batista", + "nameType": "Personal", + "givenName": "Danilo Batista", + "familyName": "Pinho", + "affiliation": [] + }, + { + "name": "Põldmaa, Kadri", + "nameType": "Personal", + "givenName": "Kadri", + "familyName": "Põldmaa", + "affiliation": [] + }, + { + "name": "Runnel, Kadri", + "nameType": "Personal", + "givenName": "Kadri", + "familyName": "Runnel", + "affiliation": [] + }, + { + "name": "Ryberg, Martin", + "nameType": "Personal", + "givenName": "Martin", + "familyName": "Ryberg", + "affiliation": [] + }, + { + "name": "Saar, 
Irja", + "nameType": "Personal", + "givenName": "Irja", + "familyName": "Saar", + "affiliation": [] + }, + { + "name": "Sanli, Kemal", + "nameType": "Personal", + "givenName": "Kemal", + "familyName": "Sanli", + "affiliation": [] + }, + { + "name": "Scott, James", + "nameType": "Personal", + "givenName": "James", + "familyName": "Scott", + "affiliation": [] + }, + { + "name": "Spirin, Viacheslav", + "nameType": "Personal", + "givenName": "Viacheslav", + "familyName": "Spirin", + "affiliation": [] + }, + { + "name": "Suija, Ave", + "nameType": "Personal", + "givenName": "Ave", + "familyName": "Suija", + "affiliation": [] + }, + { + "name": "Svantesson, Sten", + "nameType": "Personal", + "givenName": "Sten", + "familyName": "Svantesson", + "affiliation": [] + }, + { + "name": "Tadych, Mariusz", + "nameType": "Personal", + "givenName": "Mariusz", + "familyName": "Tadych", + "affiliation": [] + }, + { + "name": "Takamatsu, Susumu", + "nameType": "Personal", + "givenName": "Susumu", + "familyName": "Takamatsu", + "affiliation": [] + }, + { + "name": "Tamm, Heidi", + "nameType": "Personal", + "givenName": "Heidi", + "familyName": "Tamm", + "affiliation": [] + }, + { + "name": "Taylor, AFS.", + "nameType": "Personal", + "givenName": "AFS.", + "familyName": "Taylor", + "affiliation": [] + }, + { + "name": "Tedersoo, Leho", + "nameType": "Personal", + "givenName": "Leho", + "familyName": "Tedersoo", + "affiliation": [] + }, + { + "name": "Telleria, M.T.", + "nameType": "Personal", + "givenName": "M.T.", + "familyName": "Telleria", + "affiliation": [] + }, + { + "name": "Udayanga, Dhanushka", + "nameType": "Personal", + "givenName": "Dhanushka", + "familyName": "Udayanga", + "affiliation": [] + }, + { + "name": "Unterseher, Martin", + "nameType": "Personal", + "givenName": "Martin", + "familyName": "Unterseher", + "affiliation": [] + }, + { + "name": "Volobuev, Sergey", + "nameType": "Personal", + "givenName": "Sergey", + "familyName": "Volobuev", + "affiliation": [] + }, + { + "name": "Weiss, Michael", + "nameType": "Personal", + "givenName": "Michael", + "familyName": "Weiss", + "affiliation": [] + }, + { + "name": "Wurzbacher, Christian", + "nameType": "Personal", + "givenName": "Christian", + "familyName": "Wurzbacher", + "affiliation": [] + } + ], + "titles": [ + { + "title": "SH409843.07FU" + }, + { + "title": "Gomphales", + "titleType": "Subtitle" + } + ], + "publisher": "UNITE Community", + "container": {}, + "publicationYear": 2015, + "subjects": [], + "contributors": [ + { + "name": "Kessy Abarenkov", + "affiliation": [] + }, + { + "name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden", + "affiliation": [] + } + ], + "dates": [ + { + "date": "2016-04-22", + "dateType": "Updated" + }, + { + "date": "2014-10-05", + "dateType": "Created" + }, + { + "date": "2015", + "dateType": "Issued" + } + ], + "language": "eng", + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceType": "Dataset/UNITE Species Hypothesis", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [ + "application/json" + ], + "version": null, + "rightsList": [ + { + "rights": "Attribution-NonCommercial (CC BY-NC)", + "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0" + } + ], + "descriptions": [ + { + "description": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). 
All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://plutof.ut.ee/#/datacite/10.15156/BIO/SH409843.07FU", + "contentUrl": null, + "metadataVersion": 1, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2015-06-05T10:23:18.000Z", + "registered": "2015-06-05T10:23:19.000Z", + "published": "2015", + "updated": "2019-08-02T07:45:28.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "estdoi.bio", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_06.json b/python/tests/files/datacite/datacite_doc_06.json new file mode 100644 index 00000000..a7f3ee70 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_06.json @@ -0,0 +1,83 @@ +{ + "id": "10.16903/ethz-grs-d_006220", + "type": "dois", + "attributes": { + "doi": "10.16903/ethz-grs-d_006220", + "identifiers": [ + { + "identifier": "https://doi.org/10.16903/ethz-grs-d_006220", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Crispijn De Passe (Der Ältere) (1564-1637)", + "nameType": "Personal", + "affiliation": [] + } + ], + "titles": [ + { + "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"" + } + ], + "publisher": "n.a.", + "container": {}, + "publicationYear": 1590, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1590", + "dateType": "Available" + }, + { + "date": "1590", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork", + "resourceTypeGeneral": "InteractiveResource" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [ + "Blattgrösse: 21.0 x 14.4 x 0.0 cm (beschnitten)", + "Kupferstich" + ], + "version": null, + "rightsList": [ + { + "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0" + } + ], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "http://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ExternalInterface&module=collection&objectId=29469&viewType=detailView", + "contentUrl": null, + "metadataVersion": 1, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2017-12-13T12:03:09.000Z", + "registered": "2017-12-13T12:03:09.000Z", + "published": "1590", + "updated": "2019-08-02T17:20:02.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "ethz.gs", + 
"type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_07.json b/python/tests/files/datacite/datacite_doc_07.json new file mode 100644 index 00000000..c70695b6 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_07.json @@ -0,0 +1,120 @@ +{ + "id": "10.18462/iir.icr.2015.0926", + "type": "dois", + "attributes": { + "doi": "10.18462/iir.icr.2015.0926", + "identifiers": [ + { + "identifier": "https://doi.org/10.18462/iir.icr.2015.0926", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "ROTHUIZEN, E.", + "nameType": "Personal", + "givenName": "E.", + "familyName": "ROTHUIZEN", + "affiliation": [] + }, + { + "name": "ELMEGAARD, B.", + "nameType": "Personal", + "givenName": "B.", + "familyName": "ELMEGAARD", + "affiliation": [] + }, + { + "name": "MARKUSSEN W., B.", + "nameType": "Personal", + "givenName": "B.", + "familyName": "MARKUSSEN W.", + "affiliation": [] + }, + { + "name": "Et Al.", + "affiliation": [] + } + ], + "titles": [ + { + "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation." + } + ], + "publisher": "International Institute of Refrigeration (IIR)", + "container": {}, + "publicationYear": 2015, + "subjects": [ + { + "subject": "HEAT PUMP" + }, + { + "subject": "HOT WATER" + }, + { + "subject": "HEAT TRANSFER" + }, + { + "subject": "PERFORMANCE" + }, + { + "subject": "THERMAL STORAGE" + }, + { + "subject": "TANK" + }, + { + "subject": "MODEL" + } + ], + "contributors": [], + "dates": [ + { + "date": "2015", + "dateType": "Issued" + } + ], + "language": "eng", + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. 
This paper shows that there is a good consistency between the model and the experimental tests.", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "http://www.iifiir.org/clientBookline/service/reference.asp?INSTANCE=EXPLOITATION&OUTPUT=PORTAL&DOCID=IFD_REFDOC_0015008&DOCBASE=IFD_REFDOC_EN&SETLANGUAGE=EN", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2016-11-21T13:08:14.000Z", + "registered": "2016-11-21T13:08:14.000Z", + "published": "2015", + "updated": "2019-08-16T18:00:59.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "inist.iif", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_08.json b/python/tests/files/datacite/datacite_doc_08.json new file mode 100644 index 00000000..e9170788 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_08.json @@ -0,0 +1,105 @@ +{ + "id": "10.22004/ag.econ.284864", + "type": "dois", + "attributes": { + "doi": "10.22004/ag.econ.284864", + "identifiers": [ + { + "identifier": "https://doi.org/10.22004/ag.econ.284864", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Kajisa, Kei", + "nameType": "Personal", + "givenName": "Kei", + "familyName": "Kajisa", + "affiliation": [], + "nameIdentifiers": [] + }, + { + "name": "Kajisa, Kei", + "nameType": "Personal", + "givenName": "Kei", + "familyName": "Kajisa", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India" + } + ], + "publisher": "Unknown", + "container": {}, + "publicationYear": 2017, + "subjects": [ + { + "subject": "Land Economics/Use" + }, + { + "subject": "irrigation", + "subjectScheme": "keyword" + }, + { + "subject": "industrialization", + "subjectScheme": "keyword" + }, + { + "subject": "collective action", + "subjectScheme": "keyword" + } + ], + "contributors": [], + "dates": [ + { + "date": "2017", + "dateType": "Issued" + } + ], + "language": "eng", + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "Text", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan’s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. 
Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://ageconsearch.umn.edu/record/284864", + "contentUrl": null, + "metadataVersion": 1, + "schemaVersion": null, + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-08-24T07:46:47.000Z", + "registered": "2019-08-24T07:46:47.000Z", + "published": "2017", + "updated": "2019-08-25T09:38:33.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "tind.agecon", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_09.json b/python/tests/files/datacite/datacite_doc_09.json new file mode 100644 index 00000000..d09af545 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_09.json @@ -0,0 +1,130 @@ +{ + "id": "10.2314/gbv:880813733", + "type": "dois", + "attributes": { + "doi": "10.2314/gbv:880813733", + "identifiers": [ + { + "identifier": "https://doi.org/10.2314/gbv:880813733", + "identifierType": "DOI" + }, + { + "identifier": "880813733", + "identifierType": "ppn" + }, + { + "identifier": "03WKCF3C", + "identifierType": "contract" + }, + { + "identifier": "01132105", + "identifierType": "contract" + }, + { + "identifier": "GBV:880813733", + "identifierType": "firstid" + }, + { + "identifier": "TIBKAT:880813733", + "identifierType": "ftx-id" + } + ], + "creators": [ + { + "name": "Kirstaedter, Nils", + "nameType": "Personal", + "givenName": "Nils", + "familyName": "Kirstaedter", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "BrightLas : TP3.3. 
Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht" + }, + { + "title": "Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul)", + "titleType": "AlternativeTitle" + }, + { + "title": "Direktdiodenlaseranlagen und -systeme (VP3)", + "titleType": "AlternativeTitle" + } + ], + "publisher": "[Lumics GmbH]", + "container": {}, + "publicationYear": 2016, + "subjects": [ + { + "subject": "Direktdiodenlasersysteme" + }, + { + "subject": "Physics", + "subjectScheme": "linsearch" + } + ], + "contributors": [ + { + "name": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover", + "nameType": "Organizational", + "affiliation": [], + "contributorType": "HostingInstitution", + "nameIdentifiers": [] + }, + { + "name": "Technische Informationsbibliothek (TIB)", + "affiliation": [], + "contributorType": "DataManager", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2016", + "dateType": "Issued" + } + ], + "language": "de", + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "report", + "schemaOrg": "ScholarlyArticle", + "resourceType": "Report", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [ + "1 Online-Ressource (10 Seiten, 1,40 MB)" + ], + "formats": [ + "application/pdf" + ], + "version": "1.0", + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.tib.eu/suchen/id/TIBKAT:880813733/", + "contentUrl": null, + "metadataVersion": 9, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2017-02-25T00:00:18.000Z", + "registered": "2017-02-25T00:00:19.000Z", + "published": "2016", + "updated": "2019-08-03T05:53:51.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "tib.tib", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_10.json b/python/tests/files/datacite/datacite_doc_10.json new file mode 100644 index 00000000..d40fc272 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_10.json @@ -0,0 +1,83 @@ +{ + "id": "10.25549/wpacards-m6171", + "type": "dois", + "attributes": { + "doi": "10.25549/wpacards-m6171", + "identifiers": [ + { + "identifier": "https://doi.org/10.25549/wpacards-m6171", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Unknown", + "affiliation": [] + } + ], + "titles": [ + { + "title": "WPA household census for 210 E VERNON, Los Angeles" + } + ], + "publisher": "University of Southern California Digital Library (USC.DL)", + "container": {}, + "publicationYear": 2012, + "subjects": [ + { + "subject": "housing areas" + }, + { + "subject": "Dwellings" + } + ], + "contributors": [], + "dates": [ + { + "date": "2012", + "dateType": "Issued" + } + ], + "language": "eng", + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "http://digitallibrary.usc.edu/cdm/ref/collection/p15799coll8/id/2608", + "contentUrl": null, 
+ "metadataVersion": 0, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2018-09-09T08:32:09.000Z", + "registered": "2018-09-09T08:33:10.000Z", + "published": "2012", + "updated": "2019-08-02T20:03:32.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "usc.dl", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_11.json b/python/tests/files/datacite/datacite_doc_11.json new file mode 100644 index 00000000..50fe8363 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_11.json @@ -0,0 +1,86 @@ +{ + "id": "10.3932/ethz-a-000055869", + "type": "dois", + "attributes": { + "doi": "10.3932/ethz-a-000055869", + "identifiers": [ + { + "identifier": "https://doi.org/10.3932/ethz-a-000055869", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Comet Photo AG (Zürich)", + "affiliation": [] + } + ], + "titles": [ + { + "title": "N1 bei Safenwil" + } + ], + "publisher": "ETH-Bibliothek Zürich, Bildarchiv", + "container": {}, + "publicationYear": 1965, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1965", + "dateType": "Available" + }, + { + "date": "1965", + "dateType": "Issued" + } + ], + "language": "de", + "types": { + "ris": "FIGURE", + "bibtex": "misc", + "citeproc": "graphic", + "schemaOrg": "ImageObject", + "resourceTypeGeneral": "Image" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [ + "TIFF-Bild" + ], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Download und Nutzung frei", + "descriptionType": "Other" + }, + { + "description": "10, N1, Genève, Bern, Zürich, Sankt Gallen, Sankt Margrethen, Strassen, Strassenbau, 2.", + "descriptionType": "Other" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "http://ba.e-pics.ethz.ch/link.jsp?id=44861", + "contentUrl": null, + "metadataVersion": 6, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-03-04T23:56:42.000Z", + "registered": "2019-07-30T13:17:45.000Z", + "published": "1965", + "updated": "2019-08-02T22:08:26.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "ethz.epics-ba", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_12.json b/python/tests/files/datacite/datacite_doc_12.json new file mode 100644 index 00000000..31c0f0ca --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_12.json @@ -0,0 +1,103 @@ +{ + "id": "10.5167/uzh-171449", + "type": "dois", + "attributes": { + "doi": "10.5167/uzh-171449", + "identifiers": [ + { + "identifier": "https://doi.org/10.5167/uzh-171449", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Spanias, Charalampos", + "nameType": "Personal", + "givenName": "Charalampos", + "familyName": "Spanias", + "affiliation": [], + "nameIdentifiers": [] + }, + { + "name": "Nikolaidis, Pantelis T", + "nameType": "Personal", + "givenName": "Pantelis T", + "familyName": "Nikolaidis", + "affiliation": [], + "nameIdentifiers": [] + }, + { + "name": "Rosemann, Thomas", + "nameType": "Personal", + "givenName": "Thomas", + "familyName": "Rosemann", + "affiliation": [], + "nameIdentifiers": [] + }, + { + "name": "Knechtle, Beat", + "nameType": "Personal", + "givenName": "Beat", + "familyName": "Knechtle", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": 
"Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review" + } + ], + "publisher": "MDPI Publishing", + "container": {}, + "publicationYear": 2019, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2019-06-14", + "dateType": "Available" + }, + { + "date": "2019", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "RPRT", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.zora.uzh.ch/id/eprint/171449", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-06-27T01:01:35.000Z", + "registered": "2019-06-27T01:01:36.000Z", + "published": "2019", + "updated": "2019-09-26T16:44:24.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "ethz.zora", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_13.json b/python/tests/files/datacite/datacite_doc_13.json new file mode 100644 index 00000000..ff6eb229 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_13.json @@ -0,0 +1,86 @@ +{ + "id": "10.5169/seals-314104", + "type": "dois", + "attributes": { + "doi": "10.5169/seals-314104", + "identifiers": [ + { + "identifier": "https://doi.org/10.5169/seals-314104", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "O.M.", + "affiliation": [] + }, + { + "name": "Hiltbrunner, Hermann", + "nameType": "Personal", + "givenName": "Hermann", + "familyName": "Hiltbrunner", + "affiliation": [] + } + ], + "titles": [ + { + "title": "[Müssen wir des Glücks uns schämen?]" + } + ], + "publisher": "Buchdruckerei Büchler & Co.", + "container": {}, + "publicationYear": 1940, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "1940-10-05", + "dateType": "Available" + }, + { + "date": "1940", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "JOUR", + "bibtex": "article", + "citeproc": "article-journal", + "schemaOrg": "ScholarlyArticle", + "resourceType": "Journal Article", + "resourceTypeGeneral": "Text" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [ + "text/html", + "application/pdf" + ], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.e-periodica.ch/digbib/view?pid=sle-001:1940-1941:45::13", + "contentUrl": null, + "metadataVersion": 17, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2013-03-22T14:02:08.000Z", + "registered": "2013-03-22T13:58:11.000Z", + "published": "1940", + "updated": "2019-08-02T02:22:55.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "ethz.seals", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_14.json b/python/tests/files/datacite/datacite_doc_14.json new file mode 100644 index 00000000..b1e1ebf2 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_14.json @@ -0,0 +1,166 @@ +{ + "id": "10.5517/cc7gns3", + "type": "dois", + "attributes": { + "doi": "10.5517/cc7gns3", + "identifiers": [ + { + "identifier": "https://doi.org/10.5517/cc7gns3", + "identifierType": "DOI" + }, + 
{ + "identifier": "222635", + "identifierType": "CCDC" + } + ], + "creators": [ + { + "name": "Stulz, E.", + "nameType": "Personal", + "givenName": "E.", + "familyName": "Stulz", + "affiliation": [] + }, + { + "name": "Scott, S.M.", + "nameType": "Personal", + "givenName": "S.M.", + "familyName": "Scott", + "affiliation": [] + }, + { + "name": "Ng, Yiu-Fai", + "nameType": "Personal", + "givenName": "Yiu-Fai", + "familyName": "Ng", + "affiliation": [] + }, + { + "name": "Bond, A.D.", + "nameType": "Personal", + "givenName": "A.D.", + "familyName": "Bond", + "affiliation": [] + }, + { + "name": "Teat, S.J.", + "nameType": "Personal", + "givenName": "S.J.", + "familyName": "Teat", + "affiliation": [] + }, + { + "name": "Darling, S.L.", + "nameType": "Personal", + "givenName": "S.L.", + "familyName": "Darling", + "affiliation": [] + }, + { + "name": "Feeder, N.", + "nameType": "Personal", + "givenName": "N.", + "familyName": "Feeder", + "affiliation": [] + }, + { + "name": "Sanders, J.K.M.", + "nameType": "Personal", + "givenName": "J.K.M.", + "familyName": "Sanders", + "affiliation": [] + } + ], + "titles": [ + { + "title": "CCDC 222635: Experimental Crystal Structure Determination" + } + ], + "publisher": "Cambridge Crystallographic Data Centre", + "container": {}, + "publicationYear": 2004, + "subjects": [ + { + "subject": "Crystal Structure" + }, + { + "subject": "Experimental 3D Coordinates" + }, + { + "subject": "Crystal System" + }, + { + "subject": "Space Group" + }, + { + "subject": "Cell Parameters" + }, + { + "subject": "Crystallography" + }, + { + "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate" + } + ], + "contributors": [], + "dates": [ + { + "date": "2004", + "dateType": "Issued" + } + ], + "language": "eng", + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [ + { + "relationType": "IsSupplementTo", + "relatedIdentifier": "10.1021/ic034699w", + "relatedIdentifierType": "DOI" + } + ], + "sizes": [], + "formats": [ + "CIF" + ], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Related Article: E.Stulz, S.M.Scott, Yiu-Fai Ng, A.D.Bond, S.J.Teat, S.L.Darling, N.Feeder, J.K.M.Sanders|2003|Inorg.Chem.|42|6564|doi:10.1021/ic034699w", + "descriptionType": "Other" + }, + { + "description": "An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. 
The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/cc7gns3&sid=DataCite", + "contentUrl": null, + "metadataVersion": 2, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2014-03-18T07:28:28.000Z", + "registered": "2014-03-18T07:28:29.000Z", + "published": "2004", + "updated": "2019-08-02T03:38:32.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "ccdc.csd", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_15.json b/python/tests/files/datacite/datacite_doc_15.json new file mode 100644 index 00000000..5b4ee8ec --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_15.json @@ -0,0 +1,79 @@ +{ + "id": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "type": "dois", + "attributes": { + "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "identifiers": [ + { + "identifier": "https://doi.org/10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "identifierType": "DOI" + }, + { + "identifier": "https://pasta.lternet.edu/package/eml/knb-lter-vcr/102/16", + "identifierType": "URL" + } + ], + "creators": [ + { + "name": "Richardson, David", + "nameType": "Personal", + "givenName": "David", + "familyName": "Richardson", + "affiliation": [] + } + ], + "titles": [ + { + "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997" + } + ], + "publisher": "Environmental Data Initiative", + "container": {}, + "publicationYear": 2017, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2017", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceType": "dataPackage", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://portal.lternet.edu/nis/mapbrowse?packageid=knb-lter-vcr.102.16", + "contentUrl": null, + "metadataVersion": 1, + "schemaVersion": "http://datacite.org/schema/kernel-2.2", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2017-02-01T18:20:04.000Z", + "registered": "2017-02-01T18:20:05.000Z", + "published": "2017", + "updated": "2019-08-02T14:16:49.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "edi.edi", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_16.json b/python/tests/files/datacite/datacite_doc_16.json new file mode 100644 index 00000000..5af7fbe1 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_16.json @@ -0,0 +1,80 @@ +{ + "id": "10.6084/m9.figshare.1282478", + "type": "dois", + "attributes": { + "doi": "10.6084/m9.figshare.1282478", + "identifiers": [ + { + "identifier": "https://doi.org/10.6084/m9.figshare.1282478", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Sochi, Taha", + "nameType": "Personal", + "givenName": "Taha", + "familyName": "Sochi", + "affiliation": [] + } + ], + "titles": [ + { + "title": "Testing the Connectivity of Networks" + } + ], + 
"publisher": "Figshare", + "container": {}, + "publicationYear": 2014, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2014", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceType": "Paper", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [ + { + "rights": "CC-BY", + "rightsUri": "http://creativecommons.org/licenses/by/3.0/us" + } + ], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "http://figshare.com/articles/Testing_the_Connectivity_of_Networks/1282478", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2014-12-31T15:38:16.000Z", + "registered": "2014-12-31T15:38:18.000Z", + "published": "2014", + "updated": "2019-08-02T04:52:11.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "figshare.ars", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_17.json b/python/tests/files/datacite/datacite_doc_17.json new file mode 100644 index 00000000..f1363a61 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_17.json @@ -0,0 +1,72 @@ +{ + "id": "10.7910/dvn/tsqfwc/yytj22", + "type": "dois", + "attributes": { + "doi": "10.7910/dvn/tsqfwc/yytj22", + "identifiers": [ + { + "identifier": "https://doi.org/10.7910/dvn/tsqfwc/yytj22", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "Di Giovanna, Antonino Paolo (University Of Florence)", + "nameType": "Personal", + "affiliation": [] + } + ], + "titles": [ + { + "title": "gel_BSA-FITC_Markov_segmntation0343.tif" + } + ], + "publisher": "Harvard Dataverse", + "container": {}, + "publicationYear": 2018, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2018", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "DATA", + "bibtex": "misc", + "citeproc": "dataset", + "schemaOrg": "Dataset", + "resourceTypeGeneral": "Dataset" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TSQFWC/YYTJ22", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": "http://datacite.org/schema/kernel-4", + "source": "mds", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2018-08-22T17:36:10.000Z", + "registered": "2018-08-22T17:37:30.000Z", + "published": "2018", + "updated": "2019-08-02T19:43:20.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "gdcc.harvard-dv", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_18.json b/python/tests/files/datacite/datacite_doc_18.json new file mode 100644 index 00000000..f6bc81a6 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_18.json @@ -0,0 +1,79 @@ +{ + "id": "10.7916/d81z522m", + "type": "dois", + "attributes": { + "doi": "10.7916/d81z522m", + "identifiers": [ + { + "identifier": "https://doi.org/10.7916/d81z522m", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "(:Unav)", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "Eastern questionnaire, answer sheet for Interviewee 53215, 
page 064" + } + ], + "publisher": "Columbia University", + "container": {}, + "publicationYear": 2017, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2017-08-21", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://dlc.library.columbia.edu/lcaaj/cul:k3j9kd52d6", + "contentUrl": null, + "metadataVersion": 2, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": "ez", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2017-11-29T02:15:31.000Z", + "registered": "2017-11-29T02:15:32.000Z", + "published": "2017", + "updated": "2019-08-04T13:17:58.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "cul.columbia", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_19.json b/python/tests/files/datacite/datacite_doc_19.json new file mode 100644 index 00000000..c0bc25ba --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_19.json @@ -0,0 +1,79 @@ +{ + "id": "10.7916/d86x0cg1", + "type": "dois", + "attributes": { + "doi": "10.7916/d86x0cg1", + "identifiers": [ + { + "identifier": "https://doi.org/10.7916/d86x0cg1", + "identifierType": "DOI" + } + ], + "creators": [ + { + "name": "(:Unav)", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092" + } + ], + "publisher": "Columbia University", + "container": {}, + "publicationYear": 2017, + "subjects": [], + "contributors": [], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [], + "geoLocations": [], + "fundingReferences": [], + "url": "https://dlc.library.columbia.edu/lcaaj/cul:44j0zpc98s", + "contentUrl": null, + "metadataVersion": 3, + "schemaVersion": "http://datacite.org/schema/kernel-3", + "source": "ez", + "isActive": true, + "state": "findable", + "reason": null, + "created": "2017-11-29T09:29:33.000Z", + "registered": "2017-11-29T09:29:34.000Z", + "published": "2017", + "updated": "2019-08-04T23:43:40.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "cul.columbia", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_20.json b/python/tests/files/datacite/datacite_doc_20.json new file mode 100644 index 00000000..964e2cbb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_20.json @@ -0,0 +1,42 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "(:Unav)", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "

Eastern questionnaire

" + } + ], + "publicationYear": 2017, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "language": null, + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "isActive": true, + "state": "findable" + } + } + \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_doc_21.json b/python/tests/files/datacite/datacite_doc_21.json new file mode 100644 index 00000000..cae7f40f --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_21.json @@ -0,0 +1,42 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "(:Unav)", + "affiliation": [], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "ABC" + } + ], + "publicationYear": 2017, + "language": "GERMAN", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } + } + \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_doc_22.json b/python/tests/files/datacite/datacite_doc_22.json new file mode 100644 index 00000000..42448ddf --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_22.json @@ -0,0 +1,44 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "ABC" + } + ], + "publicationYear": 2017, + "language": "GERMAN", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } + } + diff --git a/python/tests/files/datacite/datacite_doc_23.json b/python/tests/files/datacite/datacite_doc_23.json new file mode 100644 index 00000000..1e5bcc3f --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_23.json @@ -0,0 +1,44 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1\u2013xxx", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "ABC" + } + ], + "publicationYear": 2017, + "language": "GERMAN", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } + } + diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json new file mode 100644 index 00000000..085e23f3 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_00.json @@ -0,0 +1,87 @@ +{ + "extra": { + "container_name": "Journal of Chemical Crystallography", + "datacite": { + "license": [ + { + "rightsUri": "http://www.springer.com/tdm" + } + ], + "relations": [ + { + "relationType": "IsPartOf", + "relatedIdentifier": "1074-1542", + "resourceTypeGeneral": 
"Collection", + "relatedIdentifierType": "ISSN" + } + ] + } + }, + "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N\u2032-(4-nitrophenyl)thiourea", + "release_type": "article-journal", + "release_stage": "published", + "release_date": "2019-05-31", + "release_year": 2019, + "ext_ids": { + "doi": "10.1007/s10870-008-9413-z" + }, + "volume": "38", + "issue": "12", + "pages": "927-930", + "publisher": "Springer Science and Business Media LLC", + "contribs": [ + { + "index": 0, + "raw_name": "Li, Qian-Jin", + "given_name": "Qian-Jin", + "surname": "Li", + "role": "author" + }, + { + "index": 1, + "raw_name": "Yang, Chun-Long", + "given_name": "Chun-Long", + "surname": "Yang", + "role": "author" + } + ], + "refs": [ + { + "index": 0, + "extra": { + "doi": "10.1016/j.bmcl.2005.09.033" + } + }, + { + "index": 1, + "extra": { + "doi": "10.1016/s0022-1139(02)00330-5" + } + }, + { + "index": 2, + "extra": { + "doi": "10.1016/s0010-8545(01)00337-x" + } + }, + { + "index": 3, + "extra": { + "doi": "10.1016/j.tetlet.2005.06.135" + } + }, + { + "index": 4, + "extra": { + "doi": "10.1039/p298700000s1" + } + }, + { + "index": 5, + "extra": { + "doi": "10.1002/anie.199515551" + } + } + ], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json new file mode 100644 index 00000000..f8c6b930 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_01.json @@ -0,0 +1,32 @@ +{ + "extra": { + "datacite": { + "license": [ + { + "lang": "de", + "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html" + } + ] + } + }, + "title": "Ferdinand Gaillard, [1]: n\u00e9 \u00e0 Paris le 16 janvier 1834, mort \u00e0 Paris le 19 janvier 1887", + "release_type": "article-journal", + "release_stage": "published", + "release_year": 1887, + "ext_ids": { + "doi": "10.11588/diglit.25558.39" + }, + "publisher": "University Library Heidelberg", + "language": "fr", + "contribs": [ + { + "index": 0, + "raw_name": "Dargenty, G.", + "given_name": "G.", + "surname": "Dargenty", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json new file mode 100644 index 00000000..f8b85f38 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_02.json @@ -0,0 +1,36 @@ +{ + "extra": { + "datacite": { + "license": [ + { + "lang": "de", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/" + }, + { + "lang": "en", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/" + } + ] + } + }, + "title": "Solinger Schwertschmiede-Familien, [4]", + "release_type": "article-journal", + "release_stage": "published", + "release_year": 1897, + "ext_ids": { + "doi": "10.11588/diglit.37715.57" + }, + "publisher": "University Library Heidelberg", + "language": "de", + "contribs": [ + { + "index": 0, + "raw_name": "Weyersberg, Albert", + "given_name": "Albert", + "surname": "Weyersberg", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_03.json 
b/python/tests/files/datacite/datacite_result_03.json new file mode 100644 index 00000000..3e3c2bd5 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_03.json @@ -0,0 +1,19 @@ +{ + "extra": {}, + "title": "midterm ah30903", + "release_type": "article", + "release_year": 2016, + "ext_ids": { + "doi": "10.13140/rg.2.2.30434.53446" + }, + "language": "ms", + "contribs": [ + { + "index": 0, + "raw_name": "Mastura Yahya", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json new file mode 100644 index 00000000..7ca70d6c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_04.json @@ -0,0 +1,28 @@ +{ + "extra": {}, + "title": "On chain maps inducing isomorphisms in homology", + "release_type": "article-journal", + "release_stage": "published", + "release_year": 1973, + "ext_ids": { + "doi": "10.14288/1.0080520" + }, + "publisher": "University of British Columbia", + "language": "en", + "contribs": [ + { + "index": 0, + "raw_name": "Nicollerat, Marc Andre", + "given_name": "Marc Andre", + "surname": "Nicollerat", + "role": "author" + } + ], + "refs": [], + "abstracts": [ + { + "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor J : K(A) \u2192 K(I) and a natural transformation [formula omitted] (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. 
We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.", + "mimetype": "text/plain" + } + ] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json new file mode 100644 index 00000000..e61769de --- /dev/null +++ b/python/tests/files/datacite/datacite_result_05.json @@ -0,0 +1,530 @@ +{ + "extra": { + "datacite": { + "license": [ + { + "rights": "Attribution-NonCommercial (CC BY-NC)", + "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0" + } + ] + } + }, + "title": "SH409843.07FU", + "subtitle": "Gomphales", + "release_type": "dataset", + "release_stage": "published", + "release_date": "2014-10-05", + "release_year": 2014, + "ext_ids": { + "doi": "10.15156/bio/sh409843.07fu" + }, + "publisher": "UNITE Community", + "language": "en", + "license_slug": "CC-BY-NC", + "contribs": [ + { + "index": 0, + "raw_name": "K\u00f5ljalg, Urmas", + "given_name": "Urmas", + "surname": "K\u00f5ljalg", + "role": "author" + }, + { + "index": 1, + "raw_name": "Abarenkov, Kessy", + "given_name": "Kessy", + "surname": "Abarenkov", + "role": "author" + }, + { + "index": 2, + "raw_name": "Nilsson, R. Henrik", + "given_name": "R. Henrik", + "surname": "Nilsson", + "role": "author" + }, + { + "index": 3, + "raw_name": "Larsson, Karl-Henrik", + "given_name": "Karl-Henrik", + "surname": "Larsson", + "role": "author" + }, + { + "index": 4, + "raw_name": "Aas, Anders Bj\u00f8rnsgard", + "given_name": "Anders Bj\u00f8rnsgard", + "surname": "Aas", + "role": "author" + }, + { + "index": 5, + "raw_name": "Adams, Rachel", + "given_name": "Rachel", + "surname": "Adams", + "role": "author" + }, + { + "index": 6, + "raw_name": "Alves, Artur", + "given_name": "Artur", + "surname": "Alves", + "role": "author" + }, + { + "index": 7, + "raw_name": "Ammirati, Joseph F.", + "given_name": "Joseph F.", + "surname": "Ammirati", + "role": "author" + }, + { + "index": 8, + "raw_name": "Arnold, A. Elizabeth", + "given_name": "A. 
Elizabeth", + "surname": "Arnold", + "role": "author" + }, + { + "index": 9, + "raw_name": "Bahram, Mohammad", + "given_name": "Mohammad", + "surname": "Bahram", + "role": "author" + }, + { + "index": 10, + "raw_name": "Bengtsson-Palme, Johan", + "given_name": "Johan", + "surname": "Bengtsson-Palme", + "role": "author" + }, + { + "index": 11, + "raw_name": "Berlin, Anna", + "given_name": "Anna", + "surname": "Berlin", + "role": "author" + }, + { + "index": 12, + "raw_name": "Botnen, Synn\u00f8ve", + "given_name": "Synn\u00f8ve", + "surname": "Botnen", + "role": "author" + }, + { + "index": 13, + "raw_name": "Bourlat, Sarah", + "given_name": "Sarah", + "surname": "Bourlat", + "role": "author" + }, + { + "index": 14, + "raw_name": "Cheeke, Tanya", + "given_name": "Tanya", + "surname": "Cheeke", + "role": "author" + }, + { + "index": 15, + "raw_name": "Dima, B\u00e1lint", + "given_name": "B\u00e1lint", + "surname": "Dima", + "role": "author" + }, + { + "index": 16, + "raw_name": "Drenkhan, Rein", + "given_name": "Rein", + "surname": "Drenkhan", + "role": "author" + }, + { + "index": 17, + "raw_name": "Duarte, Camila", + "given_name": "Camila", + "surname": "Duarte", + "role": "author" + }, + { + "index": 18, + "raw_name": "Due\u00f1as, Margarita", + "given_name": "Margarita", + "surname": "Due\u00f1as", + "role": "author" + }, + { + "index": 19, + "raw_name": "Eberhardt, Ursula", + "given_name": "Ursula", + "surname": "Eberhardt", + "role": "author" + }, + { + "index": 20, + "raw_name": "Friberg, Hanna", + "given_name": "Hanna", + "surname": "Friberg", + "role": "author" + }, + { + "index": 21, + "raw_name": "Fr\u00f8slev, Tobias G.", + "given_name": "Tobias G.", + "surname": "Fr\u00f8slev", + "role": "author" + }, + { + "index": 22, + "raw_name": "Garnica, Sigisfredo", + "given_name": "Sigisfredo", + "surname": "Garnica", + "role": "author" + }, + { + "index": 23, + "raw_name": "Geml, J\u00f3zsef", + "given_name": "J\u00f3zsef", + "surname": "Geml", + "role": "author" + }, + { + "index": 24, + "raw_name": "Ghobad-Nejhad, Masoomeh", + "given_name": "Masoomeh", + "surname": "Ghobad-Nejhad", + "role": "author" + }, + { + "index": 25, + "raw_name": "Grebenc, Tine", + "given_name": "Tine", + "surname": "Grebenc", + "role": "author" + }, + { + "index": 26, + "raw_name": "Griffith, Gareth W.", + "given_name": "Gareth W.", + "surname": "Griffith", + "role": "author" + }, + { + "index": 27, + "raw_name": "Hampe, Felix", + "given_name": "Felix", + "surname": "Hampe", + "role": "author" + }, + { + "index": 28, + "raw_name": "Kennedy, Peter", + "given_name": "Peter", + "surname": "Kennedy", + "role": "author" + }, + { + "index": 29, + "raw_name": "Khomich, Maryia", + "given_name": "Maryia", + "surname": "Khomich", + "role": "author" + }, + { + "index": 30, + "raw_name": "Kohout, Petr", + "given_name": "Petr", + "surname": "Kohout", + "role": "author" + }, + { + "index": 31, + "raw_name": "Kollom, Anu", + "given_name": "Anu", + "surname": "Kollom", + "role": "author" + }, + { + "index": 32, + "raw_name": "Larsson, Ellen", + "given_name": "Ellen", + "surname": "Larsson", + "role": "author" + }, + { + "index": 33, + "raw_name": "Laszlo, Irinyi", + "given_name": "Irinyi", + "surname": "Laszlo", + "role": "author" + }, + { + "index": 34, + "raw_name": "Leavitt, Steven", + "given_name": "Steven", + "surname": "Leavitt", + "role": "author" + }, + { + "index": 35, + "raw_name": "Liimatainen, Kare", + "given_name": "Kare", + "surname": "Liimatainen", + "role": "author" + }, + { + "index": 36, + "raw_name": 
"Lindahl, Bj\u00f6rn", + "given_name": "Bj\u00f6rn", + "surname": "Lindahl", + "role": "author" + }, + { + "index": 37, + "raw_name": "Lodge, Deborah J.", + "given_name": "Deborah J.", + "surname": "Lodge", + "role": "author" + }, + { + "index": 38, + "raw_name": "Lumbsch, Helge Thorsten", + "given_name": "Helge Thorsten", + "surname": "Lumbsch", + "role": "author" + }, + { + "index": 39, + "raw_name": "Mart\u00edn Esteban, Mar\u00eda Paz", + "given_name": "Mar\u00eda Paz", + "surname": "Mart\u00edn Esteban", + "role": "author" + }, + { + "index": 40, + "raw_name": "Meyer, Wieland", + "given_name": "Wieland", + "surname": "Meyer", + "role": "author" + }, + { + "index": 41, + "raw_name": "Miettinen, Otto", + "given_name": "Otto", + "surname": "Miettinen", + "role": "author" + }, + { + "index": 42, + "raw_name": "Nguyen, Nhu", + "given_name": "Nhu", + "surname": "Nguyen", + "role": "author" + }, + { + "index": 43, + "raw_name": "Niskanen, Tuula", + "given_name": "Tuula", + "surname": "Niskanen", + "role": "author" + }, + { + "index": 44, + "raw_name": "Oono, Ryoko", + "given_name": "Ryoko", + "surname": "Oono", + "role": "author" + }, + { + "index": 45, + "raw_name": "\u00d6pik, Maarja", + "given_name": "Maarja", + "surname": "\u00d6pik", + "role": "author" + }, + { + "index": 46, + "raw_name": "Ordynets, Alexander", + "given_name": "Alexander", + "surname": "Ordynets", + "role": "author" + }, + { + "index": 47, + "raw_name": "Paw\u0142owska, Julia", + "given_name": "Julia", + "surname": "Paw\u0142owska", + "role": "author" + }, + { + "index": 48, + "raw_name": "Peintner, Ursula", + "given_name": "Ursula", + "surname": "Peintner", + "role": "author" + }, + { + "index": 49, + "raw_name": "Pereira, Olinto Liparini", + "given_name": "Olinto Liparini", + "surname": "Pereira", + "role": "author" + }, + { + "index": 50, + "raw_name": "Pinho, Danilo Batista", + "given_name": "Danilo Batista", + "surname": "Pinho", + "role": "author" + }, + { + "index": 51, + "raw_name": "P\u00f5ldmaa, Kadri", + "given_name": "Kadri", + "surname": "P\u00f5ldmaa", + "role": "author" + }, + { + "index": 52, + "raw_name": "Runnel, Kadri", + "given_name": "Kadri", + "surname": "Runnel", + "role": "author" + }, + { + "index": 53, + "raw_name": "Ryberg, Martin", + "given_name": "Martin", + "surname": "Ryberg", + "role": "author" + }, + { + "index": 54, + "raw_name": "Saar, Irja", + "given_name": "Irja", + "surname": "Saar", + "role": "author" + }, + { + "index": 55, + "raw_name": "Sanli, Kemal", + "given_name": "Kemal", + "surname": "Sanli", + "role": "author" + }, + { + "index": 56, + "raw_name": "Scott, James", + "given_name": "James", + "surname": "Scott", + "role": "author" + }, + { + "index": 57, + "raw_name": "Spirin, Viacheslav", + "given_name": "Viacheslav", + "surname": "Spirin", + "role": "author" + }, + { + "index": 58, + "raw_name": "Suija, Ave", + "given_name": "Ave", + "surname": "Suija", + "role": "author" + }, + { + "index": 59, + "raw_name": "Svantesson, Sten", + "given_name": "Sten", + "surname": "Svantesson", + "role": "author" + }, + { + "index": 60, + "raw_name": "Tadych, Mariusz", + "given_name": "Mariusz", + "surname": "Tadych", + "role": "author" + }, + { + "index": 61, + "raw_name": "Takamatsu, Susumu", + "given_name": "Susumu", + "surname": "Takamatsu", + "role": "author" + }, + { + "index": 62, + "raw_name": "Tamm, Heidi", + "given_name": "Heidi", + "surname": "Tamm", + "role": "author" + }, + { + "index": 63, + "raw_name": "Taylor, AFS.", + "given_name": "AFS.", + "surname": "Taylor", + "role": 
"author" + }, + { + "index": 64, + "raw_name": "Tedersoo, Leho", + "given_name": "Leho", + "surname": "Tedersoo", + "role": "author" + }, + { + "index": 65, + "raw_name": "Telleria, M.T.", + "given_name": "M.T.", + "surname": "Telleria", + "role": "author" + }, + { + "index": 66, + "raw_name": "Udayanga, Dhanushka", + "given_name": "Dhanushka", + "surname": "Udayanga", + "role": "author" + }, + { + "index": 67, + "raw_name": "Unterseher, Martin", + "given_name": "Martin", + "surname": "Unterseher", + "role": "author" + }, + { + "index": 68, + "raw_name": "Volobuev, Sergey", + "given_name": "Sergey", + "surname": "Volobuev", + "role": "author" + }, + { + "index": 69, + "raw_name": "Weiss, Michael", + "given_name": "Michael", + "surname": "Weiss", + "role": "author" + }, + { + "index": 70, + "raw_name": "Wurzbacher, Christian", + "given_name": "Christian", + "surname": "Wurzbacher", + "role": "author" + } + ], + "refs": [], + "abstracts": [ + { + "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. 
The system and the data are updated automatically as the number of public fungal ITS sequences grows.", + "mimetype": "text/plain" + } + ] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_06.json b/python/tests/files/datacite/datacite_result_06.json new file mode 100644 index 00000000..61f2549d --- /dev/null +++ b/python/tests/files/datacite/datacite_result_06.json @@ -0,0 +1,26 @@ +{ + "extra": { + "datacite": { + "license": [ + { + "rights": "ETH-Bibliothek Z\u00fcrich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0" + } + ] + } + }, + "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"", + "release_type": "article", + "release_year": 1590, + "ext_ids": { + "doi": "10.16903/ethz-grs-d_006220" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Crispijn De Passe (Der \u00c4ltere) (1564-1637)", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json new file mode 100644 index 00000000..324bb663 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_07.json @@ -0,0 +1,73 @@ +{ + "extra": { + "datacite": { + "subjects": [ + { + "subject": "HEAT PUMP" + }, + { + "subject": "HOT WATER" + }, + { + "subject": "HEAT TRANSFER" + }, + { + "subject": "PERFORMANCE" + }, + { + "subject": "THERMAL STORAGE" + }, + { + "subject": "TANK" + }, + { + "subject": "MODEL" + } + ] + } + }, + "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation.", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2015, + "ext_ids": { + "doi": "10.18462/iir.icr.2015.0926" + }, + "publisher": "International Institute of Refrigeration (IIR)", + "language": "en", + "contribs": [ + { + "index": 0, + "raw_name": "ROTHUIZEN, E.", + "given_name": "E.", + "surname": "ROTHUIZEN", + "role": "author" + }, + { + "index": 1, + "raw_name": "ELMEGAARD, B.", + "given_name": "B.", + "surname": "ELMEGAARD", + "role": "author" + }, + { + "index": 2, + "raw_name": "MARKUSSEN W., B.", + "given_name": "B.", + "surname": "MARKUSSEN W.", + "role": "author" + }, + { + "index": 3, + "raw_name": "Et Al.", + "role": "author" + } + ], + "refs": [], + "abstracts": [ + { + "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. 
This paper shows that there is a good consistency between the model and the experimental tests.", + "mimetype": "text/plain" + } + ] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json new file mode 100644 index 00000000..281c3679 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_08.json @@ -0,0 +1,53 @@ +{ + "extra": { + "datacite": { + "subjects": [ + { + "subject": "Land Economics/Use" + }, + { + "subject": "irrigation", + "subjectScheme": "keyword" + }, + { + "subject": "industrialization", + "subjectScheme": "keyword" + }, + { + "subject": "collective action", + "subjectScheme": "keyword" + } + ] + } + }, + "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India", + "release_type": "article-journal", + "release_year": 2017, + "ext_ids": { + "doi": "10.22004/ag.econ.284864" + }, + "language": "en", + "contribs": [ + { + "index": 0, + "raw_name": "Kajisa, Kei", + "given_name": "Kei", + "surname": "Kajisa", + "role": "author" + }, + { + "index": 1, + "raw_name": "Kajisa, Kei", + "given_name": "Kei", + "surname": "Kajisa", + "role": "author" + } + ], + "refs": [], + "abstracts": [ + { + "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", + "mimetype": "text/plain" + } + ] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json new file mode 100644 index 00000000..01f92f85 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_09.json @@ -0,0 +1,35 @@ +{ + "extra": { + "datacite": { + "subjects": [ + { + "subject": "Direktdiodenlasersysteme" + }, + { + "subject": "Physics", + "subjectScheme": "linsearch" + } + ] + } + }, + "title": "BrightLas : TP3.3. 
Module f\u00fcr Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im F\u00f6rderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht", + "release_type": "report", + "release_stage": "published", + "release_year": 2016, + "ext_ids": { + "doi": "10.2314/gbv:880813733" + }, + "publisher": "[Lumics GmbH]", + "language": "de", + "contribs": [ + { + "index": 0, + "raw_name": "Kirstaedter, Nils", + "given_name": "Nils", + "surname": "Kirstaedter", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json new file mode 100644 index 00000000..325facf7 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_10.json @@ -0,0 +1,32 @@ +{ + "extra": { + "datacite": { + "subjects": [ + { + "subject": "housing areas" + }, + { + "subject": "Dwellings" + } + ] + } + }, + "title": "WPA household census for 210 E VERNON, Los Angeles", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2012, + "ext_ids": { + "doi": "10.25549/wpacards-m6171" + }, + "publisher": "University of Southern California Digital Library (USC.DL)", + "language": "en", + "contribs": [ + { + "index": 0, + "raw_name": "Unknown", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json new file mode 100644 index 00000000..037c5ac2 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_11.json @@ -0,0 +1,21 @@ +{ + "extra": {}, + "title": "N1 bei Safenwil", + "release_type": "graphic", + "release_stage": "published", + "release_year": 1965, + "ext_ids": { + "doi": "10.3932/ethz-a-000055869" + }, + "publisher": "ETH-Bibliothek Z\u00fcrich, Bildarchiv", + "language": "de", + "contribs": [ + { + "index": 0, + "raw_name": "Comet Photo AG (Z\u00fcrich)", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json new file mode 100644 index 00000000..6b6cad4a --- /dev/null +++ b/python/tests/files/datacite/datacite_result_12.json @@ -0,0 +1,44 @@ +{ + "extra": {}, + "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review", + "release_type": "article-journal", + "release_stage": "published", + "release_date": "2019-06-14", + "release_year": 2019, + "ext_ids": { + "doi": "10.5167/uzh-171449" + }, + "publisher": "MDPI Publishing", + "contribs": [ + { + "index": 0, + "raw_name": "Spanias, Charalampos", + "given_name": "Charalampos", + "surname": "Spanias", + "role": "author" + }, + { + "index": 1, + "raw_name": "Nikolaidis, Pantelis T", + "given_name": "Pantelis T", + "surname": "Nikolaidis", + "role": "author" + }, + { + "index": 2, + "raw_name": "Rosemann, Thomas", + "given_name": "Thomas", + "surname": "Rosemann", + "role": "author" + }, + { + "index": 3, + "raw_name": "Knechtle, Beat", + "given_name": "Beat", + "surname": "Knechtle", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json new file mode 100644 index 00000000..3da3816d --- 
/dev/null +++ b/python/tests/files/datacite/datacite_result_13.json @@ -0,0 +1,28 @@ +{ + "extra": {}, + "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]", + "release_type": "article-journal", + "release_stage": "published", + "release_date": "1940-10-05", + "release_year": 1940, + "ext_ids": { + "doi": "10.5169/seals-314104" + }, + "publisher": "Buchdruckerei B\u00fcchler & Co.", + "contribs": [ + { + "index": 0, + "raw_name": "O.M.", + "role": "author" + }, + { + "index": 1, + "raw_name": "Hiltbrunner, Hermann", + "given_name": "Hermann", + "surname": "Hiltbrunner", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json new file mode 100644 index 00000000..94c00472 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_14.json @@ -0,0 +1,110 @@ +{ + "extra": { + "datacite": { + "subjects": [ + { + "subject": "Crystal Structure" + }, + { + "subject": "Experimental 3D Coordinates" + }, + { + "subject": "Crystal System" + }, + { + "subject": "Space Group" + }, + { + "subject": "Cell Parameters" + }, + { + "subject": "Crystallography" + }, + { + "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate" + } + ], + "relations": [ + { + "relationType": "IsSupplementTo", + "relatedIdentifier": "10.1021/ic034699w", + "relatedIdentifierType": "DOI" + } + ] + } + }, + "title": "CCDC 222635: Experimental Crystal Structure Determination", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2004, + "ext_ids": { + "doi": "10.5517/cc7gns3" + }, + "publisher": "Cambridge Crystallographic Data Centre", + "language": "en", + "contribs": [ + { + "index": 0, + "raw_name": "Stulz, E.", + "given_name": "E.", + "surname": "Stulz", + "role": "author" + }, + { + "index": 1, + "raw_name": "Scott, S.M.", + "given_name": "S.M.", + "surname": "Scott", + "role": "author" + }, + { + "index": 2, + "raw_name": "Ng, Yiu-Fai", + "given_name": "Yiu-Fai", + "surname": "Ng", + "role": "author" + }, + { + "index": 3, + "raw_name": "Bond, A.D.", + "given_name": "A.D.", + "surname": "Bond", + "role": "author" + }, + { + "index": 4, + "raw_name": "Teat, S.J.", + "given_name": "S.J.", + "surname": "Teat", + "role": "author" + }, + { + "index": 5, + "raw_name": "Darling, S.L.", + "given_name": "S.L.", + "surname": "Darling", + "role": "author" + }, + { + "index": 6, + "raw_name": "Feeder, N.", + "given_name": "N.", + "surname": "Feeder", + "role": "author" + }, + { + "index": 7, + "raw_name": "Sanders, J.K.M.", + "given_name": "J.K.M.", + "surname": "Sanders", + "role": "author" + } + ], + "refs": [], + "abstracts": [ + { + "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. 
The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", + "mimetype": "text/plain" + } + ] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json new file mode 100644 index 00000000..0614f6ba --- /dev/null +++ b/python/tests/files/datacite/datacite_result_15.json @@ -0,0 +1,22 @@ +{ + "extra": {}, + "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2017, + "ext_ids": { + "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28" + }, + "publisher": "Environmental Data Initiative", + "contribs": [ + { + "index": 0, + "raw_name": "Richardson, David", + "given_name": "David", + "surname": "Richardson", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json new file mode 100644 index 00000000..1d861cf6 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_16.json @@ -0,0 +1,31 @@ +{ + "extra": { + "datacite": { + "license": [ + { + "rights": "CC-BY", + "rightsUri": "http://creativecommons.org/licenses/by/3.0/us" + } + ] + } + }, + "title": "Testing the Connectivity of Networks", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2014, + "ext_ids": { + "doi": "10.6084/m9.figshare.1282478" + }, + "publisher": "Figshare", + "contribs": [ + { + "index": 0, + "raw_name": "Sochi, Taha", + "given_name": "Taha", + "surname": "Sochi", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_17.json b/python/tests/files/datacite/datacite_result_17.json new file mode 100644 index 00000000..0852a09e --- /dev/null +++ b/python/tests/files/datacite/datacite_result_17.json @@ -0,0 +1,20 @@ +{ + "extra": {}, + "title": "gel_BSA-FITC_Markov_segmntation0343.tif", + "release_type": "dataset", + "release_stage": "published", + "release_year": 2018, + "ext_ids": { + "doi": "10.7910/dvn/tsqfwc/yytj22" + }, + "publisher": "Harvard Dataverse", + "contribs": [ + { + "index": 0, + "raw_name": "Di Giovanna, Antonino Paolo (University Of Florence)", + "role": "author" + } + ], + "refs": [], + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json new file mode 100644 index 00000000..12ab39fe --- /dev/null +++ b/python/tests/files/datacite/datacite_result_18.json @@ -0,0 +1,15 @@ +{ + "extra": {}, + "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-21", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d81z522m" + }, + "publisher": "Columbia University", + "contribs": [], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json new file mode 100644 index 00000000..1505db92 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_19.json @@ -0,0 +1,15 @@ +{ + "extra": {}, + "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092", + 
"release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "publisher": "Columbia University", + "contribs": [], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json new file mode 100644 index 00000000..1868eede --- /dev/null +++ b/python/tests/files/datacite/datacite_result_20.json @@ -0,0 +1,14 @@ +{ + "extra": {}, + "title": "

Eastern questionnaire

", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json new file mode 100644 index 00000000..9214065a --- /dev/null +++ b/python/tests/files/datacite/datacite_result_21.json @@ -0,0 +1,15 @@ +{ + "extra": {}, + "title": "ABC", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [], + "refs": [], + "abstracts": [], + "language": "de" +} diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json new file mode 100644 index 00000000..e9939e09 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_22.json @@ -0,0 +1,22 @@ +{ + "extra": {}, + "title": "ABC", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [ + { + "raw_affiliation": "Department of pataphysics", + "index": 0, + "raw_name": "Anton Welch", + "role": "author" + } + ], + "refs": [], + "abstracts": [], + "language": "de" +} diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json new file mode 100644 index 00000000..2bf66eae --- /dev/null +++ b/python/tests/files/datacite/datacite_result_23.json @@ -0,0 +1,22 @@ +{ + "extra": {}, + "title": "ABC", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1-xxx" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Anton Welch", + "role": "author", + "raw_affiliation": "Department of pataphysics" + } + ], + "refs": [], + "abstracts": [], + "language": "de" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index bc47a185..cdc165d7 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -7,7 +7,8 @@ import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates +from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi +from fatcat_tools.transforms import entity_to_dict from fixtures import api import json @@ -270,3 +271,26 @@ def test_datacite_dict_parse(datacite_importer): assert r.contribs[0].given_name == None assert r.contribs[0].surname == None assert len(r.refs) == 0 + +def test_clean_doi(): + assert clean_doi("10.25513/1812-3996.2017.1.34\u201342") == "10.25513/1812-3996.2017.1.34-42" + assert "123" == clean_doi("123") + +def test_datacite_conversions(datacite_importer): + """ + Datacite JSON to release entity JSON representation. The count is hardcoded + for now. 
+ """ + datacite_importer.debug = True + for i in range(24): + src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) + dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) + print('testing mapping from {} => {}'.format(src, dst)) + with open(src, 'r') as f: + re = datacite_importer.parse_record(json.load(f)) + result = entity_to_dict(re) + with open(dst, 'r') as f: + expected = json.loads(f.read()) + + assert result == expected + -- cgit v1.2.3 From be43049db0da2df4343bd5e1392d6c5201fc67d0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 2 Jan 2020 18:11:35 +0100 Subject: datacite: address raw_name index form comment > The convention for display_name and raw_name is to be how the name would normally be printed, not in index form (surname comma given_name). So we might need to un-encode names like "Tricart, Pierre". Use an additional `index_form_to_display_name` function to convert index from to display form, heuristically. --- python/fatcat_tools/importers/datacite.py | 43 +++++++ .../tests/files/datacite/datacite_result_00.json | 4 +- .../tests/files/datacite/datacite_result_01.json | 2 +- .../tests/files/datacite/datacite_result_02.json | 2 +- .../tests/files/datacite/datacite_result_04.json | 2 +- .../tests/files/datacite/datacite_result_05.json | 142 ++++++++++----------- .../tests/files/datacite/datacite_result_07.json | 6 +- .../tests/files/datacite/datacite_result_08.json | 4 +- .../tests/files/datacite/datacite_result_09.json | 2 +- .../tests/files/datacite/datacite_result_12.json | 8 +- .../tests/files/datacite/datacite_result_13.json | 2 +- .../tests/files/datacite/datacite_result_14.json | 16 +-- .../tests/files/datacite/datacite_result_15.json | 2 +- .../tests/files/datacite/datacite_result_16.json | 2 +- .../tests/files/datacite/datacite_result_18.json | 2 +- .../tests/files/datacite/datacite_result_19.json | 2 +- .../tests/files/datacite/datacite_result_20.json | 2 +- .../tests/files/datacite/datacite_result_21.json | 6 +- .../tests/files/datacite/datacite_result_22.json | 10 +- .../tests/files/datacite/datacite_result_23.json | 6 +- python/tests/import_datacite.py | 18 ++- 21 files changed, 171 insertions(+), 112 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a03587c0..bd135569 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -331,6 +331,10 @@ class DataciteImporter(EntityImporter): if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. + if name: + name = index_form_to_display_name(name) + contribs.append( fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, @@ -859,3 +863,42 @@ def clean_doi(doi): doi = doi.replace(c, "-") return doi +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. 
Muhammad Zeeshan Khan" + return s + stopwords = [ + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + ] + for stop in stopwords: + if stop.lower() in s.lower(): + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json index 085e23f3..a4b28076 100644 --- a/python/tests/files/datacite/datacite_result_00.json +++ b/python/tests/files/datacite/datacite_result_00.json @@ -32,14 +32,14 @@ "contribs": [ { "index": 0, - "raw_name": "Li, Qian-Jin", + "raw_name": "Qian-Jin Li", "given_name": "Qian-Jin", "surname": "Li", "role": "author" }, { "index": 1, - "raw_name": "Yang, Chun-Long", + "raw_name": "Chun-Long Yang", "given_name": "Chun-Long", "surname": "Yang", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json index f8c6b930..46be2515 100644 --- a/python/tests/files/datacite/datacite_result_01.json +++ b/python/tests/files/datacite/datacite_result_01.json @@ -21,7 +21,7 @@ "contribs": [ { "index": 0, - "raw_name": "Dargenty, G.", + "raw_name": "G. Dargenty", "given_name": "G.", "surname": "Dargenty", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json index f8b85f38..bdcb4951 100644 --- a/python/tests/files/datacite/datacite_result_02.json +++ b/python/tests/files/datacite/datacite_result_02.json @@ -25,7 +25,7 @@ "contribs": [ { "index": 0, - "raw_name": "Weyersberg, Albert", + "raw_name": "Albert Weyersberg", "given_name": "Albert", "surname": "Weyersberg", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json index 7ca70d6c..54b19ef9 100644 --- a/python/tests/files/datacite/datacite_result_04.json +++ b/python/tests/files/datacite/datacite_result_04.json @@ -12,7 +12,7 @@ "contribs": [ { "index": 0, - "raw_name": "Nicollerat, Marc Andre", + "raw_name": "Marc Andre Nicollerat", "given_name": "Marc Andre", "surname": "Nicollerat", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index e61769de..a790c26e 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -24,497 +24,497 @@ "contribs": [ { "index": 0, - "raw_name": "K\u00f5ljalg, Urmas", + "raw_name": "Urmas K\u00f5ljalg", "given_name": "Urmas", "surname": "K\u00f5ljalg", "role": "author" }, { "index": 1, - "raw_name": "Abarenkov, Kessy", + "raw_name": "Kessy Abarenkov", "given_name": "Kessy", "surname": "Abarenkov", "role": "author" }, { "index": 2, - "raw_name": "Nilsson, R. Henrik", + "raw_name": "R. Henrik Nilsson", "given_name": "R. 
Henrik", "surname": "Nilsson", "role": "author" }, { "index": 3, - "raw_name": "Larsson, Karl-Henrik", + "raw_name": "Karl-Henrik Larsson", "given_name": "Karl-Henrik", "surname": "Larsson", "role": "author" }, { "index": 4, - "raw_name": "Aas, Anders Bj\u00f8rnsgard", + "raw_name": "Anders Bj\u00f8rnsgard Aas", "given_name": "Anders Bj\u00f8rnsgard", "surname": "Aas", "role": "author" }, { "index": 5, - "raw_name": "Adams, Rachel", + "raw_name": "Rachel Adams", "given_name": "Rachel", "surname": "Adams", "role": "author" }, { "index": 6, - "raw_name": "Alves, Artur", + "raw_name": "Artur Alves", "given_name": "Artur", "surname": "Alves", "role": "author" }, { "index": 7, - "raw_name": "Ammirati, Joseph F.", + "raw_name": "Joseph F. Ammirati", "given_name": "Joseph F.", "surname": "Ammirati", "role": "author" }, { "index": 8, - "raw_name": "Arnold, A. Elizabeth", + "raw_name": "A. Elizabeth Arnold", "given_name": "A. Elizabeth", "surname": "Arnold", "role": "author" }, { "index": 9, - "raw_name": "Bahram, Mohammad", + "raw_name": "Mohammad Bahram", "given_name": "Mohammad", "surname": "Bahram", "role": "author" }, { "index": 10, - "raw_name": "Bengtsson-Palme, Johan", + "raw_name": "Johan Bengtsson-Palme", "given_name": "Johan", "surname": "Bengtsson-Palme", "role": "author" }, { "index": 11, - "raw_name": "Berlin, Anna", + "raw_name": "Anna Berlin", "given_name": "Anna", "surname": "Berlin", "role": "author" }, { "index": 12, - "raw_name": "Botnen, Synn\u00f8ve", + "raw_name": "Synn\u00f8ve Botnen", "given_name": "Synn\u00f8ve", "surname": "Botnen", "role": "author" }, { "index": 13, - "raw_name": "Bourlat, Sarah", + "raw_name": "Sarah Bourlat", "given_name": "Sarah", "surname": "Bourlat", "role": "author" }, { "index": 14, - "raw_name": "Cheeke, Tanya", + "raw_name": "Tanya Cheeke", "given_name": "Tanya", "surname": "Cheeke", "role": "author" }, { "index": 15, - "raw_name": "Dima, B\u00e1lint", + "raw_name": "B\u00e1lint Dima", "given_name": "B\u00e1lint", "surname": "Dima", "role": "author" }, { "index": 16, - "raw_name": "Drenkhan, Rein", + "raw_name": "Rein Drenkhan", "given_name": "Rein", "surname": "Drenkhan", "role": "author" }, { "index": 17, - "raw_name": "Duarte, Camila", + "raw_name": "Camila Duarte", "given_name": "Camila", "surname": "Duarte", "role": "author" }, { "index": 18, - "raw_name": "Due\u00f1as, Margarita", + "raw_name": "Margarita Due\u00f1as", "given_name": "Margarita", "surname": "Due\u00f1as", "role": "author" }, { "index": 19, - "raw_name": "Eberhardt, Ursula", + "raw_name": "Ursula Eberhardt", "given_name": "Ursula", "surname": "Eberhardt", "role": "author" }, { "index": 20, - "raw_name": "Friberg, Hanna", + "raw_name": "Hanna Friberg", "given_name": "Hanna", "surname": "Friberg", "role": "author" }, { "index": 21, - "raw_name": "Fr\u00f8slev, Tobias G.", + "raw_name": "Tobias G. 
Fr\u00f8slev", "given_name": "Tobias G.", "surname": "Fr\u00f8slev", "role": "author" }, { "index": 22, - "raw_name": "Garnica, Sigisfredo", + "raw_name": "Sigisfredo Garnica", "given_name": "Sigisfredo", "surname": "Garnica", "role": "author" }, { "index": 23, - "raw_name": "Geml, J\u00f3zsef", + "raw_name": "J\u00f3zsef Geml", "given_name": "J\u00f3zsef", "surname": "Geml", "role": "author" }, { "index": 24, - "raw_name": "Ghobad-Nejhad, Masoomeh", + "raw_name": "Masoomeh Ghobad-Nejhad", "given_name": "Masoomeh", "surname": "Ghobad-Nejhad", "role": "author" }, { "index": 25, - "raw_name": "Grebenc, Tine", + "raw_name": "Tine Grebenc", "given_name": "Tine", "surname": "Grebenc", "role": "author" }, { "index": 26, - "raw_name": "Griffith, Gareth W.", + "raw_name": "Gareth W. Griffith", "given_name": "Gareth W.", "surname": "Griffith", "role": "author" }, { "index": 27, - "raw_name": "Hampe, Felix", + "raw_name": "Felix Hampe", "given_name": "Felix", "surname": "Hampe", "role": "author" }, { "index": 28, - "raw_name": "Kennedy, Peter", + "raw_name": "Peter Kennedy", "given_name": "Peter", "surname": "Kennedy", "role": "author" }, { "index": 29, - "raw_name": "Khomich, Maryia", + "raw_name": "Maryia Khomich", "given_name": "Maryia", "surname": "Khomich", "role": "author" }, { "index": 30, - "raw_name": "Kohout, Petr", + "raw_name": "Petr Kohout", "given_name": "Petr", "surname": "Kohout", "role": "author" }, { "index": 31, - "raw_name": "Kollom, Anu", + "raw_name": "Anu Kollom", "given_name": "Anu", "surname": "Kollom", "role": "author" }, { "index": 32, - "raw_name": "Larsson, Ellen", + "raw_name": "Ellen Larsson", "given_name": "Ellen", "surname": "Larsson", "role": "author" }, { "index": 33, - "raw_name": "Laszlo, Irinyi", + "raw_name": "Irinyi Laszlo", "given_name": "Irinyi", "surname": "Laszlo", "role": "author" }, { "index": 34, - "raw_name": "Leavitt, Steven", + "raw_name": "Steven Leavitt", "given_name": "Steven", "surname": "Leavitt", "role": "author" }, { "index": 35, - "raw_name": "Liimatainen, Kare", + "raw_name": "Kare Liimatainen", "given_name": "Kare", "surname": "Liimatainen", "role": "author" }, { "index": 36, - "raw_name": "Lindahl, Bj\u00f6rn", + "raw_name": "Bj\u00f6rn Lindahl", "given_name": "Bj\u00f6rn", "surname": "Lindahl", "role": "author" }, { "index": 37, - "raw_name": "Lodge, Deborah J.", + "raw_name": "Deborah J. 
Lodge", "given_name": "Deborah J.", "surname": "Lodge", "role": "author" }, { "index": 38, - "raw_name": "Lumbsch, Helge Thorsten", + "raw_name": "Helge Thorsten Lumbsch", "given_name": "Helge Thorsten", "surname": "Lumbsch", "role": "author" }, { "index": 39, - "raw_name": "Mart\u00edn Esteban, Mar\u00eda Paz", + "raw_name": "Mar\u00eda Paz Mart\u00edn Esteban", "given_name": "Mar\u00eda Paz", "surname": "Mart\u00edn Esteban", "role": "author" }, { "index": 40, - "raw_name": "Meyer, Wieland", + "raw_name": "Wieland Meyer", "given_name": "Wieland", "surname": "Meyer", "role": "author" }, { "index": 41, - "raw_name": "Miettinen, Otto", + "raw_name": "Otto Miettinen", "given_name": "Otto", "surname": "Miettinen", "role": "author" }, { "index": 42, - "raw_name": "Nguyen, Nhu", + "raw_name": "Nhu Nguyen", "given_name": "Nhu", "surname": "Nguyen", "role": "author" }, { "index": 43, - "raw_name": "Niskanen, Tuula", + "raw_name": "Tuula Niskanen", "given_name": "Tuula", "surname": "Niskanen", "role": "author" }, { "index": 44, - "raw_name": "Oono, Ryoko", + "raw_name": "Ryoko Oono", "given_name": "Ryoko", "surname": "Oono", "role": "author" }, { "index": 45, - "raw_name": "\u00d6pik, Maarja", + "raw_name": "Maarja \u00d6pik", "given_name": "Maarja", "surname": "\u00d6pik", "role": "author" }, { "index": 46, - "raw_name": "Ordynets, Alexander", + "raw_name": "Alexander Ordynets", "given_name": "Alexander", "surname": "Ordynets", "role": "author" }, { "index": 47, - "raw_name": "Paw\u0142owska, Julia", + "raw_name": "Julia Paw\u0142owska", "given_name": "Julia", "surname": "Paw\u0142owska", "role": "author" }, { "index": 48, - "raw_name": "Peintner, Ursula", + "raw_name": "Ursula Peintner", "given_name": "Ursula", "surname": "Peintner", "role": "author" }, { "index": 49, - "raw_name": "Pereira, Olinto Liparini", + "raw_name": "Olinto Liparini Pereira", "given_name": "Olinto Liparini", "surname": "Pereira", "role": "author" }, { "index": 50, - "raw_name": "Pinho, Danilo Batista", + "raw_name": "Danilo Batista Pinho", "given_name": "Danilo Batista", "surname": "Pinho", "role": "author" }, { "index": 51, - "raw_name": "P\u00f5ldmaa, Kadri", + "raw_name": "Kadri P\u00f5ldmaa", "given_name": "Kadri", "surname": "P\u00f5ldmaa", "role": "author" }, { "index": 52, - "raw_name": "Runnel, Kadri", + "raw_name": "Kadri Runnel", "given_name": "Kadri", "surname": "Runnel", "role": "author" }, { "index": 53, - "raw_name": "Ryberg, Martin", + "raw_name": "Martin Ryberg", "given_name": "Martin", "surname": "Ryberg", "role": "author" }, { "index": 54, - "raw_name": "Saar, Irja", + "raw_name": "Irja Saar", "given_name": "Irja", "surname": "Saar", "role": "author" }, { "index": 55, - "raw_name": "Sanli, Kemal", + "raw_name": "Kemal Sanli", "given_name": "Kemal", "surname": "Sanli", "role": "author" }, { "index": 56, - "raw_name": "Scott, James", + "raw_name": "James Scott", "given_name": "James", "surname": "Scott", "role": "author" }, { "index": 57, - "raw_name": "Spirin, Viacheslav", + "raw_name": "Viacheslav Spirin", "given_name": "Viacheslav", "surname": "Spirin", "role": "author" }, { "index": 58, - "raw_name": "Suija, Ave", + "raw_name": "Ave Suija", "given_name": "Ave", "surname": "Suija", "role": "author" }, { "index": 59, - "raw_name": "Svantesson, Sten", + "raw_name": "Sten Svantesson", "given_name": "Sten", "surname": "Svantesson", "role": "author" }, { "index": 60, - "raw_name": "Tadych, Mariusz", + "raw_name": "Mariusz Tadych", "given_name": "Mariusz", "surname": "Tadych", "role": "author" }, { "index": 
61, - "raw_name": "Takamatsu, Susumu", + "raw_name": "Susumu Takamatsu", "given_name": "Susumu", "surname": "Takamatsu", "role": "author" }, { "index": 62, - "raw_name": "Tamm, Heidi", + "raw_name": "Heidi Tamm", "given_name": "Heidi", "surname": "Tamm", "role": "author" }, { "index": 63, - "raw_name": "Taylor, AFS.", + "raw_name": "AFS. Taylor", "given_name": "AFS.", "surname": "Taylor", "role": "author" }, { "index": 64, - "raw_name": "Tedersoo, Leho", + "raw_name": "Leho Tedersoo", "given_name": "Leho", "surname": "Tedersoo", "role": "author" }, { "index": 65, - "raw_name": "Telleria, M.T.", + "raw_name": "M.T. Telleria", "given_name": "M.T.", "surname": "Telleria", "role": "author" }, { "index": 66, - "raw_name": "Udayanga, Dhanushka", + "raw_name": "Dhanushka Udayanga", "given_name": "Dhanushka", "surname": "Udayanga", "role": "author" }, { "index": 67, - "raw_name": "Unterseher, Martin", + "raw_name": "Martin Unterseher", "given_name": "Martin", "surname": "Unterseher", "role": "author" }, { "index": 68, - "raw_name": "Volobuev, Sergey", + "raw_name": "Sergey Volobuev", "given_name": "Sergey", "surname": "Volobuev", "role": "author" }, { "index": 69, - "raw_name": "Weiss, Michael", + "raw_name": "Michael Weiss", "given_name": "Michael", "surname": "Weiss", "role": "author" }, { "index": 70, - "raw_name": "Wurzbacher, Christian", + "raw_name": "Christian Wurzbacher", "given_name": "Christian", "surname": "Wurzbacher", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json index 324bb663..f572263c 100644 --- a/python/tests/files/datacite/datacite_result_07.json +++ b/python/tests/files/datacite/datacite_result_07.json @@ -38,21 +38,21 @@ "contribs": [ { "index": 0, - "raw_name": "ROTHUIZEN, E.", + "raw_name": "E. ROTHUIZEN", "given_name": "E.", "surname": "ROTHUIZEN", "role": "author" }, { "index": 1, - "raw_name": "ELMEGAARD, B.", + "raw_name": "B. ELMEGAARD", "given_name": "B.", "surname": "ELMEGAARD", "role": "author" }, { "index": 2, - "raw_name": "MARKUSSEN W., B.", + "raw_name": "B. 
MARKUSSEN W.", "given_name": "B.", "surname": "MARKUSSEN W.", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 281c3679..581ca1eb 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -30,14 +30,14 @@ "contribs": [ { "index": 0, - "raw_name": "Kajisa, Kei", + "raw_name": "Kei Kajisa", "given_name": "Kei", "surname": "Kajisa", "role": "author" }, { "index": 1, - "raw_name": "Kajisa, Kei", + "raw_name": "Kei Kajisa", "given_name": "Kei", "surname": "Kajisa", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 01f92f85..db103d2b 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -24,7 +24,7 @@ "contribs": [ { "index": 0, - "raw_name": "Kirstaedter, Nils", + "raw_name": "Nils Kirstaedter", "given_name": "Nils", "surname": "Kirstaedter", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json index 6b6cad4a..192062e3 100644 --- a/python/tests/files/datacite/datacite_result_12.json +++ b/python/tests/files/datacite/datacite_result_12.json @@ -12,28 +12,28 @@ "contribs": [ { "index": 0, - "raw_name": "Spanias, Charalampos", + "raw_name": "Charalampos Spanias", "given_name": "Charalampos", "surname": "Spanias", "role": "author" }, { "index": 1, - "raw_name": "Nikolaidis, Pantelis T", + "raw_name": "Pantelis T Nikolaidis", "given_name": "Pantelis T", "surname": "Nikolaidis", "role": "author" }, { "index": 2, - "raw_name": "Rosemann, Thomas", + "raw_name": "Thomas Rosemann", "given_name": "Thomas", "surname": "Rosemann", "role": "author" }, { "index": 3, - "raw_name": "Knechtle, Beat", + "raw_name": "Beat Knechtle", "given_name": "Beat", "surname": "Knechtle", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json index 3da3816d..c8971667 100644 --- a/python/tests/files/datacite/datacite_result_13.json +++ b/python/tests/files/datacite/datacite_result_13.json @@ -17,7 +17,7 @@ }, { "index": 1, - "raw_name": "Hiltbrunner, Hermann", + "raw_name": "Hermann Hiltbrunner", "given_name": "Hermann", "surname": "Hiltbrunner", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json index 94c00472..94ad000a 100644 --- a/python/tests/files/datacite/datacite_result_14.json +++ b/python/tests/files/datacite/datacite_result_14.json @@ -45,56 +45,56 @@ "contribs": [ { "index": 0, - "raw_name": "Stulz, E.", + "raw_name": "E. Stulz", "given_name": "E.", "surname": "Stulz", "role": "author" }, { "index": 1, - "raw_name": "Scott, S.M.", + "raw_name": "S.M. Scott", "given_name": "S.M.", "surname": "Scott", "role": "author" }, { "index": 2, - "raw_name": "Ng, Yiu-Fai", + "raw_name": "Yiu-Fai Ng", "given_name": "Yiu-Fai", "surname": "Ng", "role": "author" }, { "index": 3, - "raw_name": "Bond, A.D.", + "raw_name": "A.D. Bond", "given_name": "A.D.", "surname": "Bond", "role": "author" }, { "index": 4, - "raw_name": "Teat, S.J.", + "raw_name": "S.J. Teat", "given_name": "S.J.", "surname": "Teat", "role": "author" }, { "index": 5, - "raw_name": "Darling, S.L.", + "raw_name": "S.L. 
Darling", "given_name": "S.L.", "surname": "Darling", "role": "author" }, { "index": 6, - "raw_name": "Feeder, N.", + "raw_name": "N. Feeder", "given_name": "N.", "surname": "Feeder", "role": "author" }, { "index": 7, - "raw_name": "Sanders, J.K.M.", + "raw_name": "J.K.M. Sanders", "given_name": "J.K.M.", "surname": "Sanders", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json index 0614f6ba..bdeb8426 100644 --- a/python/tests/files/datacite/datacite_result_15.json +++ b/python/tests/files/datacite/datacite_result_15.json @@ -11,7 +11,7 @@ "contribs": [ { "index": 0, - "raw_name": "Richardson, David", + "raw_name": "David Richardson", "given_name": "David", "surname": "Richardson", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json index 1d861cf6..ea8c2e59 100644 --- a/python/tests/files/datacite/datacite_result_16.json +++ b/python/tests/files/datacite/datacite_result_16.json @@ -20,7 +20,7 @@ "contribs": [ { "index": 0, - "raw_name": "Sochi, Taha", + "raw_name": "Taha Sochi", "given_name": "Taha", "surname": "Sochi", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json index 12ab39fe..274858c3 100644 --- a/python/tests/files/datacite/datacite_result_18.json +++ b/python/tests/files/datacite/datacite_result_18.json @@ -12,4 +12,4 @@ "contribs": [], "refs": [], "abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json index 1505db92..8d797268 100644 --- a/python/tests/files/datacite/datacite_result_19.json +++ b/python/tests/files/datacite/datacite_result_19.json @@ -12,4 +12,4 @@ "contribs": [], "refs": [], "abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json index 1868eede..97d7ae75 100644 --- a/python/tests/files/datacite/datacite_result_20.json +++ b/python/tests/files/datacite/datacite_result_20.json @@ -11,4 +11,4 @@ "contribs": [], "refs": [], "abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json index 9214065a..0a05a7cd 100644 --- a/python/tests/files/datacite/datacite_result_21.json +++ b/python/tests/files/datacite/datacite_result_21.json @@ -8,8 +8,8 @@ "ext_ids": { "doi": "10.7916/d86x0cg1" }, + "language": "de", "contribs": [], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json index e9939e09..9e4225b5 100644 --- a/python/tests/files/datacite/datacite_result_22.json +++ b/python/tests/files/datacite/datacite_result_22.json @@ -8,15 +8,15 @@ "ext_ids": { "doi": "10.7916/d86x0cg1" }, + "language": "de", "contribs": [ { - "raw_affiliation": "Department of pataphysics", "index": 0, "raw_name": "Anton Welch", - "role": "author" + "role": "author", + "raw_affiliation": "Department of pataphysics" } ], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json index 
2bf66eae..46f60492 100644 --- a/python/tests/files/datacite/datacite_result_23.json +++ b/python/tests/files/datacite/datacite_result_23.json @@ -8,6 +8,7 @@ "ext_ids": { "doi": "10.7916/d86x0cg1-xxx" }, + "language": "de", "contribs": [ { "index": 0, @@ -17,6 +18,5 @@ } ], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index cdc165d7..3e47fce8 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -7,7 +7,7 @@ import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi +from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name from fatcat_tools.transforms import entity_to_dict from fixtures import api import json @@ -294,3 +294,19 @@ def test_datacite_conversions(datacite_importer): assert result == expected +def test_index_form_to_display_name(): + Case = collections.namedtuple('Case', 'input output') + cases = [ + Case('', ''), + Case('ABC', 'ABC'), + Case('International Space Station', 'International Space Station'), + Case('Jin, Shan', 'Shan Jin'), + Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), + Case('Solomon, P. M.', 'P. M. Solomon'), + Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), + Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + ] + + for c in cases: + assert c.output == index_form_to_display_name(c.input) + -- cgit v1.2.3 From 61f0bbfbfdaf41be799fa41c88077806ef913188 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 2 Jan 2020 19:02:04 +0100 Subject: datacite: add another test case --- python/tests/files/datacite/datacite_doc_24.json | 48 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_24.json | 22 ++++++++++ python/tests/import_datacite.py | 2 +- 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 python/tests/files/datacite/datacite_doc_24.json create mode 100644 python/tests/files/datacite/datacite_result_24.json (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/files/datacite/datacite_doc_24.json b/python/tests/files/datacite/datacite_doc_24.json new file mode 100644 index 00000000..6123350b --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_24.json @@ -0,0 +1,48 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "publicationYear": 2016, + "language": "DE-CH", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } + } + diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json new file mode 100644 index 00000000..42859275 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_24.json @@ -0,0 +1,22 @@ +{ + 
"extra": {}, + "title": "ABC", + "subtitle": "DEF", + "release_type": "article", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Anton Welch", + "role": "author", + "raw_affiliation": "Department of pataphysics" + } + ], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 3e47fce8..54a529c5 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -282,7 +282,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(24): + for i in range(25): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) -- cgit v1.2.3 From e4402d6d4b162d57507d5beb57de88017cea549d Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 3 Jan 2020 19:51:53 +0100 Subject: datacite: prepare release_month (stub) --- python/fatcat_tools/importers/datacite.py | 20 ++++++++++---------- python/tests/import_datacite.py | 28 ++++++++++++++-------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d13e855e..45c8a421 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -378,7 +378,7 @@ class DataciteImporter(EntityImporter): # "attributes.dates[].dateType", values: "Accepted", "Available" # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". - release_date, release_year = parse_datacite_dates( + release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) # Start with clear stages, e.g. published. TODO(martin): we could @@ -762,10 +762,10 @@ def parse_datacite_dates(dates): Given a list of date fields (under .dates), return tuple, (release_date, release_year). """ - release_date, release_year = None, None + release_date, release_month, release_year = None, None, None if not dates: - return release_date, release_year + return release_date, release_month, release_year if not isinstance(dates, list): raise ValueError('expected a list of date items') @@ -789,7 +789,7 @@ def parse_datacite_dates(dates): def parse_item(item): result, value, year_only = None, item.get('date', ''), False - release_date, release_year = None, None + release_date, release_month, release_year = None, None, None for pattern in common_patterns: try: @@ -808,24 +808,24 @@ def parse_datacite_dates(dates): except TypeError as err: print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) - return result_date, result_year + return result_date, release_month, result_year if result is None: # Unparsable date. 
- return release_date, release_year + return release_date, release_month, release_year if not year_only: release_date = result.date() release_year = result.year - return release_date, release_year + return release_date, release_month, release_year for prio in date_type_prio: for item in dates: if not item.get('dateType') == prio: continue - release_date, release_year = parse_item(item) + release_date, release_month, release_year = parse_item(item) if release_date is None and release_year is None: continue @@ -841,11 +841,11 @@ def parse_datacite_dates(dates): if release_date is None and release_year is None: for item in dates: - release_date, release_year = parse_item(item) + release_date, release_month, release_year = parse_item(item) if release_year or release_date: break - return release_date, release_year + return release_date, release_month, release_year def clean_doi(doi): """ diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 54a529c5..29c608ee 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -170,41 +170,41 @@ def test_parse_datacite_dates(): """ Case = collections.namedtuple('Case', 'about input result') cases = [ - Case('None is None', None, (None, None)), - Case('empty list is None', [], (None, None)), - Case('empty item is None', [{}], (None, None)), - Case('empty item is None', [{'date': '2019'}], (None, 2019)), - Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)), - Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)), + Case('None is None', None, (None, None, None)), + Case('empty list is None', [], (None, None, None)), + Case('empty item is None', [{}], (None, None, None)), + Case('empty item is None', [{'date': '2019'}], (None, None, 2019)), + Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), + Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), Case('first with type', [ {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} - ], (None, 2019)), + ], (None, None, 2019)), Case('full date', [ {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), 2019)), + ], (datetime.date(2019, 12, 1), None, 2019)), Case('date type prio', [ {'date': '2000-12-01', 'dateType': 'Valid'}, {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('date type prio, Available > Updated', [ {'date': '2010-01-01', 'dateType': 'Updated'}, {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow fuzzy date formats, Available > Updated', [ {'date': '2010', 'dateType': 'Updated'}, {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('ignore broken date', [ {'date': 'Febrrr 45', 'dateType': 'Updated'}, - ], (None, None)), + 
], (None, None, None)), ] for case in cases: result = parse_datacite_dates(case.input) -- cgit v1.2.3 From 55dcece5a476b1492bf6c7f4597a469b48b41264 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 3 Jan 2020 22:40:53 +0100 Subject: datacite: parse_datacite_dates returns month As [...] we will soon add support for release_month field in the release schema. --- python/fatcat_tools/importers/datacite.py | 45 ++++++++++++++++++++++++------- python/tests/import_datacite.py | 23 +++++++++++----- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 45c8a421..5891f8de 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily. """ from .common import EntityImporter, clean +import collections import dateparser import datetime import fatcat_openapi_client @@ -783,43 +784,68 @@ def parse_datacite_dates(dates): 'Updated', ) + # We need to note the granularity, since a string like "2019" would be + # parsed into "2019-01-01", even though the month is unknown. Use 3 + # granularity types: 'y', 'm', 'd'. + Pattern = collections.namedtuple('Pattern', 'layout granularity') + # Before using (expensive) dateparser, try a few common patterns. - common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S', '%Y') + common_patterns = ( + Pattern('%Y-%m-%d', 'd'), + Pattern('%Y-%m', 'm'), + Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), + Pattern('%Y-%m-%dT%H:%M:%S', 'd'), + Pattern('%Y', 'y'), + ) def parse_item(item): result, value, year_only = None, item.get('date', ''), False release_date, release_month, release_year = None, None, None - for pattern in common_patterns: + for layout, granularity in common_patterns: try: - result = datetime.datetime.strptime(value, pattern) + result = datetime.datetime.strptime(value, layout) except ValueError: continue else: - if pattern == '%Y': + if granularity == 'y': year_only = True break if result is None: print('fallback for {}'.format(value), file=sys.stderr) + parser = dateparser.DateDataParser() try: - result = dateparser.parse(value) + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year except TypeError as err: print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) - return result_date, release_month, result_year if result is None: # Unparsable date. 
return release_date, release_month, release_year - if not year_only: + if granularity != 'y': release_date = result.date() release_year = result.year + if granularity in ('m', 'd'): + release_month = result.month return release_date, release_month, release_year + today = datetime.date.today() + for prio in date_type_prio: for item in dates: if not item.get('dateType') == prio: @@ -829,8 +855,7 @@ def parse_datacite_dates(dates): if release_date is None and release_year is None: continue - if release_year < 1000 or release_year > datetime.date.today( - ).year + 5: + if release_year < 1000 or release_year > today.year + 5: # Skip possibly bogus dates. release_year = None continue diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 29c608ee..c2fcdec9 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -173,7 +173,7 @@ def test_parse_datacite_dates(): Case('None is None', None, (None, None, None)), Case('empty list is None', [], (None, None, None)), Case('empty item is None', [{}], (None, None, None)), - Case('empty item is None', [{'date': '2019'}], (None, None, 2019)), + Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), Case('first with type', [ @@ -181,27 +181,36 @@ def test_parse_datacite_dates(): ], (None, None, 2019)), Case('full date', [ {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), None, 2019)), + ], (datetime.date(2019, 12, 1), 12, 2019)), Case('date type prio', [ {'date': '2000-12-01', 'dateType': 'Valid'}, {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('date type prio, Available > Updated', [ {'date': '2010-01-01', 'dateType': 'Updated'}, {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow fuzzy date formats, Available > Updated', [ {'date': '2010', 'dateType': 'Updated'}, {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('fuzzy year only', [ + {'date': 'Year 2010', 'dateType': 'Issued'}, + ], (None, None, 2010)), + Case('fuzzy year and month', [ + {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, + ], (None, 2, 2010)), + Case('fuzzy year, month, day', [ + {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, + ], (datetime.date(2010, 2, 24), 2, 2010)), Case('ignore broken date', [ {'date': 'Febrrr 45', 'dateType': 'Updated'}, ], (None, None, None)), -- cgit v1.2.3 From 328d7901df30ba94685d34d6a428e798b4604839 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 3 Jan 2020 22:53:23 +0100 Subject: datacite: use normal.clean_doi --- python/fatcat_tools/importers/datacite.py | 12 
+----------- python/tests/import_datacite.py | 4 ---- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 5891f8de..d0c75b6e 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -20,6 +20,7 @@ import langdetect import sqlite3 import sys from fatcat_tools.transforms import entity_to_dict +from fatcat_tools.normal import clean_doi # Cutoff length for abstracts. @@ -872,17 +873,6 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year -def clean_doi(doi): - """ - 10.25513/1812-3996.2017.1.34–42 // 8211, Hex 2013, Octal 20023 - See also: https://github.com/miku/throwaway-check-doi - - Replace unicode HYPHEN..HORIZONTAL BAR with HYPHEN-MINUS. - """ - for c in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015'): - doi = doi.replace(c, "-") - return doi - def index_form_to_display_name(s): """ Try to convert an index form name, like 'Razis, Panos A' into display_name, diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index c2fcdec9..881452ed 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -281,10 +281,6 @@ def test_datacite_dict_parse(datacite_importer): assert r.contribs[0].surname == None assert len(r.refs) == 0 -def test_clean_doi(): - assert clean_doi("10.25513/1812-3996.2017.1.34\u201342") == "10.25513/1812-3996.2017.1.34-42" - assert "123" == clean_doi("123") - def test_datacite_conversions(datacite_importer): """ Datacite JSON to release entity JSON representation. The count is hardcoded -- cgit v1.2.3 From 171c4ae9f48984438e59bf521b3ec9dd78ce6d3d Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 6 Jan 2020 22:25:26 +0100 Subject: datacite: indicate mismatched file in test --- python/tests/import_datacite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/tests/import_datacite.py') diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 881452ed..9ee479e8 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -297,7 +297,7 @@ def test_datacite_conversions(datacite_importer): with open(dst, 'r') as f: expected = json.loads(f.read()) - assert result == expected + assert result == expected, 'output mismatch in {}'.format(dst) def test_index_form_to_display_name(): Case = collections.namedtuple('Case', 'input output') -- cgit v1.2.3 From a7e5460d6355dd0e99b08e480d4e50755fda3b16 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 8 Jan 2020 03:47:10 +0100 Subject: datacite: mark additional files as stub --- python/fatcat_tools/importers/datacite.py | 4 ++ python/tests/files/datacite/datacite_doc_25.json | 47 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_25.json | 25 ++++++++++++ python/tests/import_datacite.py | 2 +- 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 python/tests/files/datacite/datacite_doc_25.json create mode 100644 python/tests/files/datacite/datacite_result_25.json (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 587a65aa..90bc3db7 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -547,6 +547,10 @@ class DataciteImporter(EntityImporter): if publisher == 'Cambridge Crystallographic Data 
Centre': release_type = 'entry' + # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." + if title.lower().startswith('additional file'): + release_type = 'stub' + # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you # like langcodes solves a pretty boring problem. At one level, that's diff --git a/python/tests/files/datacite/datacite_doc_25.json b/python/tests/files/datacite/datacite_doc_25.json new file mode 100644 index 00000000..60cd0ab7 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_25.json @@ -0,0 +1,47 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "publicationYear": 2016, + "language": "DE-CH", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } +} diff --git a/python/tests/files/datacite/datacite_result_25.json b/python/tests/files/datacite/datacite_result_25.json new file mode 100644 index 00000000..8a370bbb --- /dev/null +++ b/python/tests/files/datacite/datacite_result_25.json @@ -0,0 +1,25 @@ +{ + "extra": { + "datacite": {}, + "month": 8 + }, + "title": "Additional file 123: ABC", + "subtitle": "DEF", + "release_type": "stub", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Anton Welch", + "role": "author", + "raw_affiliation": "Department of pataphysics" + } + ], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 9ee479e8..7293ecac 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer): for now. 
""" datacite_importer.debug = True - for i in range(25): + for i in range(26): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) -- cgit v1.2.3 From 62d6a7e48d6bea1bc7f451c6043f38aee2051f9b Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 8 Jan 2020 22:33:58 +0100 Subject: datacite: factor out contributor handling Use values from: * attributes.creators[] * attributes.contributors[] --- python/fatcat_tools/importers/datacite.py | 183 ++++++++++++--------- python/tests/files/datacite/datacite_doc_26.json | 57 +++++++ .../tests/files/datacite/datacite_result_05.json | 6 + .../tests/files/datacite/datacite_result_09.json | 11 ++ .../tests/files/datacite/datacite_result_26.json | 31 ++++ python/tests/import_datacite.py | 4 +- 6 files changed, 210 insertions(+), 82 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_26.json create mode 100644 python/tests/files/datacite/datacite_result_26.json (limited to 'python/tests/import_datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index fc986994..9ca72758 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - # Contributors. Many nameIdentifierSchemes, we do not use (yet): - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": - # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", - # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. - contribs = [] - - # Names, that should be ignored right away. - name_blacklist = set(('Occdownload Gbif.Org',)) - - for i, c in enumerate(attributes['creators']): - nameType = c.get('nameType', '') or '' - if nameType in ('', 'Personal'): - creator_id = None - for nid in c.get('nameIdentifiers', []): - name_scheme = nid.get('nameIdentifierScheme', '') or '' - if not name_scheme.lower() == "orcid": - continue - orcid = nid.get('nameIdentifier', - '').replace('https://orcid.org/', '') - if not orcid: - continue - creator_id = self.lookup_orcid(orcid) - # TODO(martin): If creator_id is None, should we create creators? - - # If there are multiple affiliation strings, use the first one. - affiliations = c.get('affiliation', []) or [] - raw_affiliation = None - if len(affiliations) == 0: - raw_affiliation = None - else: - raw_affiliation = clean(affiliations[0]) - - name = c.get('name') - given_name = c.get('givenName') - surname = c.get('familyName') - - if name: - name = clean(name) - - if name in name_blacklist: - continue - - if given_name: - given_name = clean(given_name) - - if surname: - surname = clean(surname) - - if not name: - continue - - if raw_affiliation == '': - continue - if name.lower() in UNKNOWN_MARKERS: - continue + creators = attributes.get('creators', []) or [] + contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
- if name: - name = index_form_to_display_name(name) - - contribs.append( - fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=i, - raw_name=name, - given_name=given_name, - surname=surname, - role='author', - raw_affiliation=raw_affiliation, - )) - elif nameType == 'Organizational': - name = c.get('name', '') or '' - if name in UNKNOWN_MARKERS: - continue - if len(name) < 3: - continue - extra = {'organization': name} - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, extra=extra)) - else: - print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter): extra=self.editgroup_extra), entity_list=batch)) + def parse_datacite_creators(self, creators, role='author', set_index=True): + """ + Parses a list of creators into a list of ReleaseContrib objects. Set + set_index to False, if the index contrib field should be left blank. + """ + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. + contribs = [] + + # Names, that should be ignored right away. + name_blacklist = set(('Occdownload Gbif.Org',)) + + for i, c in enumerate(creators): + if not set_index: + i = None + nameType = c.get('nameType', '') or '' + if nameType in ('', 'Personal'): + creator_id = None + for nid in c.get('nameIdentifiers', []): + name_scheme = nid.get('nameIdentifierScheme', '') or '' + if not name_scheme.lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # TODO(martin): If creator_id is None, should we create creators? + + # If there are multiple affiliation strings, use the first one. + affiliations = c.get('affiliation', []) or [] + raw_affiliation = None + if len(affiliations) == 0: + raw_affiliation = None + else: + raw_affiliation = clean(affiliations[0]) + + name = c.get('name') + given_name = c.get('givenName') + surname = c.get('familyName') + + if name: + name = clean(name) + if not name: + continue + if name in name_blacklist: + continue + if name.lower() in UNKNOWN_MARKERS: + continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
+ if name: + name = index_form_to_display_name(name) + + if given_name: + given_name = clean(given_name) + if surname: + surname = clean(surname) + if raw_affiliation == '': + continue + + extra = None + + # "DataManager", "DataCurator", "ContactPerson", "Distributor", + # "RegistrationAgency", "Sponsor", "Researcher", + # "RelatedPerson", "ProjectLeader", "Editor", "Other", + # "ProjectMember", "Funder", "RightsHolder", "DataCollector", + # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" + contributorType = c.get('contributorType', '') or '' + + if contributorType: + extra = {'type': contributorType} + + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + )) + elif nameType == 'Organizational': + name = c.get('name', '') or '' + if name in UNKNOWN_MARKERS: + continue + if len(name) < 3: + continue + extra = {'organization': name} + contribs.append(fatcat_openapi_client.ReleaseContrib( + index=i, extra=extra)) + else: + print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + + return contribs + def lookup_license_slug(raw): """ @@ -971,6 +992,8 @@ def index_form_to_display_name(s): if s.count(',') > 1: # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" return s + + # Not names, but sprinkled in fields where authors live. stopwords = [s.lower() for s in ( 'Archive', 'Collection', diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json new file mode 100644 index 00000000..c2abb1b2 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_26.json @@ -0,0 +1,57 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "contributors": [ + { + "name": "Wemmer, David", + "nameType": "Personal", + "givenName": "David", + "familyName": "Wemmer", + "affiliation": [], + "contributorType": "Editor" + } + ], + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "publicationYear": 2016, + "language": "DE-CH", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 22542a10..c4e5418d 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -523,6 +523,12 @@ "given_name": "Christian", "surname": "Wurzbacher", "role": "author" + }, + { + "raw_name": "Kessy Abarenkov" + }, + { + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" } ], "refs": [], diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index fd873309..c93dc769 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -32,6 +32,17 @@ "given_name": "Nils", "surname": "Kirstaedter", "role": "author" + }, + { + "extra": { + "organization": 
"TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover" + } + }, + { + "raw_name": "Technische Informationsbibliothek (TIB)", + "extra": { + "type": "DataManager" + } } ], "refs": [], diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json new file mode 100644 index 00000000..8d26197c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_26.json @@ -0,0 +1,31 @@ +{ + "extra": { + "datacite": {}, + "release_month": 8 + }, + "title": "Additional file 123: ABC", + "subtitle": "DEF", + "release_type": "stub", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Anton Welch", + "role": "author", + "raw_affiliation": "Department of pataphysics" + }, + { + "extra": {"type": "Editor"}, + "raw_name": "David Wemmer", + "given_name": "David", + "surname": "Wemmer" + } + ], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 7293ecac..5ad7ef2c 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -275,7 +275,7 @@ def test_datacite_dict_parse(datacite_importer): assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 - assert len(r.contribs) == 1 + assert len(r.contribs) == 2 assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" assert r.contribs[0].given_name == None assert r.contribs[0].surname == None @@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(26): + for i in range(27): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) -- cgit v1.2.3