diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-08 23:31:40 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-08 23:31:40 +0100 |
commit | 081746837a55bf5f34c96f12f1abb5a00d5b478c (patch) | |
tree | 88af1ade558ad6695918d36648b3ed4a5bea6954 /python | |
parent | 27723a61bde5591bae8115d801d0d09b7ef01b03 (diff) | |
parent | 277bd183d7139bb1a8857bc2a48c0aa92012455d (diff) | |
download | fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.tar.gz fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.zip |
Merge branch 'martin-datacite-import'
Pipfile.lock is broken.
* martin-datacite-import: (68 commits)
datacite: pass in doi into factored out method
datacite: reformat test cases and use jq . --sort-keys
datacite: factor out contributor handling
datacite: catch type mismatch in language detection
datacite: adjust tests for release_month
datacite: name extra.month, extra.release_month
datacite: mark additional files as stub
datacite: CCDC are entries, mostly
datacite: use more specific release_type, if possible
datacite: ignore certain names
datacite: over 3% records have the same title: stub
datacite: fill a few more release_type gaps
datacite: adding datacite-specific extra metadata
datacite: apply pylint suggestions
datacite: fix typos
datacite: set release_stage to published by default
datacite: month field should be top-level
datacite: include month in extra
datacite: indicate mismatched file in test
datacite: clean abstracts, use unknown value tokens
...
Diffstat (limited to 'python')
64 files changed, 5736 insertions, 5 deletions
diff --git a/python/Pipfile b/python/Pipfile index fcf28302..1a19a145 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -48,6 +48,9 @@ pygal = "*" elasticsearch-dsl = ">=6.0.0,<7.0.0" elasticsearch = ">=6.0.0,<7.0.0" dateparser = ">=0.7" +langdetect = "*" +pathlib2 = "*" +pycountry = "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index e029ce4f..a4408cdd 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,11 @@ { "_meta": { "hash": { +<<<<<<< HEAD "sha256": "03fc6c65c7bcbf96a5ef90afba8b6a0264a248a67b31ed339f399470b5f3d5fc" +======= + "sha256": "fb9c3d2307483efe01d9c28a306bad319c84a94a4253d5c7c25bcfe2dad20c5d" +>>>>>>> martin-datacite-import }, "pipfile-spec": 6, "requires": { @@ -229,7 +233,6 @@ }, "flask-misaka": { "hashes": [ - "sha256:bcfdacc0803ccea75d377737e82c83489b2153d922c9d9f9eabc5148d216ed70", "sha256:d0cfb0efd9e5afacda76defd4a605a68390f4fb1bef283c71534fd3ce0d3efb5", "sha256:f423c3beb5502742a57330a272f81d53223f6f99d45cc45b03926e3a3034f589" ], @@ -299,6 +302,16 @@ ], "version": "==2.5.0" }, +<<<<<<< HEAD +======= + "langdetect": { + "hashes": [ + "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30" + ], + "index": "pypi", + "version": "==1.0.7" + }, +>>>>>>> martin-datacite-import "loginpass": { "hashes": [ "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9", @@ -392,6 +405,21 @@ ], "version": "==3.1.0" }, + "pathlib2": { + "hashes": [ + "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db", + "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868" + ], + "index": "pypi", + "version": "==2.3.5" + }, + "pycountry": { + "hashes": [ + "sha256:3c57aa40adcf293d59bebaffbe60d8c39976fba78d846a018dc0c2ec9c6cb3cb" + ], + "index": "pypi", + "version": "==19.8.18" + }, "pycparser": { "hashes": [ "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" @@ 
-415,7 +443,6 @@ }, "pykafka": { "hashes": [ - "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459", "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577" ], "index": "pypi", @@ -502,6 +529,9 @@ "version": "==2019.3" }, "raven": { + "extras": [ + "flask" + ], "hashes": [ "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54", "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4" @@ -546,8 +576,7 @@ "requests-oauthlib": { "hashes": [ "sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d", - "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a", - "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc" + "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a" ], "version": "==1.3.0" }, @@ -595,8 +624,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, @@ -798,7 +831,7 @@ "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db", "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868" ], - "markers": "python_version < '3.6'", + "index": "pypi", "version": "==2.3.5" }, "pexpect": { @@ -894,11 +927,19 @@ }, "pytest": { "hashes": [ +<<<<<<< HEAD "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" ], "index": "pypi", "version": "==5.3.2" +======= + "sha256:6192875be8af57b694b7c4904e909680102befcb99e610ef3d9f786952f795aa", + "sha256:f8447ebf8fd3d362868a5d3f43a9df786dfdfe9608843bd9002a2d47a104808f" + ], + "index": "pypi", + "version": "==4.6.8" +>>>>>>> martin-datacite-import }, "pytest-cov": { "hashes": [ @@ 
-1003,8 +1044,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 184dcc0a..fb8830ca 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -167,6 +167,20 @@ def run_cdl_dash_dat(args): print("fileset id: {}".format(fs.ident)) print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) +def run_datacite(args): + dci = DataciteImporter(args.api, + args.issn_map_file, + edit_batch_size=args.batch_size, + bezerk_mode=args.bezerk_mode, + debug=args.debug, + extid_map_file=args.extid_map_file, + insert_log_file=args.insert_log_file) + if args.kafka_mode: + KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite", + "fatcat-import", consume_batch_size=args.batch_size).run() + else: + JsonLinePusher(dci, args.json_file).run() + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -443,6 +457,35 @@ def main(): type=str, help="use existing editgroup (instead of creating a new one)") + sub_datacite = subparsers.add_parser('datacite', + help="import datacite.org metadata") + sub_datacite.add_argument('json_file', + help="File with jsonlines from datacite.org v2 API to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_datacite.add_argument('issn_map_file', + help="ISSN to ISSN-L mapping file", + default=None, type=argparse.FileType('r')) + sub_datacite.add_argument('--extid-map-file', + help="DOI-to-other-identifiers sqlite3 database", + default=None, type=str) + sub_datacite.add_argument('--kafka-mode', + action='store_true', + help="consume from kafka topic (not stdin)") + sub_datacite.add_argument('--bezerk-mode', + 
action='store_true', + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") + sub_datacite.add_argument('--debug', + action='store_true', + help="write converted JSON to stdout") + sub_datacite.add_argument('--insert-log-file', + default='', + type=str, + help="write inserted documents into file (for debugging)") + sub_datacite.set_defaults( + func=run_datacite, + auth_var="FATCAT_AUTH_WORKER_DATACITE", + ) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index bb9c5b17..d936605f 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -14,6 +14,7 @@ To run an import you combine two classes; one each of: from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug +from .datacite import DataciteImporter from .jalc import JalcImporter from .jstor import JstorImporter from .arxiv import ArxivRawImporter diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py new file mode 100644 index 00000000..b1862b44 --- /dev/null +++ b/python/fatcat_tools/importers/datacite.py @@ -0,0 +1,1023 @@ +""" +Prototype importer for datacite.org data. + +Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8. + +Datacite being an aggregator, the data is varied and exposes a couple of +problems in content and structure. A few fields have their own parsing +functions (parse_datacite_...), which can be tested more easily.
+""" + +import collections +import datetime +import hashlib +import json +import sqlite3 +import sys + +import dateparser +import fatcat_openapi_client +import langdetect +import pycountry + +from fatcat_tools.normal import clean_doi +from fatcat_tools.transforms import entity_to_dict + +from .common import EntityImporter, clean + +# Cutoff length for abstracts. +MAX_ABSTRACT_LENGTH = 2048 + +# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary +CONTAINER_TYPE_MAP = { + 'Journal': 'journal', + 'Series': 'journal', + 'Book Series': 'book-series', +} + +# The docs/guide should be the canonical home for these mappings; update there +# first. Map various datacite type types to CSL-ish types. None means TODO or +# remove. +DATACITE_TYPE_MAP = { + 'ris': { + 'THES': 'thesis', + 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) + 'CHAP': 'chapter', + 'FIGURE': 'figure', + 'RPRT': 'report', + 'JOUR': 'article-journal', + 'MPCT': 'motion_picture', + 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset + 'BOOK': 'book', + 'DATA': 'dataset', + 'COMP': 'software', + }, + 'schemaOrg': { + 'Dataset': 'dataset', + 'Book': 'book', + 'ScholarlyArticle': 'article-journal', + 'ImageObject': 'graphic', + 'Collection': None, + 'MediaObject': None, + 'Event': None, + 'SoftwareSourceCode': 'software', + 'Chapter': 'chapter', + 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+ 'PublicationIssue': 'article', + 'AudioObject': None, + 'Thesis': 'thesis', + }, + 'citeproc': { + 'article': 'article', + 'article-journal': 'article-journal', + 'article-magazine': 'article-magazine', + 'article-newspaper': 'article-newspaper', + 'bill': 'bill', + 'book': 'book', + 'broadcast': 'broadcast', + 'chapter': 'chapter', + 'dataset': 'dataset', + 'entry-dictionary': 'entry-dictionary', + 'entry-encyclopedia': 'entry-encyclopedia', + 'entry': 'entry', + 'figure': 'figure', + 'graphic': 'graphic', + 'interview': 'interview', + 'legal_case': 'legal_case', + 'legislation': 'legislation', + 'manuscript': 'manuscript', + 'map': 'map', + 'motion_picture': 'motion_picture', + 'musical_score': 'musical_score', + 'pamphlet': 'pamphlet', + 'paper-conference': 'paper-conference', + 'patent': 'patent', + 'personal_communication': 'personal_communication', + 'post': 'post', + 'post-weblog': 'post-weblog', + 'report': 'report', + 'review-book': 'review-book', + 'review': 'review', + 'song': 'song', + 'speech': 'speech', + 'thesis': 'thesis', + 'treaty': 'treaty', + 'webpage': 'webpage', + }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types + 'bibtex': { + 'phdthesis': 'thesis', + 'inbook': 'chapter', + 'misc': None, + 'article': 'article-journal', + 'book': 'book', + }, + 'resourceTypeGeneral': { + 'Image': 'graphic', + 'Dataset': 'dataset', + 'PhysicalObject': None, + 'Collection': None, + 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" + 'Sound': None, + 'InteractiveResource': None, + 'Event': None, + 'Software': 'software', + 'Other': None, + 'Workflow': None, + 'Audiovisual': None, + } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 +} + +# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. 
+DATACITE_UNKNOWN_MARKERS = ( + '(:unac)', # temporarily inaccessible + '(:unal)', # unallowed, suppressed intentionally + '(:unap)', # not applicable, makes no sense + '(:unas)', # value unassigned (e.g., Untitled) + '(:unav)', # value unavailable, possibly unknown + '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) + '(:none)', # never had a value, never will + '(:null)', # explicitly and meaningfully empty + '(:tba)', # to be assigned or announced later + '(:etal)', # too numerous to list (et alia) +) + +# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking +# unknown values. +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( + 'NA', + 'NN', + 'n.a.', + '[s.n.]', +))) + +# TODO(martin): merge this with other maps, maybe. +LICENSE_SLUG_MAP = { + "//creativecommons.org/licenses/by/2.0/": "CC-BY", + "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY", + "//creativecommons.org/licenses/by/3.0/": "CC-BY", + "//creativecommons.org/licenses/by/3.0/us": "CC-BY", + "//creativecommons.org/licenses/by/4.0/": "CC-BY", + "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY", + "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY", + "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY", + "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", + "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND", + "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-ND", + "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND", + "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", + "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", + "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA", +
"//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA", + "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", + "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA", + "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY", + "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0", + "//creativecommons.org/publicdomain/zero/1.0/": "CC-0", + "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", + "//opensource.org/licenses/MIT": "MIT", + "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0", + "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3", + "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2", + "//www.karger.com/Services/SiteLicenses": "KARGER", + "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0", + "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause", + "//www.opensource.org/licenses/EUPL-1.1": + "EUPL-1.1", # redirects to EUPL-1.2 + "//www.opensource.org/licenses/MIT": "MIT", + # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/ + # "http://rsc.li/journals-terms-of-use": "RSC", + # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "", # 53 UrhG. 
+ # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", + # "http://www.springer.com/tdm": "", + # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html + # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", + # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", + # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", + # "https://rightsstatements.org/page/InC/1.0?language=en": "", + # "https://services.ceda.ac.uk/cedasite/register/info": "", + # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 + # "https://www.cambridge.org/core/terms": "", + # "https://www.elsevier.com/tdm/userlicense/1.0", + # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights + # "info:eu-repo/semantics/embargoedAccess": "", + # "info:eu-repo/semantics/openAccess": "", + # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice). +} + +# TODO(martin): drop this after 3.7 upgrade +try: + isascii = str.isascii # new in 3.7, https://docs.python.org/3/library/stdtypes.html#str.isascii +except AttributeError: + isascii = lambda s: len(s) == len(s.encode()) + + +class DataciteImporter(EntityImporter): + """ + Importer for datacite records. 
+ """ + def __init__(self, + api, + issn_map_file, + debug=False, + insert_log_file=None, + **kwargs): + + eg_desc = kwargs.get( + 'editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API" + ) + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', + 'fatcat_tools.DataciteImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.insert_log_file = insert_log_file + + print('datacite with debug={}'.format(self.debug), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers referring to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) + + def parse_record(self, obj): + """ + Mapping datacite JSON to ReleaseEntity.
+ """ + if not obj or not isinstance(obj, dict): + return None + if 'attributes' not in obj: + return None + + attributes = obj['attributes'] + doi = clean_doi(attributes.get('doi', '').lower()) + + if not isascii(doi): + print('[{}] skipping non-ascii doi for now'.format(doi)) + return None + + + creators = attributes.get('creators', []) or [] + contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + + contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + + # Title, may come with "attributes.titles[].titleType", like + # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" + titles = attributes.get('titles', []) or [] + title, original_language_title, subtitle = parse_datacite_titles( + titles) + + if title is None: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + title = clean(title) + if not title: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None + else: + subtitle = clean(subtitle) + + # Dates. A few internal dates (registered, created, updated) and + # published (0..2554). We try to work with typed date list, in + # "attributes.dates[].dateType", values: "Accepted", "Available" + # "Collected", "Copyrighted", "Created", "Issued", "Submitted", + # "Updated", "Valid". + release_date, release_month, release_year = parse_datacite_dates( + attributes.get('dates', [])) + + # Start with clear stages, e.g. published. TODO(martin): we could + # probably infer a bit more from the relations, e.g. + # "IsPreviousVersionOf" or "IsNewVersionOf". + release_stage = 'published' + + # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, + # we might want something else than 'published'. See also: + # https://support.datacite.org/docs/doi-states. + + # Publisher. A few NA values. A few bogus values. 
+ publisher = attributes.get('publisher') + + if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + publisher = None + release_stage = None + if publisher is not None and len(publisher) > 80: + # Arbitrary magic value max length. TODO(martin): better heuristic, + # but factored out; first we have to log misses. Example: + # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / + # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt + # werden" + publisher = None + + if publisher: + publisher = clean(publisher) + + # Container. For the moment, only ISSN as container. + container_id = None + container_name = None + + container = attributes.get('container', {}) or {} + if container.get('type') in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container['type']) + if container.get('identifier') and container.get( + 'identifierType') == 'ISSN': + issn = container.get('identifier') + if len(issn) == 8: + issn = issn[:4] + "-" + issn[4:] + issnl = self.issn2issnl(issn) + if issnl is not None: + container_id = self.lookup_issnl(issnl) + + if container_id is None and container.get('title'): + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + assert isinstance(container_name, str) + ce = fatcat_openapi_client.ContainerEntity( + issnl=issnl, + container_type=container_type, + name=container_name, + ) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + self._issnl_id_map[issnl] = container_id + else: + # TODO(martin): factor this out into a testable function. 
+ # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + + # Volume and issue. + volume = container.get('volume') + issue = container.get('issue') + + if volume: + volume = clean(volume) + + if issue: + issue = clean(issue) + + # Pages. + pages = None + + first_page = container.get('firstPage') + last_page = container.get('lastPage') + + if first_page and last_page: + try: + _ = int(first_page) < int(last_page) + pages = '{}-{}'.format(first_page, last_page) + except ValueError as err: + # TODO(martin): This is more debug than info. + # print('[{}] {}'.format(doi, err), file=sys.stderr) + pass + + if not pages and first_page: + pages = first_page + + # License. + license_slug = None + license_extra = [] + + for l in attributes.get('rightsList', []): + slug = lookup_license_slug(l.get('rightsUri')) + if slug: + license_slug = slug + license_extra.append(l) + + # Release type. Try to determine the release type from a variety of + # types supplied in datacite. The "attributes.types.resourceType" is + # uncontrolled (170000+ unique values, from "null", "Dataset" to + # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP + # flows in 2009") citeproc may be the closest, but not always supplied. + # Order lookup roughly by completeness of mapping. 
+ for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + value = attributes.get('types', {}).get(typeType) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) + if release_type is not None: + break + + if release_type is None: + print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) + + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if publisher == 'The Global Biodiversity Information Facility': + release_type = 'stub' + + # release_type exception: lots of "Experimental Crystal Structure Determination" + if publisher == 'Cambridge Crystallographic Data Centre': + release_type = 'entry' + + # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." + if title.lower().startswith('additional file'): + release_type = 'stub' + + # Language values are varied ("ger", "es", "English", "ENG", "en-us", + # "other", ...). Try to crush it with langcodes: "It may sound to you + # like langcodes solves a pretty boring problem. At one level, that's + # right. Sometimes you have a boring problem, and it's great when a + # library solves it for you." -- TODO(martin): We need more of these. + language = None + + value = attributes.get('language', '') or '' + try: + language = pycountry.languages.lookup(value).alpha_2 + except (LookupError, AttributeError) as err: + pass + # TODO(martin): Print this on debug level, only. + # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr) + + # Abstracts appear in "attributes.descriptions[].descriptionType", some + # of the observed values: "Methods", "TechnicalInfo", + # "SeriesInformation", "Other", "TableOfContents", "Abstract". The + # "Other" fields might contain references or related articles (with + # DOI). 
TODO(martin): maybe try to parse out some of those refs. + abstracts = [] + descs = attributes.get('descriptions', []) or [] + for desc in descs: + if not desc.get('descriptionType') == 'Abstract': + continue + if len(desc.get('description', '') or '') < 10: + continue + text = desc.get('description', '') + if len(text) > MAX_ABSTRACT_LENGTH: + text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + lang = None + try: + lang = langdetect.detect(text) + except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: + print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=clean(text), + lang=lang, + )) + + # References and relations. Datacite include many relation types in + # "attributes.relatedIdentifiers[].relationType", e.g. + # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf", + # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion", + # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart", + # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf", + # "IsDerivedFrom", "IsSourceOf". + # + # For the moment, we only care about References. + refs, ref_index = [], 0 + + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: + if not rel.get('relationType', '') in ('References', 'Cites'): + continue + ref_extra = dict() + if rel.get('relatedIdentifierType', '') == 'DOI': + ref_extra['doi'] = rel.get('relatedIdentifier') + if not ref_extra: + ref_extra = None + refs.append( + fatcat_openapi_client.ReleaseRef( + index=ref_index, + extra=ref_extra, + )) + ref_index += 1 + + # More specific release_type via 'Reviews' relationsship. + for rel in relIds: + if rel.get('relatedIdentifierType', '') != 'Reviews': + continue + release_type = 'review' + + # Extra information. 
+ extra_datacite = dict() + + if license_extra: + extra_datacite['license'] = license_extra + if attributes.get('subjects'): + extra_datacite['subjects'] = attributes['subjects'] + + # Include version information. + metadata_version = attributes.get('metadataVersion') or '' + schema_version = attributes.get('schemaVersion') or '' + + if metadata_version: + extra_datacite['metadataVersion'] = metadata_version + if schema_version: + extra_datacite['schemaVersion'] = schema_version + + # Include resource types. + types = attributes.get('types', {}) or {} + resource_type = types.get('resourceType', '') or '' + resource_type_general = types.get('resourceTypeGeneral', '') or '' + + if resource_type: + extra_datacite['resourceType'] = resource_type + if resource_type_general: + extra_datacite['resourceTypeGeneral'] = resource_type_general + + # Include certain relations from relatedIdentifiers. Keeping the + # original structure of data here, which is a list of dicts, with + # relation type, identifer and identifier type (mostly). 
+ relations = [] + for rel in relIds: + if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', + 'IsVariantFormOf', 'IsSupplementTo', + 'HasVersion', 'IsMetadataFor', + 'IsNewVersionOf', 'IsIdenticalTo', + 'IsVersionOf', 'IsDerivedFrom', + 'IsSourceOf'): + relations.append(rel) + + if relations: + extra_datacite['relations'] = relations + + extra = dict() + + # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", + # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", + # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st + # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, + # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", + # "10161", "10010691", "10780", # "Presentación" + version = attributes.get('version') + + # top-level extra keys + if not container_id and container_name: + extra['container_name'] = container_name + + # Always include datacite key, even if value is empty (dict). + extra['datacite'] = extra_datacite + + # Preparation for a schema update. + if release_month: + extra['release_month'] = release_month + + extids = self.lookup_ext_ids(doi=doi) + + # Assemble release. 
+ re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type=release_type, + release_stage=release_stage, + title=title, + subtitle=subtitle, + original_title=original_language_title, + release_year=release_year, + release_date=release_date, + publisher=publisher, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], + ), + contribs=contribs, + volume=volume, + issue=issue, + pages=pages, + language=language, + abstracts=abstracts, + refs=refs, + extra=extra, + license_slug=license_slug, + version=version, + ) + return re + + def try_update(self, re): + """ + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. + """ + if self.debug is True: + print(json.dumps(entity_to_dict(re, api_client=None))) + return False + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.ext_ids.doi) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(entity_to_dict(doc, api_client=None), f) + f.write('\n') + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_datacite_creators(self, creators, 
role='author', set_index=True, doi=None): + """ + Parses a list of creators into a list of ReleaseContrib objects. Set + set_index to False, if the index contrib field should be left blank. + The doi parameter is only used for debugging. + """ + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. + contribs = [] + + # Names, that should be ignored right away. + name_blacklist = set(('Occdownload Gbif.Org',)) + + for i, c in enumerate(creators): + if not set_index: + i = None + nameType = c.get('nameType', '') or '' + if nameType in ('', 'Personal'): + creator_id = None + for nid in c.get('nameIdentifiers', []): + name_scheme = nid.get('nameIdentifierScheme', '') or '' + if not name_scheme.lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # TODO(martin): If creator_id is None, should we create creators? + + # If there are multiple affiliation strings, use the first one. + affiliations = c.get('affiliation', []) or [] + raw_affiliation = None + if len(affiliations) == 0: + raw_affiliation = None + else: + raw_affiliation = clean(affiliations[0]) + + name = c.get('name') + given_name = c.get('givenName') + surname = c.get('familyName') + + if name: + name = clean(name) + if not name: + continue + if name in name_blacklist: + continue + if name.lower() in UNKNOWN_MARKERS: + continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
+ if name: + name = index_form_to_display_name(name) + + if given_name: + given_name = clean(given_name) + if surname: + surname = clean(surname) + if raw_affiliation == '': + continue + + extra = None + + # "DataManager", "DataCurator", "ContactPerson", "Distributor", + # "RegistrationAgency", "Sponsor", "Researcher", + # "RelatedPerson", "ProjectLeader", "Editor", "Other", + # "ProjectMember", "Funder", "RightsHolder", "DataCollector", + # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" + contributorType = c.get('contributorType', '') or '' + + if contributorType: + extra = {'type': contributorType} + + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + )) + elif nameType == 'Organizational': + name = c.get('name', '') or '' + if name in UNKNOWN_MARKERS: + continue + if len(name) < 3: + continue + extra = {'organization': name} + contribs.append(fatcat_openapi_client.ReleaseContrib( + index=i, extra=extra)) + else: + print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + + return contribs + + +def lookup_license_slug(raw): + """ + TODO(martin): reuse from or combine with crossref, maybe. + """ + if not raw: + return None + raw = raw.strip().replace('http://', '//').replace('https://', '//') + if 'creativecommons.org' in raw.lower(): + raw = raw.lower() + raw = raw.replace('/legalcode', '/').replace('/uk', '') + if not raw.endswith('/'): + raw = raw + '/' + return LICENSE_SLUG_MAP.get(raw) + + +def find_original_language_title(item, min_length=4, max_questionmarks=3): + """ + Perform a few checks before returning a potential original language title. 
+ + Example input: {'title': 'Some title', 'original_language_title': 'Some title'} + """ + if not 'original_language_title' in item: + return None + title = item.get('title') + if not title: + return None + original_language_title = item.get('original_language_title') + if isinstance(original_language_title, + str) and title != original_language_title: + if len(original_language_title) < min_length: + return None + if original_language_title.count('?') > max_questionmarks: + return None + return original_language_title + if isinstance(original_language_title, dict): + content = original_language_title.get('__content__', '') or '' + if content and content != title and not content.count( + '?') > max_questionmarks: + return content + return None + + +def parse_datacite_titles(titles): + """ + Given a list of title items from datacite, return 3-tuple (title, + original_language_title, subtitle). + + Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}] + """ + title, original_language_title, subtitle = None, None, None + + if titles is None: + return title, original_language_title, subtitle + if len(titles) == 0: + return title, original_language_title, subtitle + elif len(titles) == 1: + original_language_title = find_original_language_title(titles[0]) + title = titles[0].get('title', '') or '' + title = title.strip() + if not title: + title = None + return title, original_language_title, subtitle + else: + for entry in titles: + if not title and ('titleType' not in entry + or not entry.get('titleType')): + title = entry.get('title').strip() + if not subtitle and entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + if not original_language_title: + original_language_title = find_original_language_title(entry) + + return title, original_language_title, subtitle + + +def parse_datacite_dates(dates): + """ + Given a list of date fields (under .dates), return tuple, (release_date, + release_year). 
+ """ + release_date, release_month, release_year = None, None, None + + if not dates: + return release_date, release_month, release_year + + if not isinstance(dates, list): + raise ValueError('expected a list of date items') + + # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", + # "Collected", "Updated", "Copyrighted", "Created" + # Ignored for now: "Collected", "Issued" + date_type_prio = ( + 'Valid', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Created', + 'Updated', + ) + + # We need to note the granularity, since a string like "2019" would be + # parsed into "2019-01-01", even though the month is unknown. Use 3 + # granularity types: 'y', 'm', 'd'. + Pattern = collections.namedtuple('Pattern', 'layout granularity') + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ( + Pattern('%Y-%m-%d', 'd'), + Pattern('%Y-%m', 'm'), + Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), + Pattern('%Y-%m-%dT%H:%M:%S', 'd'), + Pattern('%Y', 'y'), + ) + + def parse_item(item): + result, value, year_only = None, item.get('date', ''), False + release_date, release_month, release_year = None, None, None + + for layout, granularity in common_patterns: + try: + result = datetime.datetime.strptime(value, layout) + except ValueError: + continue + else: + if granularity == 'y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + + # A datetime object, later we need a date, only. 
+ result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), + file=sys.stderr) + + if result is None: + # Unparsable date. + return release_date, release_month, release_year + + if granularity != 'y': + release_date = result.date() + release_year = result.year + if granularity in ('m', 'd'): + release_month = result.month + + return release_date, release_month, release_year + + today = datetime.date.today() + + for prio in date_type_prio: + for item in dates: + if not item.get('dateType') == prio: + continue + + release_date, release_month, release_year = parse_item(item) + if release_date is None and release_year is None: + continue + + if release_year < 1000 or release_year > today.year + 5: + # Skip possibly bogus dates. + release_year = None + continue + break + else: + continue + break + + if release_date is None and release_year is None: + for item in dates: + release_date, release_month, release_year = parse_item(item) + if release_year or release_date: + break + + return release_date, release_month, release_year + +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + + # Not names, but sprinkled in fields where authors live. 
+ stopwords = [s.lower() for s in ( + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + )] + lower = s.lower() + for stop in stopwords: + if stop in lower: + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) diff --git a/python/tests/files/datacite/casecreate.sh b/python/tests/files/datacite/casecreate.sh new file mode 100755 index 00000000..82655dc3 --- /dev/null +++ b/python/tests/files/datacite/casecreate.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# casecreate.sh creates a new test case file pair by copying the last one. +# +set -eo pipefail + +max=$(find . -name 'datacite_doc_*' | sort -n | tail -1 | grep -Eo '[0-9]+') +if [ -z $max ]; then + echo "failed, expected datacite_doc_[NUMBER]..." + exit 1 +fi +new=$((max+1)) +cp "datacite_doc_$max.json" "datacite_doc_$new.json" +cp "datacite_result_$max.json" "datacite_result_$new.json" + +[ -f ./caseview.sh ] && ./caseview.sh "$new" diff --git a/python/tests/files/datacite/caseview.sh b/python/tests/files/datacite/caseview.sh new file mode 100755 index 00000000..d1e98c04 --- /dev/null +++ b/python/tests/files/datacite/caseview.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Open input and output in vertical vim split. 
+# +# $ caseview 13 +# +view() { + if [ -z "$1" ]; then + echo usage: "$0" CASE-NUMBER + exit 1 + else + padded=$(printf "%02d\n" "$1") + vim -O "datacite_doc_$padded.json" "datacite_result_$padded.json" + fi +} + +view "$@" diff --git a/python/tests/files/datacite/datacite_doc_00.json b/python/tests/files/datacite/datacite_doc_00.json new file mode 100644 index 00000000..f60b106f --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_00.json @@ -0,0 +1,140 @@ +{ + "attributes": { + "container": { + "firstPage": "927", + "identifier": "1074-1542", + "identifierType": "ISSN", + "issue": "12", + "lastPage": "930", + "title": "Journal of Chemical Crystallography", + "type": "Journal", + "volume": "38" + }, + "contentUrl": null, + "contributors": [], + "created": "2019-06-18T14:52:19.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Li", + "givenName": "Qian-Jin", + "name": "Li, Qian-Jin", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Yang", + "givenName": "Chun-Long", + "name": "Yang, Chun-Long", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2008-05-30", + "dateType": "Issued" + }, + { + "date": "2019-05-31T04:04:23Z", + "dateType": "Updated" + } + ], + "descriptions": [], + "doi": "10.1007/s10870-008-9413-z", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.1007/s10870-008-9413-z", + "identifierType": "DOI" + }, + { + "identifier": "s10870-008-9413-z", + "identifierType": "Publisher ID" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 1, + "publicationYear": 2008, + "published": "2008", + "publisher": "Springer Science and Business Media LLC", + "reason": null, + "registered": null, + "relatedIdentifiers": [ + { + "relatedIdentifier": "1074-1542", + "relatedIdentifierType": "ISSN", + "relationType": "IsPartOf", + "resourceTypeGeneral": "Collection" + }, + { + "relatedIdentifier": "10.1016/j.bmcl.2005.09.033", 
+ "relatedIdentifierType": "DOI", + "relationType": "References" + }, + { + "relatedIdentifier": "10.1016/s0022-1139(02)00330-5", + "relatedIdentifierType": "DOI", + "relationType": "References" + }, + { + "relatedIdentifier": "10.1016/s0010-8545(01)00337-x", + "relatedIdentifierType": "DOI", + "relationType": "References" + }, + { + "relatedIdentifier": "10.1016/j.tetlet.2005.06.135", + "relatedIdentifierType": "DOI", + "relationType": "References" + }, + { + "relatedIdentifier": "10.1039/p298700000s1", + "relatedIdentifierType": "DOI", + "relationType": "References" + }, + { + "relatedIdentifier": "10.1002/anie.199515551", + "relatedIdentifierType": "DOI", + "relationType": "References" + } + ], + "rightsList": [ + { + "rightsUri": "http://www.springer.com/tdm" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [], + "source": "levriero", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "JournalArticle", + "resourceTypeGeneral": "Text", + "ris": "JOUR", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-03T00:03:40.000Z", + "url": "http://link.springer.com/10.1007/s10870-008-9413-z", + "version": null + }, + "id": "10.1007/s10870-008-9413-z", + "relationships": { + "client": { + "data": { + "id": "crossref.citations", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_01.json b/python/tests/files/datacite/datacite_doc_01.json new file mode 100644 index 00000000..16a446b3 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_01.json @@ -0,0 +1,81 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2016-12-08T07:43:15.000Z", + "creators": [ + { + "affiliation": [], + 
"familyName": "Dargenty", + "givenName": "G.", + "name": "Dargenty, G.", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "1887", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.11588/diglit.25558.39", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.11588/diglit.25558.39", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "fre", + "metadataVersion": 4, + "publicationYear": 1887, + "published": "1887", + "publisher": "University Library Heidelberg", + "reason": null, + "registered": "2016-12-08T07:43:15.000Z", + "relatedIdentifiers": [], + "rightsList": [ + { + "lang": "de", + "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "lang": "de", + "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-02T14:27:33.000Z", + "url": "http://digi.ub.uni-heidelberg.de/diglit/art1887_1/0172", + "version": null + }, + "id": "10.11588/diglit.25558.39", + "relationships": { + "client": { + "data": { + "id": "gesis.ubhd", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_02.json b/python/tests/files/datacite/datacite_doc_02.json new file mode 100644 index 00000000..139e2cb0 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_02.json @@ -0,0 +1,85 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": 
"2018-11-29T12:04:12.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Weyersberg", + "givenName": "Albert", + "name": "Weyersberg, Albert", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "1897", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.11588/diglit.37715.57", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.11588/diglit.37715.57", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "ger", + "metadataVersion": 2, + "publicationYear": 1897, + "published": "1897", + "publisher": "University Library Heidelberg", + "reason": null, + "registered": "2018-11-29T12:04:13.000Z", + "relatedIdentifiers": [], + "rightsList": [ + { + "lang": "de", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/" + }, + { + "lang": "en", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [], + "titles": [ + { + "lang": "de", + "title": "Solinger Schwertschmiede-Familien, [4]" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-02T21:31:04.000Z", + "url": "https://digi.ub.uni-heidelberg.de/diglit/zhwk1897_1899/0131", + "version": null + }, + "id": "10.11588/diglit.37715.57", + "relationships": { + "client": { + "data": { + "id": "gesis.ubhd", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_03.json b/python/tests/files/datacite/datacite_doc_03.json new file mode 100644 index 00000000..80bacabc --- /dev/null 
+++ b/python/tests/files/datacite/datacite_doc_03.json @@ -0,0 +1,70 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2016-11-03T09:07:08.000Z", + "creators": [ + { + "affiliation": [], + "name": "Mastura Yahya" + } + ], + "dates": [ + { + "date": "2016", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.13140/rg.2.2.30434.53446", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.13140/rg.2.2.30434.53446", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "ms", + "metadataVersion": 0, + "publicationYear": 2016, + "published": "2016", + "publisher": "Unpublished", + "reason": null, + "registered": "2016-11-03T09:07:09.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "midterm ah30903" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + }, + "updated": "2019-08-02T12:51:15.000Z", + "url": "http://rgdoi.net/10.13140/RG.2.2.30434.53446", + "version": null + }, + "id": "10.13140/rg.2.2.30434.53446", + "relationships": { + "client": { + "data": { + "id": "rg.rg", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_04.json b/python/tests/files/datacite/datacite_doc_04.json new file mode 100644 index 00000000..f7d06a75 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_04.json @@ -0,0 +1,80 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2015-11-11T11:12:34.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Nicollerat", + "givenName": "Marc Andre", + "name": "Nicollerat, Marc Andre", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "1973", + 
"dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor J : K(A) → K(I) and a natural transformation [formula omitted] (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. 
We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.", + "descriptionType": "Abstract" + } + ], + "doi": "10.14288/1.0080520", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.14288/1.0080520", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "en", + "metadataVersion": 5, + "publicationYear": 1973, + "published": "1973", + "publisher": "University of British Columbia", + "reason": null, + "registered": "2015-11-11T11:12:35.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "On chain maps inducing isomorphisms in homology" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "Text", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-02T09:43:14.000Z", + "url": "https://doi.library.ubc.ca/10.14288/1.0080520", + "version": null + }, + "id": "10.14288/1.0080520", + "relationships": { + "client": { + "data": { + "id": "cisti.ubc", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_05.json b/python/tests/files/datacite/datacite_doc_05.json new file mode 100644 index 00000000..76fb73a8 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_05.json @@ -0,0 +1,598 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [ + { + "affiliation": [], + "name": "Kessy Abarenkov" + }, + { + "affiliation": [], + "name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" + } + ], + "created": "2015-06-05T10:23:18.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Kõljalg", + "givenName": "Urmas", + "name": "Kõljalg, Urmas", + "nameType": "Personal" + }, 
+ { + "affiliation": [], + "familyName": "Abarenkov", + "givenName": "Kessy", + "name": "Abarenkov, Kessy", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Nilsson", + "givenName": "R. Henrik", + "name": "Nilsson, R. Henrik", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Larsson", + "givenName": "Karl-Henrik", + "name": "Larsson, Karl-Henrik", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Aas", + "givenName": "Anders Bjørnsgard", + "name": "Aas, Anders Bjørnsgard", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Adams", + "givenName": "Rachel", + "name": "Adams, Rachel", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Alves", + "givenName": "Artur", + "name": "Alves, Artur", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Ammirati", + "givenName": "Joseph F.", + "name": "Ammirati, Joseph F.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Arnold", + "givenName": "A. Elizabeth", + "name": "Arnold, A. 
Elizabeth", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Bahram", + "givenName": "Mohammad", + "name": "Bahram, Mohammad", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Bengtsson-Palme", + "givenName": "Johan", + "name": "Bengtsson-Palme, Johan", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Berlin", + "givenName": "Anna", + "name": "Berlin, Anna", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Botnen", + "givenName": "Synnøve", + "name": "Botnen, Synnøve", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Bourlat", + "givenName": "Sarah", + "name": "Bourlat, Sarah", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Cheeke", + "givenName": "Tanya", + "name": "Cheeke, Tanya", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Dima", + "givenName": "Bálint", + "name": "Dima, Bálint", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Drenkhan", + "givenName": "Rein", + "name": "Drenkhan, Rein", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Duarte", + "givenName": "Camila", + "name": "Duarte, Camila", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Dueñas", + "givenName": "Margarita", + "name": "Dueñas, Margarita", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Eberhardt", + "givenName": "Ursula", + "name": "Eberhardt, Ursula", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Friberg", + "givenName": "Hanna", + "name": "Friberg, Hanna", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Frøslev", + "givenName": "Tobias G.", + "name": "Frøslev, Tobias G.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Garnica", + "givenName": "Sigisfredo", + "name": "Garnica, Sigisfredo", + "nameType": "Personal" + }, + { + "affiliation": [], + 
"familyName": "Geml", + "givenName": "József", + "name": "Geml, József", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Ghobad-Nejhad", + "givenName": "Masoomeh", + "name": "Ghobad-Nejhad, Masoomeh", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Grebenc", + "givenName": "Tine", + "name": "Grebenc, Tine", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Griffith", + "givenName": "Gareth W.", + "name": "Griffith, Gareth W.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Hampe", + "givenName": "Felix", + "name": "Hampe, Felix", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Kennedy", + "givenName": "Peter", + "name": "Kennedy, Peter", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Khomich", + "givenName": "Maryia", + "name": "Khomich, Maryia", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Kohout", + "givenName": "Petr", + "name": "Kohout, Petr", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Kollom", + "givenName": "Anu", + "name": "Kollom, Anu", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Larsson", + "givenName": "Ellen", + "name": "Larsson, Ellen", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Laszlo", + "givenName": "Irinyi", + "name": "Laszlo, Irinyi", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Leavitt", + "givenName": "Steven", + "name": "Leavitt, Steven", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Liimatainen", + "givenName": "Kare", + "name": "Liimatainen, Kare", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Lindahl", + "givenName": "Björn", + "name": "Lindahl, Björn", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Lodge", + "givenName": "Deborah J.", + "name": "Lodge, Deborah J.", + "nameType": 
"Personal" + }, + { + "affiliation": [], + "familyName": "Lumbsch", + "givenName": "Helge Thorsten", + "name": "Lumbsch, Helge Thorsten", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Martín Esteban", + "givenName": "María Paz", + "name": "Martín Esteban, María Paz", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Meyer", + "givenName": "Wieland", + "name": "Meyer, Wieland", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Miettinen", + "givenName": "Otto", + "name": "Miettinen, Otto", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Nguyen", + "givenName": "Nhu", + "name": "Nguyen, Nhu", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Niskanen", + "givenName": "Tuula", + "name": "Niskanen, Tuula", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Oono", + "givenName": "Ryoko", + "name": "Oono, Ryoko", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Öpik", + "givenName": "Maarja", + "name": "Öpik, Maarja", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Ordynets", + "givenName": "Alexander", + "name": "Ordynets, Alexander", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Pawłowska", + "givenName": "Julia", + "name": "Pawłowska, Julia", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Peintner", + "givenName": "Ursula", + "name": "Peintner, Ursula", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Pereira", + "givenName": "Olinto Liparini", + "name": "Pereira, Olinto Liparini", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Pinho", + "givenName": "Danilo Batista", + "name": "Pinho, Danilo Batista", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Põldmaa", + "givenName": "Kadri", + "name": "Põldmaa, Kadri", + "nameType": "Personal" + }, + { + "affiliation": [], + 
"familyName": "Runnel", + "givenName": "Kadri", + "name": "Runnel, Kadri", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Ryberg", + "givenName": "Martin", + "name": "Ryberg, Martin", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Saar", + "givenName": "Irja", + "name": "Saar, Irja", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Sanli", + "givenName": "Kemal", + "name": "Sanli, Kemal", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Scott", + "givenName": "James", + "name": "Scott, James", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Spirin", + "givenName": "Viacheslav", + "name": "Spirin, Viacheslav", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Suija", + "givenName": "Ave", + "name": "Suija, Ave", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Svantesson", + "givenName": "Sten", + "name": "Svantesson, Sten", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Tadych", + "givenName": "Mariusz", + "name": "Tadych, Mariusz", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Takamatsu", + "givenName": "Susumu", + "name": "Takamatsu, Susumu", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Tamm", + "givenName": "Heidi", + "name": "Tamm, Heidi", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Taylor", + "givenName": "AFS.", + "name": "Taylor, AFS.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Tedersoo", + "givenName": "Leho", + "name": "Tedersoo, Leho", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Telleria", + "givenName": "M.T.", + "name": "Telleria, M.T.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Udayanga", + "givenName": "Dhanushka", + "name": "Udayanga, Dhanushka", + "nameType": "Personal" + }, + { + "affiliation": 
[], + "familyName": "Unterseher", + "givenName": "Martin", + "name": "Unterseher, Martin", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Volobuev", + "givenName": "Sergey", + "name": "Volobuev, Sergey", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Weiss", + "givenName": "Michael", + "name": "Weiss, Michael", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Wurzbacher", + "givenName": "Christian", + "name": "Wurzbacher, Christian", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2016-04-22", + "dateType": "Updated" + }, + { + "date": "2014-10-05", + "dateType": "Created" + }, + { + "date": "2015", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. 
The system and the data are updated automatically as the number of public fungal ITS sequences grows.", + "descriptionType": "Abstract" + } + ], + "doi": "10.15156/bio/sh409843.07fu", + "formats": [ + "application/json" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.15156/bio/sh409843.07fu", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "eng", + "metadataVersion": 1, + "publicationYear": 2015, + "published": "2015", + "publisher": "UNITE Community", + "reason": null, + "registered": "2015-06-05T10:23:19.000Z", + "relatedIdentifiers": [], + "rightsList": [ + { + "rights": "Attribution-NonCommercial (CC BY-NC)", + "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "SH409843.07FU" + }, + { + "title": "Gomphales", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceType": "Dataset/UNITE Species Hypothesis", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T07:45:28.000Z", + "url": "https://plutof.ut.ee/#/datacite/10.15156/BIO/SH409843.07FU", + "version": null + }, + "id": "10.15156/bio/sh409843.07fu", + "relationships": { + "client": { + "data": { + "id": "estdoi.bio", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_06.json b/python/tests/files/datacite/datacite_doc_06.json new file mode 100644 index 00000000..01cb2cb3 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_06.json @@ -0,0 +1,83 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2017-12-13T12:03:09.000Z", + "creators": [ + { + "affiliation": [], + "name": "Crispijn De Passe (Der Ältere) (1564-1637)", + "nameType": 
"Personal" + } + ], + "dates": [ + { + "date": "1590", + "dateType": "Available" + }, + { + "date": "1590", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.16903/ethz-grs-d_006220", + "formats": [ + "Blattgrösse: 21.0 x 14.4 x 0.0 cm (beschnitten)", + "Kupferstich" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.16903/ethz-grs-d_006220", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 1, + "publicationYear": 1590, + "published": "1590", + "publisher": "n.a.", + "reason": null, + "registered": "2017-12-13T12:03:09.000Z", + "relatedIdentifiers": [], + "rightsList": [ + { + "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "resourceTypeGeneral": "InteractiveResource", + "ris": "GEN", + "schemaOrg": "CreativeWork" + }, + "updated": "2019-08-02T17:20:02.000Z", + "url": "http://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ExternalInterface&module=collection&objectId=29469&viewType=detailView", + "version": null + }, + "id": "10.16903/ethz-grs-d_006220", + "relationships": { + "client": { + "data": { + "id": "ethz.gs", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_07.json b/python/tests/files/datacite/datacite_doc_07.json new file mode 100644 index 00000000..8e292fea --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_07.json @@ -0,0 +1,120 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2016-11-21T13:08:14.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "ROTHUIZEN", + 
"givenName": "E.", + "name": "ROTHUIZEN, E.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "ELMEGAARD", + "givenName": "B.", + "name": "ELMEGAARD, B.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "MARKUSSEN W.", + "givenName": "B.", + "name": "MARKUSSEN W., B.", + "nameType": "Personal" + }, + { + "affiliation": [], + "name": "Et Al." + } + ], + "dates": [ + { + "date": "2015", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. 
This paper shows that there is a good consistency between the model and the experimental tests.", + "descriptionType": "Abstract" + } + ], + "doi": "10.18462/iir.icr.2015.0926", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.18462/iir.icr.2015.0926", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "eng", + "metadataVersion": 0, + "publicationYear": 2015, + "published": "2015", + "publisher": "International Institute of Refrigeration (IIR)", + "reason": null, + "registered": "2016-11-21T13:08:14.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": null, + "sizes": [], + "source": null, + "state": "findable", + "subjects": [ + { + "subject": "HEAT PUMP" + }, + { + "subject": "HOT WATER" + }, + { + "subject": "HEAT TRANSFER" + }, + { + "subject": "PERFORMANCE" + }, + { + "subject": "THERMAL STORAGE" + }, + { + "subject": "TANK" + }, + { + "subject": "MODEL" + } + ], + "titles": [ + { + "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation." 
+ } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-16T18:00:59.000Z", + "url": "http://www.iifiir.org/clientBookline/service/reference.asp?INSTANCE=EXPLOITATION&OUTPUT=PORTAL&DOCID=IFD_REFDOC_0015008&DOCBASE=IFD_REFDOC_EN&SETLANGUAGE=EN", + "version": null + }, + "id": "10.18462/iir.icr.2015.0926", + "relationships": { + "client": { + "data": { + "id": "inist.iif", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_08.json b/python/tests/files/datacite/datacite_doc_08.json new file mode 100644 index 00000000..84f756e8 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_08.json @@ -0,0 +1,105 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2019-08-24T07:46:47.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Kajisa", + "givenName": "Kei", + "name": "Kajisa, Kei", + "nameIdentifiers": [], + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Kajisa", + "givenName": "Kei", + "name": "Kajisa, Kei", + "nameIdentifiers": [], + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. 
Using the historical review of Japan’s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", + "descriptionType": "Abstract" + } + ], + "doi": "10.22004/ag.econ.284864", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.22004/ag.econ.284864", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "eng", + "metadataVersion": 1, + "publicationYear": 2017, + "published": "2017", + "publisher": "Unknown", + "reason": null, + "registered": "2019-08-24T07:46:47.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": null, + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [ + { + "subject": "Land Economics/Use" + }, + { + "subject": "irrigation", + "subjectScheme": "keyword" + }, + { + "subject": "industrialization", + "subjectScheme": "keyword" + }, + { + "subject": "collective action", + "subjectScheme": "keyword" + } + ], + "titles": [ + { + "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "Text", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-25T09:38:33.000Z", 
+ "url": "https://ageconsearch.umn.edu/record/284864", + "version": null + }, + "id": "10.22004/ag.econ.284864", + "relationships": { + "client": { + "data": { + "id": "tind.agecon", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_09.json b/python/tests/files/datacite/datacite_doc_09.json new file mode 100644 index 00000000..d6617d0d --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_09.json @@ -0,0 +1,130 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [ + { + "affiliation": [], + "contributorType": "HostingInstitution", + "name": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover", + "nameIdentifiers": [], + "nameType": "Organizational" + }, + { + "affiliation": [], + "contributorType": "DataManager", + "name": "Technische Informationsbibliothek (TIB)", + "nameIdentifiers": [] + } + ], + "created": "2017-02-25T00:00:18.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Kirstaedter", + "givenName": "Nils", + "name": "Kirstaedter, Nils", + "nameIdentifiers": [], + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2016", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.2314/gbv:880813733", + "formats": [ + "application/pdf" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.2314/gbv:880813733", + "identifierType": "DOI" + }, + { + "identifier": "880813733", + "identifierType": "ppn" + }, + { + "identifier": "03WKCF3C", + "identifierType": "contract" + }, + { + "identifier": "01132105", + "identifierType": "contract" + }, + { + "identifier": "GBV:880813733", + "identifierType": "firstid" + }, + { + "identifier": "TIBKAT:880813733", + "identifierType": "ftx-id" + } + ], + "isActive": true, + "language": "de", + "metadataVersion": 9, + "publicationYear": 2016, + "published": "2016", + "publisher": "[Lumics GmbH]", + "reason": null, + 
"registered": "2017-02-25T00:00:19.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [ + "1 Online-Ressource (10 Seiten, 1,40 MB)" + ], + "source": "mds", + "state": "findable", + "subjects": [ + { + "subject": "Direktdiodenlasersysteme" + }, + { + "subject": "Physics", + "subjectScheme": "linsearch" + } + ], + "titles": [ + { + "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht" + }, + { + "title": "Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul)", + "titleType": "AlternativeTitle" + }, + { + "title": "Direktdiodenlaseranlagen und -systeme (VP3)", + "titleType": "AlternativeTitle" + } + ], + "types": { + "bibtex": "article", + "citeproc": "report", + "resourceType": "Report", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-03T05:53:51.000Z", + "url": "https://www.tib.eu/suchen/id/TIBKAT:880813733/", + "version": "1.0" + }, + "id": "10.2314/gbv:880813733", + "relationships": { + "client": { + "data": { + "id": "tib.tib", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_10.json b/python/tests/files/datacite/datacite_doc_10.json new file mode 100644 index 00000000..154242cb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_10.json @@ -0,0 +1,83 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2018-09-09T08:32:09.000Z", + "creators": [ + { + "affiliation": [], + "name": "Unknown" + } + ], + "dates": [ + { + "date": "2012", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "descriptionType": "Abstract" + } + ], + "doi": 
"10.25549/wpacards-m6171", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.25549/wpacards-m6171", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "eng", + "metadataVersion": 0, + "publicationYear": 2012, + "published": "2012", + "publisher": "University of Southern California Digital Library (USC.DL)", + "reason": null, + "registered": "2018-09-09T08:33:10.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [ + { + "subject": "housing areas" + }, + { + "subject": "Dwellings" + } + ], + "titles": [ + { + "title": "WPA household census for 210 E VERNON, Los Angeles" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T20:03:32.000Z", + "url": "http://digitallibrary.usc.edu/cdm/ref/collection/p15799coll8/id/2608", + "version": null + }, + "id": "10.25549/wpacards-m6171", + "relationships": { + "client": { + "data": { + "id": "usc.dl", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_11.json b/python/tests/files/datacite/datacite_doc_11.json new file mode 100644 index 00000000..80194762 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_11.json @@ -0,0 +1,86 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2019-03-04T23:56:42.000Z", + "creators": [ + { + "affiliation": [], + "name": "Comet Photo AG (Zürich)" + } + ], + "dates": [ + { + "date": "1965", + "dateType": "Available" + }, + { + "date": "1965", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "Download und Nutzung frei", + "descriptionType": "Other" + }, + { + "description": "10, N1, Genève, 
Bern, Zürich, Sankt Gallen, Sankt Margrethen, Strassen, Strassenbau, 2.", + "descriptionType": "Other" + } + ], + "doi": "10.3932/ethz-a-000055869", + "formats": [ + "TIFF-Bild" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.3932/ethz-a-000055869", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": "de", + "metadataVersion": 6, + "publicationYear": 1965, + "published": "1965", + "publisher": "ETH-Bibliothek Zürich, Bildarchiv", + "reason": null, + "registered": "2019-07-30T13:17:45.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "N1 bei Safenwil" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "graphic", + "resourceTypeGeneral": "Image", + "ris": "FIGURE", + "schemaOrg": "ImageObject" + }, + "updated": "2019-08-02T22:08:26.000Z", + "url": "http://ba.e-pics.ethz.ch/link.jsp?id=44861", + "version": null + }, + "id": "10.3932/ethz-a-000055869", + "relationships": { + "client": { + "data": { + "id": "ethz.epics-ba", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_12.json b/python/tests/files/datacite/datacite_doc_12.json new file mode 100644 index 00000000..642011d5 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_12.json @@ -0,0 +1,103 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2019-06-27T01:01:35.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Spanias", + "givenName": "Charalampos", + "name": "Spanias, Charalampos", + "nameIdentifiers": [], + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Nikolaidis", + "givenName": "Pantelis T", + "name": "Nikolaidis, Pantelis T", + "nameIdentifiers": [], + "nameType": "Personal" + }, + { + "affiliation": [], + 
"familyName": "Rosemann", + "givenName": "Thomas", + "name": "Rosemann, Thomas", + "nameIdentifiers": [], + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Knechtle", + "givenName": "Beat", + "name": "Knechtle, Beat", + "nameIdentifiers": [], + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2019-06-14", + "dateType": "Available" + }, + { + "date": "2019", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.5167/uzh-171449", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.5167/uzh-171449", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 0, + "publicationYear": 2019, + "published": "2019", + "publisher": "MDPI Publishing", + "reason": null, + "registered": "2019-06-27T01:01:36.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": null, + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceTypeGeneral": "Text", + "ris": "RPRT", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-09-26T16:44:24.000Z", + "url": "https://www.zora.uzh.ch/id/eprint/171449", + "version": null + }, + "id": "10.5167/uzh-171449", + "relationships": { + "client": { + "data": { + "id": "ethz.zora", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_13.json b/python/tests/files/datacite/datacite_doc_13.json new file mode 100644 index 00000000..0cada273 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_13.json @@ -0,0 +1,86 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2013-03-22T14:02:08.000Z", + "creators": [ + { + "affiliation": [], + "name": "O.M." 
+ }, + { + "affiliation": [], + "familyName": "Hiltbrunner", + "givenName": "Hermann", + "name": "Hiltbrunner, Hermann", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "1940-10-05", + "dateType": "Available" + }, + { + "date": "1940", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.5169/seals-314104", + "formats": [ + "text/html", + "application/pdf" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.5169/seals-314104", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 17, + "publicationYear": 1940, + "published": "1940", + "publisher": "Buchdruckerei Büchler & Co.", + "reason": null, + "registered": "2013-03-22T13:58:11.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "[Müssen wir des Glücks uns schämen?]" + } + ], + "types": { + "bibtex": "article", + "citeproc": "article-journal", + "resourceType": "Journal Article", + "resourceTypeGeneral": "Text", + "ris": "JOUR", + "schemaOrg": "ScholarlyArticle" + }, + "updated": "2019-08-02T02:22:55.000Z", + "url": "https://www.e-periodica.ch/digbib/view?pid=sle-001:1940-1941:45::13", + "version": null + }, + "id": "10.5169/seals-314104", + "relationships": { + "client": { + "data": { + "id": "ethz.seals", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_14.json b/python/tests/files/datacite/datacite_doc_14.json new file mode 100644 index 00000000..c0911819 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_14.json @@ -0,0 +1,166 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2014-03-18T07:28:28.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Stulz", + "givenName": "E.", + "name": 
"Stulz, E.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Scott", + "givenName": "S.M.", + "name": "Scott, S.M.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Ng", + "givenName": "Yiu-Fai", + "name": "Ng, Yiu-Fai", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Bond", + "givenName": "A.D.", + "name": "Bond, A.D.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Teat", + "givenName": "S.J.", + "name": "Teat, S.J.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Darling", + "givenName": "S.L.", + "name": "Darling, S.L.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Feeder", + "givenName": "N.", + "name": "Feeder, N.", + "nameType": "Personal" + }, + { + "affiliation": [], + "familyName": "Sanders", + "givenName": "J.K.M.", + "name": "Sanders, J.K.M.", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2004", + "dateType": "Issued" + } + ], + "descriptions": [ + { + "description": "Related Article: E.Stulz, S.M.Scott, Yiu-Fai Ng, A.D.Bond, S.J.Teat, S.L.Darling, N.Feeder, J.K.M.Sanders|2003|Inorg.Chem.|42|6564|doi:10.1021/ic034699w", + "descriptionType": "Other" + }, + { + "description": "An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. 
The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", + "descriptionType": "Abstract" + } + ], + "doi": "10.5517/cc7gns3", + "formats": [ + "CIF" + ], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.5517/cc7gns3", + "identifierType": "DOI" + }, + { + "identifier": "222635", + "identifierType": "CCDC" + } + ], + "isActive": true, + "language": "eng", + "metadataVersion": 2, + "publicationYear": 2004, + "published": "2004", + "publisher": "Cambridge Crystallographic Data Centre", + "reason": null, + "registered": "2014-03-18T07:28:29.000Z", + "relatedIdentifiers": [ + { + "relatedIdentifier": "10.1021/ic034699w", + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo" + } + ], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [ + { + "subject": "Crystal Structure" + }, + { + "subject": "Experimental 3D Coordinates" + }, + { + "subject": "Crystal System" + }, + { + "subject": "Space Group" + }, + { + "subject": "Cell Parameters" + }, + { + "subject": "Crystallography" + }, + { + "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate" + } + ], + "titles": [ + { + "title": "CCDC 222635: Experimental Crystal Structure Determination" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T03:38:32.000Z", + "url": "http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/cc7gns3&sid=DataCite", + "version": null + }, + "id": 
"10.5517/cc7gns3", + "relationships": { + "client": { + "data": { + "id": "ccdc.csd", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_15.json b/python/tests/files/datacite/datacite_doc_15.json new file mode 100644 index 00000000..8dc67267 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_15.json @@ -0,0 +1,79 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2017-02-01T18:20:04.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Richardson", + "givenName": "David", + "name": "Richardson, David", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "identifierType": "DOI" + }, + { + "identifier": "https://pasta.lternet.edu/package/eml/knb-lter-vcr/102/16", + "identifierType": "URL" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 1, + "publicationYear": 2017, + "published": "2017", + "publisher": "Environmental Data Initiative", + "reason": null, + "registered": "2017-02-01T18:20:05.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-2.2", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceType": "dataPackage", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T14:16:49.000Z", + "url": "https://portal.lternet.edu/nis/mapbrowse?packageid=knb-lter-vcr.102.16", + "version": null + }, + "id": 
"10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28", + "relationships": { + "client": { + "data": { + "id": "edi.edi", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_16.json b/python/tests/files/datacite/datacite_doc_16.json new file mode 100644 index 00000000..72ad59ac --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_16.json @@ -0,0 +1,80 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2014-12-31T15:38:16.000Z", + "creators": [ + { + "affiliation": [], + "familyName": "Sochi", + "givenName": "Taha", + "name": "Sochi, Taha", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2014", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.6084/m9.figshare.1282478", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.6084/m9.figshare.1282478", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 0, + "publicationYear": 2014, + "published": "2014", + "publisher": "Figshare", + "reason": null, + "registered": "2014-12-31T15:38:18.000Z", + "relatedIdentifiers": [], + "rightsList": [ + { + "rights": "CC-BY", + "rightsUri": "http://creativecommons.org/licenses/by/3.0/us" + } + ], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": null, + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Testing the Connectivity of Networks" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceType": "Paper", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T04:52:11.000Z", + "url": "http://figshare.com/articles/Testing_the_Connectivity_of_Networks/1282478", + "version": null + }, + "id": "10.6084/m9.figshare.1282478", + "relationships": { + "client": { + "data": { + "id": "figshare.ars", + "type": 
"clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_17.json b/python/tests/files/datacite/datacite_doc_17.json new file mode 100644 index 00000000..93ec715e --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_17.json @@ -0,0 +1,72 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2018-08-22T17:36:10.000Z", + "creators": [ + { + "affiliation": [], + "name": "Di Giovanna, Antonino Paolo (University Of Florence)", + "nameType": "Personal" + } + ], + "dates": [ + { + "date": "2018", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.7910/dvn/tsqfwc/yytj22", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.7910/dvn/tsqfwc/yytj22", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 0, + "publicationYear": 2018, + "published": "2018", + "publisher": "Harvard Dataverse", + "reason": null, + "registered": "2018-08-22T17:37:30.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-4", + "sizes": [], + "source": "mds", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "gel_BSA-FITC_Markov_segmntation0343.tif" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "dataset", + "resourceTypeGeneral": "Dataset", + "ris": "DATA", + "schemaOrg": "Dataset" + }, + "updated": "2019-08-02T19:43:20.000Z", + "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TSQFWC/YYTJ22", + "version": null + }, + "id": "10.7910/dvn/tsqfwc/yytj22", + "relationships": { + "client": { + "data": { + "id": "gdcc.harvard-dv", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_18.json b/python/tests/files/datacite/datacite_doc_18.json new file mode 100644 index 00000000..b5c41b68 --- /dev/null +++ 
b/python/tests/files/datacite/datacite_doc_18.json @@ -0,0 +1,79 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2017-11-29T02:15:31.000Z", + "creators": [ + { + "affiliation": [], + "name": "(:Unav)", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-21", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.7916/d81z522m", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.7916/d81z522m", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 2, + "publicationYear": 2017, + "published": "2017", + "publisher": "Columbia University", + "reason": null, + "registered": "2017-11-29T02:15:32.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": "ez", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + }, + "updated": "2019-08-04T13:17:58.000Z", + "url": "https://dlc.library.columbia.edu/lcaaj/cul:k3j9kd52d6", + "version": null + }, + "id": "10.7916/d81z522m", + "relationships": { + "client": { + "data": { + "id": "cul.columbia", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_19.json b/python/tests/files/datacite/datacite_doc_19.json new file mode 100644 index 00000000..9fbe7372 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_19.json @@ -0,0 +1,79 @@ +{ + "attributes": { + "container": {}, + "contentUrl": null, + "contributors": [], + "created": "2017-11-29T09:29:33.000Z", + "creators": [ + { + "affiliation": [], 
+ "name": "(:Unav)", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [], + "doi": "10.7916/d86x0cg1", + "formats": [], + "fundingReferences": [], + "geoLocations": [], + "identifiers": [ + { + "identifier": "https://doi.org/10.7916/d86x0cg1", + "identifierType": "DOI" + } + ], + "isActive": true, + "language": null, + "metadataVersion": 3, + "publicationYear": 2017, + "published": "2017", + "publisher": "Columbia University", + "reason": null, + "registered": "2017-11-29T09:29:34.000Z", + "relatedIdentifiers": [], + "rightsList": [], + "schemaVersion": "http://datacite.org/schema/kernel-3", + "sizes": [], + "source": "ez", + "state": "findable", + "subjects": [], + "titles": [ + { + "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + }, + "updated": "2019-08-04T23:43:40.000Z", + "url": "https://dlc.library.columbia.edu/lcaaj/cul:44j0zpc98s", + "version": null + }, + "id": "10.7916/d86x0cg1", + "relationships": { + "client": { + "data": { + "id": "cul.columbia", + "type": "clients" + } + } + }, + "type": "dois" +} diff --git a/python/tests/files/datacite/datacite_doc_20.json b/python/tests/files/datacite/datacite_doc_20.json new file mode 100644 index 00000000..7126ee37 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_20.json @@ -0,0 +1,41 @@ +{ + "attributes": { + "creators": [ + { + "affiliation": [], + "name": "(:Unav)", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": null, + "publicationYear": 2017, + 
"state": "findable", + "titles": [ + { + "title": "<h1>Eastern questionnaire</h1>" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_21.json b/python/tests/files/datacite/datacite_doc_21.json new file mode 100644 index 00000000..248879c2 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_21.json @@ -0,0 +1,41 @@ +{ + "attributes": { + "creators": [ + { + "affiliation": [], + "name": "(:Unav)", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "GERMAN", + "publicationYear": 2017, + "state": "findable", + "titles": [ + { + "title": "ABC" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_22.json b/python/tests/files/datacite/datacite_doc_22.json new file mode 100644 index 00000000..0f7c5e57 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_22.json @@ -0,0 +1,43 @@ +{ + "attributes": { + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "GERMAN", + "publicationYear": 2017, + "state": "findable", + "titles": [ + { + "title": "ABC" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_23.json 
b/python/tests/files/datacite/datacite_doc_23.json new file mode 100644 index 00000000..b755f1a5 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_23.json @@ -0,0 +1,43 @@ +{ + "attributes": { + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1–xxx", + "isActive": true, + "language": "GERMAN", + "publicationYear": 2017, + "state": "findable", + "titles": [ + { + "title": "ABC" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_24.json b/python/tests/files/datacite/datacite_doc_24.json new file mode 100644 index 00000000..4023055b --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_24.json @@ -0,0 +1,47 @@ +{ + "attributes": { + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_25.json b/python/tests/files/datacite/datacite_doc_25.json new file mode 100644 index 00000000..2b219728 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_25.json @@ -0,0 +1,47 @@ +{ + "attributes": { + "creators": [ + 
{ + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json new file mode 100644 index 00000000..36fa565d --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_26.json @@ -0,0 +1,57 @@ +{ + "attributes": { + "contributors": [ + { + "affiliation": [], + "contributorType": "Editor", + "familyName": "Wemmer", + "givenName": "David", + "name": "Wemmer, David", + "nameType": "Personal" + } + ], + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json new file mode 100644 index 00000000..89450f9d --- /dev/null +++ 
b/python/tests/files/datacite/datacite_result_00.json @@ -0,0 +1,92 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "Qian-Jin", + "index": 0, + "raw_name": "Qian-Jin Li", + "role": "author", + "surname": "Li" + }, + { + "given_name": "Chun-Long", + "index": 1, + "raw_name": "Chun-Long Yang", + "role": "author", + "surname": "Yang" + } + ], + "ext_ids": { + "doi": "10.1007/s10870-008-9413-z" + }, + "extra": { + "container_name": "Journal of Chemical Crystallography", + "datacite": { + "license": [ + { + "rightsUri": "http://www.springer.com/tdm" + } + ], + "metadataVersion": 1, + "relations": [ + { + "relatedIdentifier": "1074-1542", + "relatedIdentifierType": "ISSN", + "relationType": "IsPartOf", + "resourceTypeGeneral": "Collection" + } + ], + "resourceType": "JournalArticle", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-4" + }, + "release_month": 5 + }, + "issue": "12", + "pages": "927-930", + "publisher": "Springer Science and Business Media LLC", + "refs": [ + { + "extra": { + "doi": "10.1016/j.bmcl.2005.09.033" + }, + "index": 0 + }, + { + "extra": { + "doi": "10.1016/s0022-1139(02)00330-5" + }, + "index": 1 + }, + { + "extra": { + "doi": "10.1016/s0010-8545(01)00337-x" + }, + "index": 2 + }, + { + "extra": { + "doi": "10.1016/j.tetlet.2005.06.135" + }, + "index": 3 + }, + { + "extra": { + "doi": "10.1039/p298700000s1" + }, + "index": 4 + }, + { + "extra": { + "doi": "10.1002/anie.199515551" + }, + "index": 5 + } + ], + "release_date": "2019-05-31", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2019, + "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea", + "volume": "38" +} diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json new file mode 100644 index 00000000..9fc62db4 --- /dev/null +++ 
b/python/tests/files/datacite/datacite_result_01.json @@ -0,0 +1,36 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "G.", + "index": 0, + "raw_name": "G. Dargenty", + "role": "author", + "surname": "Dargenty" + } + ], + "ext_ids": { + "doi": "10.11588/diglit.25558.39" + }, + "extra": { + "datacite": { + "license": [ + { + "lang": "de", + "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html" + } + ], + "metadataVersion": 4, + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-4" + } + }, + "language": "fr", + "publisher": "University Library Heidelberg", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1887, + "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887" +} diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json new file mode 100644 index 00000000..d6b9556f --- /dev/null +++ b/python/tests/files/datacite/datacite_result_02.json @@ -0,0 +1,40 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "Albert", + "index": 0, + "raw_name": "Albert Weyersberg", + "role": "author", + "surname": "Weyersberg" + } + ], + "ext_ids": { + "doi": "10.11588/diglit.37715.57" + }, + "extra": { + "datacite": { + "license": [ + { + "lang": "de", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/" + }, + { + "lang": "en", + "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/" + } + ], + "metadataVersion": 2, + "resourceType": "DigitalisatDigital copy", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-4" + } + }, + 
"language": "de", + "publisher": "University Library Heidelberg", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1897, + "title": "Solinger Schwertschmiede-Familien, [4]" +} diff --git a/python/tests/files/datacite/datacite_result_03.json b/python/tests/files/datacite/datacite_result_03.json new file mode 100644 index 00000000..6aa65aee --- /dev/null +++ b/python/tests/files/datacite/datacite_result_03.json @@ -0,0 +1,23 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "Mastura Yahya", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.13140/rg.2.2.30434.53446" + }, + "extra": { + "datacite": { + "schemaVersion": "http://datacite.org/schema/kernel-3" + } + }, + "language": "ms", + "refs": [], + "release_type": "article", + "release_year": 2016, + "title": "midterm ah30903" +} diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json new file mode 100644 index 00000000..571c3f64 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_04.json @@ -0,0 +1,36 @@ +{ + "abstracts": [ + { + "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor J : K(A) → K(I) and a natural transformation [formula omitted] (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. 
The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.", + "lang": "en", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "Marc Andre", + "index": 0, + "raw_name": "Marc Andre Nicollerat", + "role": "author", + "surname": "Nicollerat" + } + ], + "ext_ids": { + "doi": "10.14288/1.0080520" + }, + "extra": { + "datacite": { + "metadataVersion": 5, + "resourceType": "Text", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-3" + } + }, + "language": "en", + "publisher": "University of British Columbia", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1973, + "title": "On chain maps inducing isomorphisms in homology" +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json new file mode 100644 index 00000000..5b7b4ed2 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_05.json @@ -0,0 +1,542 @@ +{ + "abstracts": [ + { + "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). 
All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.", + "lang": "en", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "Urmas", + "index": 0, + "raw_name": "Urmas Kõljalg", + "role": "author", + "surname": "Kõljalg" + }, + { + "given_name": "Kessy", + "index": 1, + "raw_name": "Kessy Abarenkov", + "role": "author", + "surname": "Abarenkov" + }, + { + "given_name": "R. Henrik", + "index": 2, + "raw_name": "R. Henrik Nilsson", + "role": "author", + "surname": "Nilsson" + }, + { + "given_name": "Karl-Henrik", + "index": 3, + "raw_name": "Karl-Henrik Larsson", + "role": "author", + "surname": "Larsson" + }, + { + "given_name": "Anders Bjørnsgard", + "index": 4, + "raw_name": "Anders Bjørnsgard Aas", + "role": "author", + "surname": "Aas" + }, + { + "given_name": "Rachel", + "index": 5, + "raw_name": "Rachel Adams", + "role": "author", + "surname": "Adams" + }, + { + "given_name": "Artur", + "index": 6, + "raw_name": "Artur Alves", + "role": "author", + "surname": "Alves" + }, + { + "given_name": "Joseph F.", + "index": 7, + "raw_name": "Joseph F. 
Ammirati", + "role": "author", + "surname": "Ammirati" + }, + { + "given_name": "A. Elizabeth", + "index": 8, + "raw_name": "A. Elizabeth Arnold", + "role": "author", + "surname": "Arnold" + }, + { + "given_name": "Mohammad", + "index": 9, + "raw_name": "Mohammad Bahram", + "role": "author", + "surname": "Bahram" + }, + { + "given_name": "Johan", + "index": 10, + "raw_name": "Johan Bengtsson-Palme", + "role": "author", + "surname": "Bengtsson-Palme" + }, + { + "given_name": "Anna", + "index": 11, + "raw_name": "Anna Berlin", + "role": "author", + "surname": "Berlin" + }, + { + "given_name": "Synnøve", + "index": 12, + "raw_name": "Synnøve Botnen", + "role": "author", + "surname": "Botnen" + }, + { + "given_name": "Sarah", + "index": 13, + "raw_name": "Sarah Bourlat", + "role": "author", + "surname": "Bourlat" + }, + { + "given_name": "Tanya", + "index": 14, + "raw_name": "Tanya Cheeke", + "role": "author", + "surname": "Cheeke" + }, + { + "given_name": "Bálint", + "index": 15, + "raw_name": "Bálint Dima", + "role": "author", + "surname": "Dima" + }, + { + "given_name": "Rein", + "index": 16, + "raw_name": "Rein Drenkhan", + "role": "author", + "surname": "Drenkhan" + }, + { + "given_name": "Camila", + "index": 17, + "raw_name": "Camila Duarte", + "role": "author", + "surname": "Duarte" + }, + { + "given_name": "Margarita", + "index": 18, + "raw_name": "Margarita Dueñas", + "role": "author", + "surname": "Dueñas" + }, + { + "given_name": "Ursula", + "index": 19, + "raw_name": "Ursula Eberhardt", + "role": "author", + "surname": "Eberhardt" + }, + { + "given_name": "Hanna", + "index": 20, + "raw_name": "Hanna Friberg", + "role": "author", + "surname": "Friberg" + }, + { + "given_name": "Tobias G.", + "index": 21, + "raw_name": "Tobias G. 
Frøslev", + "role": "author", + "surname": "Frøslev" + }, + { + "given_name": "Sigisfredo", + "index": 22, + "raw_name": "Sigisfredo Garnica", + "role": "author", + "surname": "Garnica" + }, + { + "given_name": "József", + "index": 23, + "raw_name": "József Geml", + "role": "author", + "surname": "Geml" + }, + { + "given_name": "Masoomeh", + "index": 24, + "raw_name": "Masoomeh Ghobad-Nejhad", + "role": "author", + "surname": "Ghobad-Nejhad" + }, + { + "given_name": "Tine", + "index": 25, + "raw_name": "Tine Grebenc", + "role": "author", + "surname": "Grebenc" + }, + { + "given_name": "Gareth W.", + "index": 26, + "raw_name": "Gareth W. Griffith", + "role": "author", + "surname": "Griffith" + }, + { + "given_name": "Felix", + "index": 27, + "raw_name": "Felix Hampe", + "role": "author", + "surname": "Hampe" + }, + { + "given_name": "Peter", + "index": 28, + "raw_name": "Peter Kennedy", + "role": "author", + "surname": "Kennedy" + }, + { + "given_name": "Maryia", + "index": 29, + "raw_name": "Maryia Khomich", + "role": "author", + "surname": "Khomich" + }, + { + "given_name": "Petr", + "index": 30, + "raw_name": "Petr Kohout", + "role": "author", + "surname": "Kohout" + }, + { + "given_name": "Anu", + "index": 31, + "raw_name": "Anu Kollom", + "role": "author", + "surname": "Kollom" + }, + { + "given_name": "Ellen", + "index": 32, + "raw_name": "Ellen Larsson", + "role": "author", + "surname": "Larsson" + }, + { + "given_name": "Irinyi", + "index": 33, + "raw_name": "Irinyi Laszlo", + "role": "author", + "surname": "Laszlo" + }, + { + "given_name": "Steven", + "index": 34, + "raw_name": "Steven Leavitt", + "role": "author", + "surname": "Leavitt" + }, + { + "given_name": "Kare", + "index": 35, + "raw_name": "Kare Liimatainen", + "role": "author", + "surname": "Liimatainen" + }, + { + "given_name": "Björn", + "index": 36, + "raw_name": "Björn Lindahl", + "role": "author", + "surname": "Lindahl" + }, + { + "given_name": "Deborah J.", + "index": 37, + "raw_name": 
"Deborah J. Lodge", + "role": "author", + "surname": "Lodge" + }, + { + "given_name": "Helge Thorsten", + "index": 38, + "raw_name": "Helge Thorsten Lumbsch", + "role": "author", + "surname": "Lumbsch" + }, + { + "given_name": "María Paz", + "index": 39, + "raw_name": "María Paz Martín Esteban", + "role": "author", + "surname": "Martín Esteban" + }, + { + "given_name": "Wieland", + "index": 40, + "raw_name": "Wieland Meyer", + "role": "author", + "surname": "Meyer" + }, + { + "given_name": "Otto", + "index": 41, + "raw_name": "Otto Miettinen", + "role": "author", + "surname": "Miettinen" + }, + { + "given_name": "Nhu", + "index": 42, + "raw_name": "Nhu Nguyen", + "role": "author", + "surname": "Nguyen" + }, + { + "given_name": "Tuula", + "index": 43, + "raw_name": "Tuula Niskanen", + "role": "author", + "surname": "Niskanen" + }, + { + "given_name": "Ryoko", + "index": 44, + "raw_name": "Ryoko Oono", + "role": "author", + "surname": "Oono" + }, + { + "given_name": "Maarja", + "index": 45, + "raw_name": "Maarja Öpik", + "role": "author", + "surname": "Öpik" + }, + { + "given_name": "Alexander", + "index": 46, + "raw_name": "Alexander Ordynets", + "role": "author", + "surname": "Ordynets" + }, + { + "given_name": "Julia", + "index": 47, + "raw_name": "Julia Pawłowska", + "role": "author", + "surname": "Pawłowska" + }, + { + "given_name": "Ursula", + "index": 48, + "raw_name": "Ursula Peintner", + "role": "author", + "surname": "Peintner" + }, + { + "given_name": "Olinto Liparini", + "index": 49, + "raw_name": "Olinto Liparini Pereira", + "role": "author", + "surname": "Pereira" + }, + { + "given_name": "Danilo Batista", + "index": 50, + "raw_name": "Danilo Batista Pinho", + "role": "author", + "surname": "Pinho" + }, + { + "given_name": "Kadri", + "index": 51, + "raw_name": "Kadri Põldmaa", + "role": "author", + "surname": "Põldmaa" + }, + { + "given_name": "Kadri", + "index": 52, + "raw_name": "Kadri Runnel", + "role": "author", + "surname": "Runnel" + }, + { + 
"given_name": "Martin", + "index": 53, + "raw_name": "Martin Ryberg", + "role": "author", + "surname": "Ryberg" + }, + { + "given_name": "Irja", + "index": 54, + "raw_name": "Irja Saar", + "role": "author", + "surname": "Saar" + }, + { + "given_name": "Kemal", + "index": 55, + "raw_name": "Kemal Sanli", + "role": "author", + "surname": "Sanli" + }, + { + "given_name": "James", + "index": 56, + "raw_name": "James Scott", + "role": "author", + "surname": "Scott" + }, + { + "given_name": "Viacheslav", + "index": 57, + "raw_name": "Viacheslav Spirin", + "role": "author", + "surname": "Spirin" + }, + { + "given_name": "Ave", + "index": 58, + "raw_name": "Ave Suija", + "role": "author", + "surname": "Suija" + }, + { + "given_name": "Sten", + "index": 59, + "raw_name": "Sten Svantesson", + "role": "author", + "surname": "Svantesson" + }, + { + "given_name": "Mariusz", + "index": 60, + "raw_name": "Mariusz Tadych", + "role": "author", + "surname": "Tadych" + }, + { + "given_name": "Susumu", + "index": 61, + "raw_name": "Susumu Takamatsu", + "role": "author", + "surname": "Takamatsu" + }, + { + "given_name": "Heidi", + "index": 62, + "raw_name": "Heidi Tamm", + "role": "author", + "surname": "Tamm" + }, + { + "given_name": "AFS.", + "index": 63, + "raw_name": "AFS. Taylor", + "role": "author", + "surname": "Taylor" + }, + { + "given_name": "Leho", + "index": 64, + "raw_name": "Leho Tedersoo", + "role": "author", + "surname": "Tedersoo" + }, + { + "given_name": "M.T.", + "index": 65, + "raw_name": "M.T. 
Telleria", + "role": "author", + "surname": "Telleria" + }, + { + "given_name": "Dhanushka", + "index": 66, + "raw_name": "Dhanushka Udayanga", + "role": "author", + "surname": "Udayanga" + }, + { + "given_name": "Martin", + "index": 67, + "raw_name": "Martin Unterseher", + "role": "author", + "surname": "Unterseher" + }, + { + "given_name": "Sergey", + "index": 68, + "raw_name": "Sergey Volobuev", + "role": "author", + "surname": "Volobuev" + }, + { + "given_name": "Michael", + "index": 69, + "raw_name": "Michael Weiss", + "role": "author", + "surname": "Weiss" + }, + { + "given_name": "Christian", + "index": 70, + "raw_name": "Christian Wurzbacher", + "role": "author", + "surname": "Wurzbacher" + }, + { + "raw_name": "Kessy Abarenkov" + }, + { + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" + } + ], + "ext_ids": { + "doi": "10.15156/bio/sh409843.07fu" + }, + "extra": { + "datacite": { + "license": [ + { + "rights": "Attribution-NonCommercial (CC BY-NC)", + "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0" + } + ], + "metadataVersion": 1, + "resourceType": "Dataset/UNITE Species Hypothesis", + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-3" + }, + "release_month": 10 + }, + "language": "en", + "license_slug": "CC-BY-NC", + "publisher": "UNITE Community", + "refs": [], + "release_date": "2014-10-05", + "release_stage": "published", + "release_type": "dataset", + "release_year": 2014, + "subtitle": "Gomphales", + "title": "SH409843.07FU" +} diff --git a/python/tests/files/datacite/datacite_result_06.json b/python/tests/files/datacite/datacite_result_06.json new file mode 100644 index 00000000..4f6cae94 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_06.json @@ -0,0 +1,29 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "Crispijn De Passe (Der Ältere) (1564-1637)", + "role": "author" + } + ], + "ext_ids": { + "doi": 
"10.16903/ethz-grs-d_006220" + }, + "extra": { + "datacite": { + "license": [ + { + "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0" + } + ], + "metadataVersion": 1, + "resourceTypeGeneral": "InteractiveResource", + "schemaVersion": "http://datacite.org/schema/kernel-3" + } + }, + "refs": [], + "release_type": "article", + "release_year": 1590, + "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"" +} diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json new file mode 100644 index 00000000..2f500925 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_07.json @@ -0,0 +1,76 @@ +{ + "abstracts": [ + { + "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.", + "lang": "en", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "E.", + "index": 0, + "raw_name": "E. 
ROTHUIZEN", + "role": "author", + "surname": "ROTHUIZEN" + }, + { + "given_name": "B.", + "index": 1, + "raw_name": "B. ELMEGAARD", + "role": "author", + "surname": "ELMEGAARD" + }, + { + "given_name": "B.", + "index": 2, + "raw_name": "B. MARKUSSEN W.", + "role": "author", + "surname": "MARKUSSEN W." + }, + { + "index": 3, + "raw_name": "Et Al.", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.18462/iir.icr.2015.0926" + }, + "extra": { + "datacite": { + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset", + "subjects": [ + { + "subject": "HEAT PUMP" + }, + { + "subject": "HOT WATER" + }, + { + "subject": "HEAT TRANSFER" + }, + { + "subject": "PERFORMANCE" + }, + { + "subject": "THERMAL STORAGE" + }, + { + "subject": "TANK" + }, + { + "subject": "MODEL" + } + ] + } + }, + "language": "en", + "publisher": "International Institute of Refrigeration (IIR)", + "refs": [], + "release_stage": "published", + "release_type": "dataset", + "release_year": 2015, + "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation." +} diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json new file mode 100644 index 00000000..70237280 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_08.json @@ -0,0 +1,57 @@ +{ + "abstracts": [ + { + "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. 
Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", + "lang": "en", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "Kei", + "index": 0, + "raw_name": "Kei Kajisa", + "role": "author", + "surname": "Kajisa" + }, + { + "given_name": "Kei", + "index": 1, + "raw_name": "Kei Kajisa", + "role": "author", + "surname": "Kajisa" + } + ], + "ext_ids": { + "doi": "10.22004/ag.econ.284864" + }, + "extra": { + "datacite": { + "metadataVersion": 1, + "resourceType": "Text", + "resourceTypeGeneral": "Text", + "subjects": [ + { + "subject": "Land Economics/Use" + }, + { + "subject": "irrigation", + "subjectScheme": "keyword" + }, + { + "subject": "industrialization", + "subjectScheme": "keyword" + }, + { + "subject": "collective action", + "subjectScheme": "keyword" + } + ] + } + }, + "language": "en", + "refs": [], + "release_type": "article-journal", + "release_year": 2017, + "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India" +} diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json new file mode 100644 index 00000000..79571360 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_09.json @@ -0,0 +1,51 @@ +{ + 
"abstracts": [], + "contribs": [ + { + "given_name": "Nils", + "index": 0, + "raw_name": "Nils Kirstaedter", + "role": "author", + "surname": "Kirstaedter" + }, + { + "extra": { + "organization": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover" + } + }, + { + "extra": { + "type": "DataManager" + }, + "raw_name": "Technische Informationsbibliothek (TIB)" + } + ], + "ext_ids": { + "doi": "10.2314/gbv:880813733" + }, + "extra": { + "datacite": { + "metadataVersion": 9, + "resourceType": "Report", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-4", + "subjects": [ + { + "subject": "Direktdiodenlasersysteme" + }, + { + "subject": "Physics", + "subjectScheme": "linsearch" + } + ] + } + }, + "language": "de", + "publisher": "[Lumics GmbH]", + "refs": [], + "release_stage": "published", + "release_type": "report", + "release_year": 2016, + "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht", + "version": "1.0" +} diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json new file mode 100644 index 00000000..1d39feb0 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_10.json @@ -0,0 +1,35 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "Unknown", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.25549/wpacards-m6171" + }, + "extra": { + "datacite": { + "resourceType": "Dataset", + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-4", + "subjects": [ + { + "subject": "housing areas" + }, + { + "subject": "Dwellings" + } + ] + } + }, + "language": "en", + "publisher": "University of Southern California Digital Library (USC.DL)", + "refs": [], + 
"release_stage": "published", + "release_type": "dataset", + "release_year": 2012, + "title": "WPA household census for 210 E VERNON, Los Angeles" +} diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json new file mode 100644 index 00000000..761a99c9 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_11.json @@ -0,0 +1,27 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "Comet Photo AG (Zürich)", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.3932/ethz-a-000055869" + }, + "extra": { + "datacite": { + "metadataVersion": 6, + "resourceTypeGeneral": "Image", + "schemaVersion": "http://datacite.org/schema/kernel-3" + } + }, + "language": "de", + "publisher": "ETH-Bibliothek Zürich, Bildarchiv", + "refs": [], + "release_stage": "published", + "release_type": "graphic", + "release_year": 1965, + "title": "N1 bei Safenwil" +} diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json new file mode 100644 index 00000000..4e966d6c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_12.json @@ -0,0 +1,49 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "Charalampos", + "index": 0, + "raw_name": "Charalampos Spanias", + "role": "author", + "surname": "Spanias" + }, + { + "given_name": "Pantelis T", + "index": 1, + "raw_name": "Pantelis T Nikolaidis", + "role": "author", + "surname": "Nikolaidis" + }, + { + "given_name": "Thomas", + "index": 2, + "raw_name": "Thomas Rosemann", + "role": "author", + "surname": "Rosemann" + }, + { + "given_name": "Beat", + "index": 3, + "raw_name": "Beat Knechtle", + "role": "author", + "surname": "Knechtle" + } + ], + "ext_ids": { + "doi": "10.5167/uzh-171449" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "Text" + }, + "release_month": 6 + }, + "publisher": "MDPI Publishing", + "refs": [], + "release_date": "2019-06-14", + "release_stage": 
"published", + "release_type": "article-journal", + "release_year": 2019, + "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review" +} diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json new file mode 100644 index 00000000..923f2ea8 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_13.json @@ -0,0 +1,36 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "O.M.", + "role": "author" + }, + { + "given_name": "Hermann", + "index": 1, + "raw_name": "Hermann Hiltbrunner", + "role": "author", + "surname": "Hiltbrunner" + } + ], + "ext_ids": { + "doi": "10.5169/seals-314104" + }, + "extra": { + "datacite": { + "metadataVersion": 17, + "resourceType": "Journal Article", + "resourceTypeGeneral": "Text", + "schemaVersion": "http://datacite.org/schema/kernel-3" + }, + "release_month": 10 + }, + "publisher": "Buchdruckerei Büchler & Co.", + "refs": [], + "release_date": "1940-10-05", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1940, + "title": "[Müssen wir des Glücks uns schämen?]" +} diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json new file mode 100644 index 00000000..2ce68d29 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_14.json @@ -0,0 +1,114 @@ +{ + "abstracts": [ + { + "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", + "lang": "en", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "E.", + "index": 0, + "raw_name": "E. 
Stulz", + "role": "author", + "surname": "Stulz" + }, + { + "given_name": "S.M.", + "index": 1, + "raw_name": "S.M. Scott", + "role": "author", + "surname": "Scott" + }, + { + "given_name": "Yiu-Fai", + "index": 2, + "raw_name": "Yiu-Fai Ng", + "role": "author", + "surname": "Ng" + }, + { + "given_name": "A.D.", + "index": 3, + "raw_name": "A.D. Bond", + "role": "author", + "surname": "Bond" + }, + { + "given_name": "S.J.", + "index": 4, + "raw_name": "S.J. Teat", + "role": "author", + "surname": "Teat" + }, + { + "given_name": "S.L.", + "index": 5, + "raw_name": "S.L. Darling", + "role": "author", + "surname": "Darling" + }, + { + "given_name": "N.", + "index": 6, + "raw_name": "N. Feeder", + "role": "author", + "surname": "Feeder" + }, + { + "given_name": "J.K.M.", + "index": 7, + "raw_name": "J.K.M. Sanders", + "role": "author", + "surname": "Sanders" + } + ], + "ext_ids": { + "doi": "10.5517/cc7gns3" + }, + "extra": { + "datacite": { + "metadataVersion": 2, + "relations": [ + { + "relatedIdentifier": "10.1021/ic034699w", + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo" + } + ], + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-3", + "subjects": [ + { + "subject": "Crystal Structure" + }, + { + "subject": "Experimental 3D Coordinates" + }, + { + "subject": "Crystal System" + }, + { + "subject": "Space Group" + }, + { + "subject": "Cell Parameters" + }, + { + "subject": "Crystallography" + }, + { + "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate" + } + ] + } + }, + "language": "en", + "publisher": "Cambridge Crystallographic Data Centre", + "refs": [], + "release_stage": "published", + "release_type": "entry", + "release_year": 2004, + "title": "CCDC 222635: Experimental Crystal 
Structure Determination" +} diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json new file mode 100644 index 00000000..5e7180c4 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_15.json @@ -0,0 +1,29 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "David", + "index": 0, + "raw_name": "David Richardson", + "role": "author", + "surname": "Richardson" + } + ], + "ext_ids": { + "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28" + }, + "extra": { + "datacite": { + "metadataVersion": 1, + "resourceType": "dataPackage", + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-2.2" + } + }, + "publisher": "Environmental Data Initiative", + "refs": [], + "release_stage": "published", + "release_type": "dataset", + "release_year": 2017, + "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997" +} diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json new file mode 100644 index 00000000..dc9d18af --- /dev/null +++ b/python/tests/files/datacite/datacite_result_16.json @@ -0,0 +1,34 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "Taha", + "index": 0, + "raw_name": "Taha Sochi", + "role": "author", + "surname": "Sochi" + } + ], + "ext_ids": { + "doi": "10.6084/m9.figshare.1282478" + }, + "extra": { + "datacite": { + "license": [ + { + "rights": "CC-BY", + "rightsUri": "http://creativecommons.org/licenses/by/3.0/us" + } + ], + "resourceType": "Paper", + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-3" + } + }, + "publisher": "Figshare", + "refs": [], + "release_stage": "published", + "release_type": "dataset", + "release_year": 2014, + "title": "Testing the Connectivity of Networks" +} diff --git a/python/tests/files/datacite/datacite_result_17.json 
b/python/tests/files/datacite/datacite_result_17.json new file mode 100644 index 00000000..0f768179 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_17.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_name": "Di Giovanna, Antonino Paolo (University Of Florence)", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.7910/dvn/tsqfwc/yytj22" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "Dataset", + "schemaVersion": "http://datacite.org/schema/kernel-4" + } + }, + "publisher": "Harvard Dataverse", + "refs": [], + "release_stage": "published", + "release_type": "dataset", + "release_year": 2018, + "title": "gel_BSA-FITC_Markov_segmntation0343.tif" +} diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json new file mode 100644 index 00000000..7f2d2792 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_18.json @@ -0,0 +1,21 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d81z522m" + }, + "extra": { + "datacite": { + "metadataVersion": 2, + "schemaVersion": "http://datacite.org/schema/kernel-3" + }, + "release_month": 8 + }, + "publisher": "Columbia University", + "refs": [], + "release_date": "2017-08-21", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064" +} diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json new file mode 100644 index 00000000..4ff00a56 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_19.json @@ -0,0 +1,21 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": { + "metadataVersion": 3, + "schemaVersion": "http://datacite.org/schema/kernel-3" + }, + "release_month": 8 + }, + "publisher": "Columbia University", + "refs": [], + 
"release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092" +} diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json new file mode 100644 index 00000000..5a6d3473 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_20.json @@ -0,0 +1,17 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "title": "<h1>Eastern questionnaire</h1>" +} diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json new file mode 100644 index 00000000..54c22538 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_21.json @@ -0,0 +1,18 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "language": "de", + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "title": "ABC" +} diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json new file mode 100644 index 00000000..913fbbb6 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_22.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "language": "de", + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + 
"release_year": 2017, + "title": "ABC" +} diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json new file mode 100644 index 00000000..0ac6a06d --- /dev/null +++ b/python/tests/files/datacite/datacite_result_23.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1-xxx" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "language": "de", + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "title": "ABC" +} diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json new file mode 100644 index 00000000..cd9898f9 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_24.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "subtitle": "DEF", + "title": "ABC" +} diff --git a/python/tests/files/datacite/datacite_result_25.json b/python/tests/files/datacite/datacite_result_25.json new file mode 100644 index 00000000..6a29e8de --- /dev/null +++ b/python/tests/files/datacite/datacite_result_25.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": 
"2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json new file mode 100644 index 00000000..267eb9c2 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_26.json @@ -0,0 +1,33 @@ +{ + "abstracts": [], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + }, + { + "extra": { + "type": "Editor" + }, + "given_name": "David", + "raw_name": "David Wemmer", + "surname": "Wemmer" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz Binary files differnew file mode 100644 index 00000000..28ea6e37 --- /dev/null +++ b/python/tests/files/datacite_1k_records.jsonl.gz diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl new file mode 100644 index 00000000..dba3e267 --- /dev/null +++ b/python/tests/files/datacite_sample.jsonl @@ -0,0 +1 @@ +{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) 
Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py new file mode 100644 index 00000000..5ad7ef2c --- /dev/null +++ b/python/tests/import_datacite.py @@ -0,0 +1,317 @@ +""" +Test datacite importer. 
+""" + +import collections +import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name +from fatcat_tools.transforms import entity_to_dict +from fixtures import api +import json + + +@pytest.fixture(scope="function") +def datacite_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=True) + +@pytest.fixture(scope="function") +def datacite_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=False) + +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 3 + + +def test_find_original_language_title(): + """ + Original language might be included, in various ways. 
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('defaults to None', {}, None), + Case('ignore unknown keys', {'broken': 'kv'}, None), + Case('just a title', {'title': 'Noise Reduction'}, None), + Case('same title should be ignored', { + 'title': 'Noise Reduction', + 'original_language_title': 'Noise Reduction' + }, None), + Case('empty subdict is ignored', { + 'title': 'Noise Reduction', + 'original_language_title': {}, + }, None), + Case('unknown subdict keys are ignored', { + 'title': 'Noise Reduction', + 'original_language_title': {'broken': 'kv'}, + }, None), + Case('original string', { + 'title': 'Noise Reduction', + 'original_language_title': 'Подавление шума', + }, 'Подавление шума'), + Case('language tag is ignored, since its broken', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': 'Noise Reduction' + }, + }, None), + Case('do not care about language', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': 'Rauschunterdrückung', + }, + }, 'Rauschunterdrückung'), + Case('ignore excessive questionmarks', { + 'title': 'Noise Reduction', + 'original_language_title': { + 'language': 'ja', + '__content__': '???? However', + }, + }, None), + ] + + for case in cases: + result = find_original_language_title(case.input) + assert result == case.result + +def test_parse_datacite_titles(): + """ + Given a list of titles, find title, original_language_title and subtitle. + Result is a 3-tuple of title, original_language_title, subtitle. 
+ """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('handle None', None, (None, None, None)), + Case('empty list', [], (None, None, None)), + Case('empty item', [{}], (None, None, None)), + Case('broken keys', [{'broken': 'kv'}], (None, None, None)), + Case('title only', [{'title': 'Total carbon dioxide'}], + ('Total carbon dioxide', None, None), + ), + Case('title and subtitle', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, subtitle order does not matter', [ + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Total carbon dioxide'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, first wins', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + ], + ('Total carbon dioxide', None, None), + ), + Case('multiple titles, plus sub', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('multiple titles, multiple subs', [ + {'title': 'Total carbon dioxide'}, + {'title': 'Meeting Heterogeneity'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + Case('title, original same as title, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Total carbon dioxide', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total 
carbon dioxide', None, 'Station TT043_7-9'), + ), + Case('title, original dict, sub', [ + {'title': 'Total carbon dioxide', 'original_language_title': { + '__content__': 'Всего углекислого газа', + }}, + {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, + ], + ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + ), + ] + + for case in cases: + result = parse_datacite_titles(case.input) + assert result == case.result, case.about + +def test_parse_datacite_dates(): + """ + Test datacite date parsing. + """ + Case = collections.namedtuple('Case', 'about input result') + cases = [ + Case('None is None', None, (None, None, None)), + Case('empty list is None', [], (None, None, None)), + Case('empty item is None', [{}], (None, None, None)), + Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), + Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), + Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), + Case('first with type', [ + {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} + ], (None, None, 2019)), + Case('full date', [ + {'date': '2019-12-01', 'dateType': 'Valid'}, + ], (datetime.date(2019, 12, 1), 12, 2019)), + Case('date type prio', [ + {'date': '2000-12-01', 'dateType': 'Valid'}, + {'date': '2010-01-01', 'dateType': 'Updated'}, + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('date type prio, Available > Updated', [ + {'date': '2010-01-01', 'dateType': 'Updated'}, + {'date': '2000-12-01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('allow different date formats, Available > Updated', [ + {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, + {'date': '2000-12-01T10:00:00Z', 
'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('allow fuzzy date formats, Available > Updated', [ + {'date': '2010', 'dateType': 'Updated'}, + {'date': '2000 Dec 01', 'dateType': 'Available'}, + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('fuzzy year only', [ + {'date': 'Year 2010', 'dateType': 'Issued'}, + ], (None, None, 2010)), + Case('fuzzy year and month', [ + {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, + ], (None, 2, 2010)), + Case('fuzzy year, month, day', [ + {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, + ], (datetime.date(2010, 2, 24), 2, 2010)), + Case('ignore broken date', [ + {'date': 'Febrrr 45', 'dateType': 'Updated'}, + ], (None, None, None)), + ] + for case in cases: + result = parse_datacite_dates(case.input) + assert result == case.result, case.about + +def test_datacite_importer(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "datacite" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = False + datacite_importer.reset() + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): + with 
open('tests/files/datacite_sample.jsonl', 'r') as f: + raw = json.load(f) + r = datacite_importer.parse_record(raw) + # ensure the API server is ok with format + JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + + print(r.extra) + assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert r.release_type == "article" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.original_title == None + assert r.ext_ids.doi == "10.18730/8dym9" + assert r.ext_ids.isbn13 == None + assert r.language == "en" + assert r.subtitle == None + assert r.release_date == None + assert r.release_year == 1986 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['datacite'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['datacite'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + assert len(r.abstracts) == 1 + assert len(r.abstracts[0].content) == 421 + assert len(r.contribs) == 2 + assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" + assert r.contribs[0].given_name == None + assert r.contribs[0].surname == None + assert len(r.refs) == 0 + +def test_datacite_conversions(datacite_importer): + """ + Datacite JSON to release entity JSON representation. The count is hardcoded + for now. 
+ """ + datacite_importer.debug = True + for i in range(27): + src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) + dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) + print('testing mapping from {} => {}'.format(src, dst)) + with open(src, 'r') as f: + re = datacite_importer.parse_record(json.load(f)) + result = entity_to_dict(re) + with open(dst, 'r') as f: + expected = json.loads(f.read()) + + assert result == expected, 'output mismatch in {}'.format(dst) + +def test_index_form_to_display_name(): + Case = collections.namedtuple('Case', 'input output') + cases = [ + Case('', ''), + Case('ABC', 'ABC'), + Case('International Space Station', 'International Space Station'), + Case('Jin, Shan', 'Shan Jin'), + Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), + Case('Solomon, P. M.', 'P. M. Solomon'), + Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), + Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + ] + + for c in cases: + assert c.output == index_form_to_display_name(c.input) + |