From 4a82a0763bf927248f22e47ab5187af4beff83ee Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 9 Dec 2019 01:03:43 +0100
Subject: datacite: importer skeleton

* contributors, title, date, publisher, container, license

Field and value analysis via https://github.com/miku/indigo.
---
 python/fatcat_import.py                   |  30 ++
 python/fatcat_tools/importers/__init__.py |   1 +
 python/fatcat_tools/importers/datacite.py | 458 ++++++++++++++++++++++++++++++
 python/tests/import_datacite.py           |  25 ++
 4 files changed, 514 insertions(+)
 create mode 100644 python/fatcat_tools/importers/datacite.py
 create mode 100644 python/tests/import_datacite.py

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8d82dab3..d7651792 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,17 @@ def run_cdl_dash_dat(args):
     print("fileset id: {}".format(fs.ident))
     print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
 
+def run_datacite(args):
+    dci = DataciteImporter(args.api,  # importer shared by both pusher branches below
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        bezerk_mode=args.bezerk_mode)
+    if args.kafka_mode:  # consume records from the "api-datacite" topic
+        KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite",
+            "fatcat-import", consume_batch_size=args.batch_size).run()
+    else:  # read JSON lines from the given file
+        JsonLinePusher(dci, args.json_file).run()
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -439,6 +450,25 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_datacite = subparsers.add_parser('datacite',
+        help="import datacite.org metadata")
+    sub_datacite.add_argument('json_file',
+        help="File with jsonlines from datacite.org v2 API to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_datacite.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_datacite.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_datacite.add_argument('--bezerk-mode',
+        action='store_true',
+        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.set_defaults(
+        func=run_datacite,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..faa8e2be
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,458 @@
+"""
+WIP: Importer for datacite.org data.
+
+Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
+"""
+
+from .common import EntityImporter
+import dateparser
+import datetime
+import fatcat_openapi_client
+import json
+import sys
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# TODO(martin): merge this with other maps, maybe.
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "", # 53 UrhG.
+    # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "",
+    # "http://www.springer.com/tdm": "",
+    # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html
+    # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "",
+    # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "",
+    # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "",
+    # "https://rightsstatements.org/page/InC/1.0?language=en": "",
+    # "https://services.ceda.ac.uk/cedasite/register/info": "",
+    # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404
+    # "https://www.cambridge.org/core/terms": "",
+    # "https://www.elsevier.com/tdm/userlicense/1.0",
+    # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights
+    # "info:eu-repo/semantics/embargoedAccess": "",
+    # "info:eu-repo/semantics/openAccess": "",
+}
+
+class DataciteImporter(EntityImporter):
+    """
+    Importer for datacite records. TODO(martin): Do we need issn_map_file?
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+        # pop (not get): both values are passed by keyword below; left in kwargs they would be passed twice.
+        eg_desc = kwargs.pop('editgroup_description',
+            "Automated import of Datacite DOI metadata, harvested from REST API")
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.create_containers = kwargs.get('create_containers', True)
+        self.read_issn_map_file(issn_map_file)
+
+    def parse_record(self, obj):
+        """
+        Map a datacite doc to a ReleaseEntity (None if it has no 'attributes'). TODO(martin): complete mapping.
+
+        WIP, notes:
+
+        * Many subjects, should they end up in extra?
+        * attributes.creators and attributes.contributors
+
+        $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
+        3963663 "Personal"
+        289795 null
+        8892 "Organizational"
+
+        Shall we use issued, available?
+
+          {
+            "date": "2011-11-18",
+            "dateType": "Accepted"
+          },
+          {
+            "date": "2011-11-18",
+            "dateType": "Available"
+          },
+          {
+            "date": "2011-11-07",
+            "dateType": "Copyrighted"
+          },
+          {
+            "date": "2011-11-18",
+            "dateType": "Issued"
+          },
+          {
+            "date": "2011-11-07",
+            "dateType": "Issued"
+          }
+
+        TODO(martin): Quick analysis of dates and stages.
+        """
+
+        if 'attributes' not in obj:
+            return None
+
+        attributes = obj['attributes']
+
+        # > Contributors
+        #
+        #  "attributes.creators[].contributorType": [
+        #    "author"
+        #  ],
+        #  "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
+        #    "LCNA",
+        #    "GND",
+        #    "email",
+        #    "NAF",
+        #    "OSF",
+        #    "RRID",
+        #    "ORCID",
+        #    "SCOPUS",
+        #    "NRCPID",
+        #    "schema.org",
+        #    "GRID",
+        #    "MGDS",
+        #    "VIAF",
+        #    "JACoW-ID"
+        #  ],
+        #
+        #    "https://orcid.org/0000-0002-9902-738X",
+        #    "http://jacow.org/JACoW-00001280",
+        #    "Wiebe_Peter",
+        #    "https://osf.io/https://osf.io/kjfuy/",
+        #    "http://www.viaf.org176549220",
+        #    "2239",
+        #    "Jeffries_Martin",
+        #    "https://orcid.org/0000-0002-1493-6630",
+        #    "0000-0002-6233-612X",
+        #
+        # "creators": [
+        #   {
+        #     "name": "Bögli, Hans",
+        #     "nameType": "Personal",
+        #     "givenName": "Hans",
+        #     "familyName": "Bögli",
+        #     "affiliation": []
+        #   }
+        # ],
+
+        contribs = []
+        # 'creators', 'nameIdentifiers' and the scheme/identifier values may be missing or null.
+        for i, c in enumerate(attributes.get('creators') or []):
+            if not c.get('nameType') == 'Personal':
+                continue
+            creator_id = None
+            for nid in c.get('nameIdentifiers') or []:
+                if (nid.get('nameIdentifierScheme') or '').lower() != "orcid":
+                    continue
+                orcid = (nid.get('nameIdentifier') or '').replace('https://orcid.org/', '')
+                if not orcid:
+                    continue
+                creator_id = self.lookup_orcid(orcid)
+                # If creator_id is None, should we create creators?
+            contribs.append(fatcat_openapi_client.ReleaseContrib(
+                creator_id=creator_id,
+                index=i,
+                raw_name=c.get('name'),
+                given_name=c.get('givenName'),
+                surname=c.get('familyName'),
+            ))
+
+        # > Title
+        #
+        #   "attributes.titles[].titleType": [
+        #     "AlternativeTitle",
+        #     "Other",
+        #     "Subtitle",
+        #     null,
+        #     "TranslatedTitle"
+        #   ],
+        title, subtitle = None, None
+        # The main title has an absent or null titleType; the title text itself may be null.
+        for entry in attributes.get('titles') or []:
+            if not title and entry.get('titleType') is None:
+                title = (entry.get('title') or '').strip() or None
+            if entry.get('titleType') == 'Subtitle':
+                subtitle = (entry.get('title') or '').strip() or None
+
+        # > Dates
+        #
+        #  "attributes.dates[].dateType": [
+        #    "Accepted",
+        #    "Available"
+        #    "Collected",
+        #    "Copyrighted",
+        #    "Created",
+        #    "Issued",
+        #    "Submitted",
+        #    "Updated",
+        #    "Valid",
+        #  ],
+        #
+        # Different documents have different dates defined. Choose the topmost
+        # available from prio list.
+        date_type_prio = (
+            'Valid',
+            'Issued',
+            'Available',
+            'Accepted',
+            'Submitted',
+            'Copyrighted',
+            'Collected',
+            'Created',
+            'Updated',
+        )
+
+        release_year, release_date = None, None
+        dates = attributes.get('dates', []) or [] # Never be None.
+        for prio in date_type_prio:
+            for item in dates:
+                if item.get('dateType') != prio or not item.get('date'):
+                    continue
+                result = dateparser.parse(item.get('date'))
+                if result is None:
+                    # Unparsable date.
+                    continue
+                if not 1000 < result.year < datetime.date.today().year + 5:
+                    # Skip possibly bogus dates.
+                    continue
+                release_date = result.date()  # keep only the calendar date, drop the time part
+                release_year = result.year
+                break
+            else:
+                continue
+            break
+
+        # > Publisher
+        #
+        # A few NA values. A few bogus values.
+        #
+        publisher = attributes.get('publisher')
+
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
+            publisher = None
+        if publisher is not None and len(publisher) > 80:
+            # Arbitrary magic value, TODO(martin): better heuristic.
+            # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
+            # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
+            # TODO(martin): log misses.
+            publisher = None
+
+        # > Container
+        #
+        # For the moment, only ISSN as container.
+        #
+        #    "container": {
+        #      "type": "Journal",
+        #      "issue": "8",
+        #      "title": "Angewandte Chemie International Edition",
+        #      "volume": "57",
+        #      "lastPage": "2080",
+        #      "firstPage": "2077",
+        #      "identifier": "14337851",
+        #      "identifierType": "ISSN"
+        #    },
+        #
+        # "attributes.container.type": [
+        #   "DataRepository",
+        #   "Journal",
+        #   "Series",
+        #   "Book Series"
+        # ],
+        #
+        #  "attributes.container.identifierType": [
+        #    "Handle",
+        #    "ISBN",
+        #    "LISSN",
+        #    "DOI",
+        #    "EISSN",
+        #    "URL",
+        #    "ISSN"
+        #  ],
+
+        container_id = None
+        container = attributes.get('container', {}) or {}
+        if container.get('type') in CONTAINER_TYPE_MAP.keys():
+            container_type = CONTAINER_TYPE_MAP.get(container['type'])
+            if container.get('identifier') and container.get('identifierType') == 'ISSN':
+                issn = container.get('identifier')
+                if len(issn) == 8:
+                    issn = issn[:4] + "-" + issn[4:]
+                issnl = self.issn2issnl(issn)
+                container_id = self.lookup_issnl(issnl)
+
+                if container_id is None and container.get('title'):
+                    ce = fatcat_openapi_client.ContainerEntity(
+                        issnl=issnl,
+                        container_type=container_type,
+                        name=container.get('title'),
+                    )
+                    ce_edit = self.create_container(ce)
+                    container_id = ce_edit.ident
+                    self._issnl_id_map[issnl] = container_id
+
+        # > License
+        #
+        # attributes.rightsList[].rightsUri
+        # attributes.rightsList[].rights
+        # attributes.rightsList[].lang
+        #
+
+        license_slug = None
+        license_extra = []
+        for rights in attributes.get('rightsList') or []:  # the key may be present with a null value
+            slug = lookup_license_slug(rights.get('rightsUri'))
+            if slug:
+                license_slug = slug
+            license_extra.append(rights)
+
+        # > Release type.
+        #
+        # Datacite has some fine granular typing (e.g. "Supplementary
+        # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
+        #
+        # Additional, coarse: resourceTypeGeneral
+        #
+        #  "attributes.types.resourceTypeGeneral": [
+        #    "Image",
+        #    "Dataset",
+        #    "PhysicalObject",
+        #    "Collection",
+        #    "Text",
+        #    "Sound",
+        #    "InteractiveResource",
+        #    "Event",
+        #    "Software",
+        #    "Other",
+        #    "Workflow",
+        #    "Audiovisual"
+        #  ],
+
+        # > Extra information.
+        extra, extra_datacite = dict(), dict()
+        if license_extra:
+            extra_datacite['license'] = license_extra
+
+        if extra_datacite:
+            extra['datacite'] = extra_datacite
+
+        # https://guide.fatcat.wiki/entity_release.html
+        re = fatcat_openapi_client.ReleaseEntity(
+            work_id=None,
+            container_id=container_id,
+            release_type=None,
+            release_stage=None,
+            title=title, # attributes.titles, various titleType
+            subtitle=subtitle,
+            original_title=title, # AlternativeTitle?
+            release_year=release_year, # publicationYear
+            release_date=release_date, # date issues/available?
+            publisher=publisher, # attributes.publisher
+            ext_ids=fatcat_openapi_client.ReleaseExtIds(
+                doi=attributes.get('doi'), # attributes.doi,
+                # Can we add handle.net link?
+            ),
+            contribs=contribs,
+            volume=None,
+            issue=None,
+            pages=None,
+            language=None,
+            abstracts=None,
+            refs=None,
+            extra=extra,
+            license_slug=license_slug,
+        )
+        return re
+
+    def try_update(self, re, debug=True):
+        if debug is True:
+            # WIP: dump the parsed release as JSON and return None instead of deciding on an update.
+            print(json.dumps(re.to_dict(), default=extended_encoder))
+            return
+        return False
+
+    def insert_batch(self, batch):
+        # Debugging: print each item and return, so nothing is sent to the API.
+        for item in batch:
+            print(item)
+        return
+
+        # Orig. (unreachable while the debugging return above is in place)
+        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+def extended_encoder(value):
+    """
+    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
+    """
+    if isinstance(value, (datetime.datetime, datetime.date)):
+        return value.isoformat()  # dates and datetimes become ISO 8601 strings
+    if isinstance(value, set):
+        return list(value)  # any other type falls through and implicitly returns None
+
+def lookup_license_slug(raw):
+    """
+    Map a raw license URL to a slug via LICENSE_SLUG_MAP (None if falsy or unknown). TODO(martin): reuse from crossref, maybe.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().replace('http://', '//').replace('https://', '//')  # make the URL scheme-relative
+    if 'creativecommons.org' in raw.lower():
+        raw = raw.lower()  # NOTE: map keys containing uppercase can never match a lowercased CC URL
+        raw = raw.replace('/legalcode', '/').replace('/uk', '')
+        if not raw.endswith('/'):
+            raw = raw + '/'  # NOTE: CC URLs always end in '/' here, so only map keys ending in '/' can match
+    return LICENSE_SLUG_MAP.get(raw)
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
new file mode 100644
index 00000000..0bbaba2e
--- /dev/null
+++ b/python/tests/import_datacite.py
@@ -0,0 +1,25 @@
+"""
+Test datacite importer.
+
+Datacite is an aggregator, hence inputs are quite varied.
+
+Here is a small sample of ID types taken from a sample:
+
+    497344 "DOI"
+     65013 "URL"
+     22210 "CCDC"
+     17853 "GBIF"
+     17635 "Other"
+     11474 "uri"
+      9170 "Publisher ID"
+      7775 "URN"
+      6196 "DUCHAS"
+      5624 "Handle"
+      5056 "publisherId"
+
+A nice, not yet existing tool (maybe named indigo) would do the following:
+
+    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
+
+TODO(martin): Write tests.
+"""
-- 
cgit v1.2.3


From 68a051abc45103f21284163d13c8893c31b4e8e4 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 16 Dec 2019 19:32:54 +0100
Subject: datacite: basic field mappings

Currently using two external libraries:

* dateparser
* langcodes

Note: This commit includes lots of wip docs and field stat in comment,
which should be removed.
---
 python/fatcat_tools/importers/datacite.py | 222 ++++++++++++++++++++++++------
 1 file changed, 181 insertions(+), 41 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index faa8e2be..e486ba90 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -6,6 +6,7 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 
 from .common import EntityImporter
 import dateparser
+import langcodes
 import datetime
 import fatcat_openapi_client
 import json
@@ -18,36 +19,132 @@ CONTAINER_TYPE_MAP = {
     'Book Series': 'book-series',
 }
 
+# The docs/guide should be the cannonical home for these mappings; update there
+# first.
+#
+# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
+#
+#   count   |   release_type
+# ----------+-------------------
+#  95030004 | article-journal
+#  13477878 | chapter
+#   5926811 | paper-conference
+#   2169642 | article
+#   1806415 | dataset
+#   1548614 | book
+#   1390304 |
+#    818351 | report
+#    815684 | entry
+#    307998 | standard
+#    297769 | thesis
+#    261426 | letter
+#    148093 | post
+#    122736 | editorial
+#     99225 | stub
+#     96219 | review-book
+#     22854 | peer_review
+#     19078 | interview
+#     16278 | article-newspaper
+#      3973 | speech
+#      3536 | legal_case
+#      2264 | abstract
+#      1626 | legislation
+#      1053 | retraction
+#        85 | component
+# (25 rows)
+#
+# Map various datacite type types to CSL-ish types. None means TODO or remove.
+DATACITE_TYPE_MAP = {
+    'ris': {
+        'THES': 'thesis',
+        'SOUND': None,
+        'CHAP': 'chapter',
+        'FIGURE': None,
+        'RPRT': 'report',
+        'JOUR': 'article-journal',
+        'MPCT': None,
+        'GEN': None,
+        'BOOK': 'book',
+        'DATA': 'dataset',
+        'COMP': None,
+    },
+    'schemaOrg': {
+        'Dataset': 'dataset',
+        'Book': 'book',
+        'ScholarlyArticle': 'article',
+        'ImageObject': 'graphic',
+        'Collection': None,
+        'MediaObject': None,
+        'Event': None,
+        'SoftwareSourceCode': None,
+        'Chapter': 'chapter',
+        'CreativeWork': None,
+        'PublicationIssue': 'article',
+        'AudioObject': None,
+        'Thesis': 'thesis',
+    },
+    'citeproc': {
+        'dataset': 'dataset',
+        'chapter': 'chapter',
+        'article-journal': 'article-journal',
+        'song': 'song',
+        'article': 'article',
+        'report': 'report',
+        'graphic': 'graphic',
+        'thesis': 'thesis',
+        'book': 'book',
+    },
+    'bibtex': {
+        'phdthesis': 'thesis',
+        'inbook': 'chapter',
+        'misc': None,
+        'article': 'article-journal',
+        'book': 'book',
+    },
+    'resourceTypeGeneral': {
+        'Image': None,
+        'Dataset': 'dataset',
+        'PhysicalObject': None,
+        'Collection': None,
+        'Text': None,
+        'Sound': None,
+        'InteractiveResource': None,
+        'Event': None,
+        'Software': None,
+        'Other': None,
+        'Workflow': None,
+        'Audiovisual': None,
+    }
+}
+
+
 # TODO(martin): merge this with other maps, maybe.
 LICENSE_SLUG_MAP = {
-    "//creativecommons.org/licenses/by/2.0": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
     "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
-    "//creativecommons.org/licenses/by/3.0": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
     "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/deed.de": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/deed.en_US": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
-    "//creativecommons.org/licenses/by-nc/2.0": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
     "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
     "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-ND",
     "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
     "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
     "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
     "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
-    "//creativecommons.org/licenses/CC-BY/4.0": "CC-BY",
-    "//creativecommons.org/licenses/publicdomain/zero/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
     "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
     "//opensource.org/licenses/MIT": "MIT",
     "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
@@ -75,6 +172,7 @@ LICENSE_SLUG_MAP = {
     # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights
     # "info:eu-repo/semantics/embargoedAccess": "",
     # "info:eu-repo/semantics/openAccess": "",
+    # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
 }
 
 class DataciteImporter(EntityImporter):
@@ -302,12 +400,12 @@ class DataciteImporter(EntityImporter):
         #      "identifierType": "ISSN"
         #    },
         #
-        # "attributes.container.type": [
-        #   "DataRepository",
-        #   "Journal",
-        #   "Series",
-        #   "Book Series"
-        # ],
+        #  "attributes.container.type": [
+        #    "DataRepository",
+        #    "Journal",
+        #    "Series",
+        #    "Book Series"
+        #  ],
         #
         #  "attributes.container.identifierType": [
         #    "Handle",
@@ -318,6 +416,7 @@ class DataciteImporter(EntityImporter):
         #    "URL",
         #    "ISSN"
         #  ],
+        #
 
         container_id = None
         container = attributes.get('container', {}) or {}
@@ -328,17 +427,18 @@ class DataciteImporter(EntityImporter):
                 if len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
                 issnl = self.issn2issnl(issn)
-                container_id = self.lookup_issnl(issnl)
-
-                if container_id is None and container.get('title'):
-                    ce = fatcat_openapi_client.ContainerEntity(
-                        issnl=issnl,
-                        container_type=container_type,
-                        name=container.get('title'),
-                    )
-                    ce_edit = self.create_container(ce)
-                    container_id = ce_edit.ident
-                    self._issnl_id_map[issnl] = container_id
+                if issnl is not None:
+                    container_id = self.lookup_issnl(issnl)
+
+                    if container_id is None and container.get('title'):
+                        ce = fatcat_openapi_client.ContainerEntity(
+                            issnl=issnl,
+                            container_type=container_type,
+                            name=container.get('title'),
+                        )
+                        ce_edit = self.create_container(ce)
+                        container_id = ce_edit.ident
+                        self._issnl_id_map[issnl] = container_id
 
         # > License
         #
@@ -376,11 +476,51 @@ class DataciteImporter(EntityImporter):
         #    "Workflow",
         #    "Audiovisual"
         #  ],
+        #  "attributes.types.citeproc": [
+        #    "dataset",
+        #    "chapter",
+        #    "article-journal",
+        #    "song",
+        #    "article",
+        #    "report",
+        #    "graphic",
+        #    "thesis",
+        #    "book"
+        #  ],
+        #
+        # There is RIS, also.
+
+        # attributes.types.resourceType contains too many things for now.
+        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+            release_type = attributes.get('types', {}).get(typeType)
+            if release_type is not None:
+                break
+
+        # TODO(martin): Skip unmapped release_type entirely?
+        if release_type is None:
+            print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
+
+        # > Language.
+        # attributes.language
+
+        language = None
+        value = attributes.get('language', '') or '' # As it is written.
+        try:
+            language = langcodes.find(value).language
+        except LookupError:
+            try:
+                language = langcodes.get(value).language
+            except langcodes.tag_parser.LanguageTagError:
+                pass
 
-        # > Extra information.
+        # > Extra information: license, subjects, ...
         extra, extra_datacite = dict(), dict()
         if license_extra:
-            extra_datacite['license'] = license_extra
+            extra_datacite = {
+                'license': license_extra,
+            }
+        if attributes.get('subjects'):
+            extra_datacite['subjects'] = attributes.get('subjects', [])
 
         if extra_datacite:
             extra['datacite'] = extra_datacite
@@ -389,7 +529,7 @@ class DataciteImporter(EntityImporter):
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            release_type=None,
+            release_type=release_type,
             release_stage=None,
             title=title, # attributes.titles, various titleType
             subtitle=subtitle,
@@ -405,7 +545,7 @@ class DataciteImporter(EntityImporter):
             volume=None,
             issue=None,
             pages=None,
-            language=None,
+            language=language,
             abstracts=None,
             refs=None,
             extra=extra,
-- 
cgit v1.2.3


From 76d6d4d2de6580ae147e40c43c18f04cc48b62ec Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 17 Dec 2019 17:38:45 +0100
Subject: datacite: add missing mappings and notes

---
 python/fatcat_tools/importers/datacite.py | 441 ++++++++++++------------------
 1 file changed, 175 insertions(+), 266 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e486ba90..4e117dde 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,5 +1,5 @@
 """
-WIP: Importer for datacite.org data.
+Prototype Importer for datacite.org data.
 
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
@@ -8,9 +8,11 @@ from .common import EntityImporter
 import dateparser
 import langcodes
 import datetime
+import langdetect
 import fatcat_openapi_client
 import json
 import sys
+import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -20,40 +22,8 @@ CONTAINER_TYPE_MAP = {
 }
 
 # The docs/guide should be the cannonical home for these mappings; update there
-# first.
-#
-# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
-#
-#   count   |   release_type
-# ----------+-------------------
-#  95030004 | article-journal
-#  13477878 | chapter
-#   5926811 | paper-conference
-#   2169642 | article
-#   1806415 | dataset
-#   1548614 | book
-#   1390304 |
-#    818351 | report
-#    815684 | entry
-#    307998 | standard
-#    297769 | thesis
-#    261426 | letter
-#    148093 | post
-#    122736 | editorial
-#     99225 | stub
-#     96219 | review-book
-#     22854 | peer_review
-#     19078 | interview
-#     16278 | article-newspaper
-#      3973 | speech
-#      3536 | legal_case
-#      2264 | abstract
-#      1626 | legislation
-#      1053 | retraction
-#        85 | component
-# (25 rows)
-#
-# Map various datacite type types to CSL-ish types. None means TODO or remove.
+# first.  Map various datacite type types to CSL-ish types. None means TODO or
+# remove.
 DATACITE_TYPE_MAP = {
     'ris': {
         'THES': 'thesis',
@@ -197,91 +167,17 @@ class DataciteImporter(EntityImporter):
 
     def parse_record(self, obj):
         """
-        TODO(martin): Map datacite to RE.
-
-        WIP, notes:
-
-        * Many subjects, should they end up in extra?
-        * attributes.creators and attributes.contributors
-
-        $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
-        3963663 "Personal"
-        289795 null
-        8892 "Organizational"
-
-        Shall we use issued, available?
-
-          {
-            "date": "2011-11-18",
-            "dateType": "Accepted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Available"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Copyrighted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Issued"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Issued"
-          }
-
-        TODO(martin): Quick analysis of dates and stages.
+        Mapping datacite JSON to ReleaseEntity.
         """
-
         if 'attributes' not in obj:
             return None
 
         attributes = obj['attributes']
 
-        # > Contributors
-        #
-        #  "attributes.creators[].contributorType": [
-        #    "author"
-        #  ],
-        #  "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
-        #    "LCNA",
-        #    "GND",
-        #    "email",
-        #    "NAF",
-        #    "OSF",
-        #    "RRID",
-        #    "ORCID",
-        #    "SCOPUS",
-        #    "NRCPID",
-        #    "schema.org",
-        #    "GRID",
-        #    "MGDS",
-        #    "VIAF",
-        #    "JACoW-ID"
-        #  ],
-        #
-        #    "https://orcid.org/0000-0002-9902-738X",
-        #    "http://jacow.org/JACoW-00001280",
-        #    "Wiebe_Peter",
-        #    "https://osf.io/https://osf.io/kjfuy/",
-        #    "http://www.viaf.org176549220",
-        #    "2239",
-        #    "Jeffries_Martin",
-        #    "https://orcid.org/0000-0002-1493-6630",
-        #    "0000-0002-6233-612X",
-        #
-        # "creators": [
-        #   {
-        #     "name": "Bögli, Hans",
-        #     "nameType": "Personal",
-        #     "givenName": "Hans",
-        #     "familyName": "Bögli",
-        #     "affiliation": []
-        #   }
-        # ],
-
+        # Contributors. Many nameIdentifierSchemes, we do not use yet:
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
+        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
+        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
@@ -304,15 +200,8 @@ class DataciteImporter(EntityImporter):
                 surname=c.get('familyName'),
             ))
 
-        # > Title
-        #
-        #   "attributes.titles[].titleType": [
-        #     "AlternativeTitle",
-        #     "Other",
-        #     "Subtitle",
-        #     null,
-        #     "TranslatedTitle"
-        #   ],
+        # Title, may come with "attributes.titles[].titleType", like
+        # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
         title, subtitle = None, None
 
         for entry in attributes.get('titles', []):
@@ -321,22 +210,13 @@ class DataciteImporter(EntityImporter):
             if entry.get('titleType') == 'Subtitle':
                 subtitle = entry.get('title').strip()
 
-        # > Dates
-        #
-        #  "attributes.dates[].dateType": [
-        #    "Accepted",
-        #    "Available"
-        #    "Collected",
-        #    "Copyrighted",
-        #    "Created",
-        #    "Issued",
-        #    "Submitted",
-        #    "Updated",
-        #    "Valid",
-        #  ],
-        #
-        # Different documents have different dates defined. Choose the topmost
-        # available from prio list.
+        # Dates. A few internal dates (registered, created, updated) and
+        # published (0..2554). We try to work with typed date list, in
+        # "attributes.dates[].dateType", values: "Accepted", "Available"
+        # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
+        # "Updated", "Valid".
+        release_year, release_date = None, None
+
         date_type_prio = (
             'Valid',
             'Issued',
@@ -348,14 +228,16 @@ class DataciteImporter(EntityImporter):
             'Created',
             'Updated',
         )
-
-        release_year, release_date = None, None
         for prio in date_type_prio:
             dates = attributes.get('dates', []) or [] # Never be None.
             for item in dates:
                 if not item.get('dateType') == prio:
                     continue
-                result = dateparser.parse(item.get('date'))
+                try:
+                    result = dateparser.parse(item.get('date'))
+                except TypeError as err:
+                    print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
+                    continue
                 if result is None:
                     # Unparsable date.
                     continue
@@ -369,56 +251,23 @@ class DataciteImporter(EntityImporter):
                 continue
             break
 
-        # > Publisher
-        #
-        # A few NA values. A few bogus values.
-        #
+        # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
             publisher = None
         if publisher is not None and len(publisher) > 80:
-            # Arbitrary magic value, TODO(martin): better heuristic.
-            # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
-            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
-            # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
-            # TODO(martin): log misses.
+            # Arbitrary magic value max length. TODO(martin): better heuristic,
+            # but factored out; first we have to log misses. Example:
+            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
+            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
+            # werden"
             publisher = None
 
-        # > Container
-        #
-        # For the moment, only ISSN as container.
-        #
-        #    "container": {
-        #      "type": "Journal",
-        #      "issue": "8",
-        #      "title": "Angewandte Chemie International Edition",
-        #      "volume": "57",
-        #      "lastPage": "2080",
-        #      "firstPage": "2077",
-        #      "identifier": "14337851",
-        #      "identifierType": "ISSN"
-        #    },
-        #
-        #  "attributes.container.type": [
-        #    "DataRepository",
-        #    "Journal",
-        #    "Series",
-        #    "Book Series"
-        #  ],
-        #
-        #  "attributes.container.identifierType": [
-        #    "Handle",
-        #    "ISBN",
-        #    "LISSN",
-        #    "DOI",
-        #    "EISSN",
-        #    "URL",
-        #    "ISSN"
-        #  ],
-        #
-
+        # Container. For the moment, only ISSN as container.
         container_id = None
+
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
@@ -440,142 +289,202 @@ class DataciteImporter(EntityImporter):
                         container_id = ce_edit.ident
                         self._issnl_id_map[issnl] = container_id
 
-        # > License
-        #
-        # attributes.rightsList[].rightsUri
-        # attributes.rightsList[].rights
-        # attributes.rightsList[].lang
-        #
+        # Volume and issue.
+        volume = container.get('volume')
+        issue = container.get('issue')
+
+        # Pages.
+        pages = None
+
+        first_page = container.get('firstPage')
+        last_page = container.get('lastPage')
+
+        if first_page and last_page:
+            try:
+                int(first_page) < int(last_page)
+                pages = '{}-{}'.format(first_page, last_page)
+            except ValueError as err:
+                print(err, file=sys.stderr)
+                pass
+
+        if not pages and first_page:
+            pages = first_page
 
+        # License.
         license_slug = None
         license_extra = []
+
         for l in attributes.get('rightsList', []):
             slug = lookup_license_slug(l.get('rightsUri'))
             if slug:
                 license_slug = slug
             license_extra.append(l)
 
-        # > Release type.
-        #
-        # Datacite has some fine granular typing (e.g. "Supplementary
-        # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
-        #
-        # Additional, coarse: resourceTypeGeneral
-        #
-        #  "attributes.types.resourceTypeGeneral": [
-        #    "Image",
-        #    "Dataset",
-        #    "PhysicalObject",
-        #    "Collection",
-        #    "Text",
-        #    "Sound",
-        #    "InteractiveResource",
-        #    "Event",
-        #    "Software",
-        #    "Other",
-        #    "Workflow",
-        #    "Audiovisual"
-        #  ],
-        #  "attributes.types.citeproc": [
-        #    "dataset",
-        #    "chapter",
-        #    "article-journal",
-        #    "song",
-        #    "article",
-        #    "report",
-        #    "graphic",
-        #    "thesis",
-        #    "book"
-        #  ],
-        #
-        # There is RIS, also.
-
-        # attributes.types.resourceType contains too many things for now.
+        # Release type. Try to determine the release type from a variety of
+        # types supplied in datacite. The "attributes.types.resourceType"
+        # contains too many (176 in sample) things for now; citeproc may be the
+        # closest, but not always supplied.
         for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
-            release_type = attributes.get('types', {}).get(typeType)
+            value = attributes.get('types', {}).get(typeType)
+            release_type = DATACITE_TYPE_MAP.get(value)
             if release_type is not None:
                 break
 
-        # TODO(martin): Skip unmapped release_type entirely?
         if release_type is None:
             print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
 
-        # > Language.
-        # attributes.language
-
+        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
+        # "other", ...). Try to crush it with langcodes: "It may sound to you
+        # like langcodes solves a pretty boring problem. At one level, that's
+        # right. Sometimes you have a boring problem, and it's great when a
+        # library solves it for you." -- TODO(martin): We need more of these.
         language = None
-        value = attributes.get('language', '') or '' # As it is written.
+
+        value = attributes.get('language', '') or ''
         try:
             language = langcodes.find(value).language
         except LookupError:
             try:
                 language = langcodes.get(value).language
             except langcodes.tag_parser.LanguageTagError:
+                print('could not determine language: {}'.format(value), file=sys.stderr)
+
+        # Abstracts appear in "attributes.descriptions[].descriptionType", some
+        # of the observed values: "Methods", "TechnicalInfo",
+        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
+        # "Other" fields might contain references or related articles (with
+        # DOI). TODO(martin): maybe try to parse out some of those refs.
+        abstracts = []
+
+        for desc in attributes.get('descriptions', []):
+            if not desc.get('descriptionType') == 'Abstract':
+                continue
+            if len(desc.get('description', '')) < 10:
+                continue
+            text = desc.get('description')
+            sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+            lang = None
+            try:
+                lang = langdetect.detect(text)
+            except langdetect.lang_detect_exception.LangDetectException:
                 pass
+            abstracts.append(fatcat_openapi_client.ReleaseAbstract(
+                mimetype="text/plain",
+                content=text,
+                sha1=sha1,
+                lang=lang,
+            ))
+
+        # References and relations. Datacite include many relation types in
+        # "attributes.relatedIdentifiers[].relationType", e.g.
+        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
+        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
+        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
+        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
+        # "IsDerivedFrom", "IsSourceOf".
+        #
+        # For the moment, we only care about References.
+        refs, ref_index = [], 0
+
+        for rel in attributes.get('relatedIdentifiers', []):
+            if not rel.get('relationType') == 'References':
+                continue
+            ref_extra = dict()
+            if rel.get('relatedIdentifierType') == 'DOI':
+                ref_extra['doi'] = rel.get('relatedIdentifier')
+            if not ref_extra:
+                ref_extra = None
+            refs.append(fatcat_openapi_client.ReleaseRef(
+                index=ref_index,
+                extra=ref_extra,
+            ))
+            ref_index += 1
+
+        # Start with clear stages, e.g. published. TODO(martin): we could
+        # probably infer a bit more from the relations, e.g.
+        # "IsPreviousVersionOf" or "IsNewVersionOf".
+        release_stage = None
+        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+            release_stage = 'published'
+
+        # Extra information.
+        extra_datacite = dict()
 
-        # > Extra information: license, subjects, ...
-        extra, extra_datacite = dict(), dict()
         if license_extra:
-            extra_datacite = {
-                'license': license_extra,
-            }
+            extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
-            extra_datacite['subjects'] = attributes.get('subjects', [])
+            extra_datacite['subjects'] = attributes['subjects']
+        if attributes.get('url'):
+            extra_datacite['url'] = attributes['url']
+
+        extra = dict()
 
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
-        # https://guide.fatcat.wiki/entity_release.html
+        # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
             release_type=release_type,
-            release_stage=None,
-            title=title, # attributes.titles, various titleType
+            release_stage=release_stage,
+            title=title,
             subtitle=subtitle,
-            original_title=title, # AlternativeTitle?
-            release_year=release_year, # publicationYear
-            release_date=release_date, # date issues/available?
-            publisher=publisher, # attributes.publisher
+            original_title=title,
+            release_year=release_year,
+            release_date=release_date,
+            publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=attributes.get('doi'), # attributes.doi,
-                # Can we add handle.net link?
+                doi=attributes.get('doi'),
             ),
             contribs=contribs,
-            volume=None,
-            issue=None,
-            pages=None,
+            volume=volume,
+            issue=issue,
+            pages=pages,
             language=language,
-            abstracts=None,
-            refs=None,
+            abstracts=abstracts,
+            refs=refs,
             extra=extra,
             license_slug=license_slug,
         )
         return re
 
     def try_update(self, re, debug=True):
+        """
+        When debug is true, write the RE to stdout.
+        """
         if debug is True:
-            # print(type(re))
-            print(json.dumps(re.to_dict(), default=extended_encoder))
-            return
-        return False
+            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            return False
 
-    def insert_batch(self, batch):
-        # Debugging.
-        for item in batch:
-            print(item)
-        return
+        # lookup existing DOI (don't need to try other ext idents for crossref)
+        existing = None
+        try:
+            existing = self.api.lookup_release(doi=re.ext_ids.doi)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # doesn't exist, need to update
+            return True
 
-        # Orig.
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing:
+            self.counts['exists'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
                 description=self.editgroup_description,
                 extra=self.editgroup_extra),
             entity_list=batch))
 
-def extended_encoder(value):
+def extended_json_encoder(value):
     """
-    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
     value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
     """
     if isinstance(value, (datetime.datetime, datetime.date)):
@@ -585,7 +494,7 @@ def extended_encoder(value):
 
 def lookup_license_slug(raw):
     """
-    TODO(martin): reuse from crossref, maybe.
+    TODO(martin): reuse from or combine with crossref, maybe.
     """
     if not raw:
         return None
-- 
cgit v1.2.3


From 403b1a2d4591d878145a021a7c1e15e2d60c47d8 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 18 Dec 2019 20:21:49 +0100
Subject: improve datacite field mapping and import

The current version successfully imported a random sample of 100000
records (0.5%) from datacite.

The --debug (write JSON to stdout) and --insert-log-file (log batch
before committing to db) flags are temporarily added to help with
debugging.

Add a few unit tests.

Some edge cases:

a) Existing keys without a value require a slightly awkward idiom:

```
titles = attributes.get('titles', []) or []
```

b) There can be 0, 1, or more (first one wins) titles.

c) Date handling is probably not ideal. Datacite has a potentially
fine-grained list of dates.

The test case (tests/files/datacite_sample.jsonl) refers to
https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has date (main
descriptor) 1986. The datacite record contains: 2017 (publicationYear,
probably the year of record creation with reference system), 1978-06-03
(collected, e.g. experimental sample), 1986 ("Accepted"). The online
version of the resource knows even one more date (2019-06-05 10:14:43 by
WIEWS update).
---
 python/fatcat_import.py                         |  15 +-
 python/fatcat_tools/importers/datacite.py       | 180 ++++++++++++++++++------
 python/tests/files/datacite_1k_records.jsonl.gz | Bin 0 -> 684605 bytes
 python/tests/files/datacite_sample.jsonl        |   1 +
 python/tests/import_datacite.py                 | 108 +++++++++++---
 5 files changed, 245 insertions(+), 59 deletions(-)
 create mode 100644 python/tests/files/datacite_1k_records.jsonl.gz
 create mode 100644 python/tests/files/datacite_sample.jsonl

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index d7651792..90bb01a1 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -170,7 +170,10 @@ def run_datacite(args):
     dci = DataciteImporter(args.api,
         args.issn_map_file,
         edit_batch_size=args.batch_size,
-        bezerk_mode=args.bezerk_mode)
+        bezerk_mode=args.bezerk_mode,
+        debug=args.debug,
+        lang_detect=args.lang_detect,
+        insert_log_file=args.insert_log_file)
     if args.kafka_mode:
         KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",
             "fatcat-import", consume_batch_size=args.batch_size).run()
@@ -464,6 +467,16 @@ def main():
     sub_datacite.add_argument('--bezerk-mode',
         action='store_true',
         help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.add_argument('--debug',
+        action='store_true',
+        help="write converted JSON to stdout")
+    sub_datacite.add_argument('--lang-detect',
+        action='store_true',
+        help="try to detect language (slow)")
+    sub_datacite.add_argument('--insert-log-file',
+        default='',
+        type=str,
+        help="write inserted documents into file (for debugging)")
     sub_datacite.set_defaults(
         func=run_datacite,
         auth_var="FATCAT_API_AUTH_TOKEN",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4e117dde..9774e334 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 
 from .common import EntityImporter
 import dateparser
-import langcodes
 import datetime
-import langdetect
 import fatcat_openapi_client
+import hashlib
 import json
+import langcodes
+import langdetect
+import sqlite3
 import sys
-import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = {
 
 class DataciteImporter(EntityImporter):
     """
-    Importer for datacite records. TODO(martin): Do we need issn_map_file?
+    Importer for datacite records.
     """
 
-    def __init__(self, api, issn_map_file, **kwargs):
+    def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
+                 insert_log_file=None, **kwargs):
 
         eg_desc = kwargs.get('editgroup_description',
             "Automated import of Datacite DOI metadata, harvested from REST API")
@@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter):
             **kwargs)
 
         self.create_containers = kwargs.get('create_containers', True)
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri), file=sys.stderr)
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map", file=sys.stderr)
+
         self.read_issn_map_file(issn_map_file)
+        self.debug = debug
+        self.lang_detect = lang_detect
+        self.insert_log_file = insert_log_file
+
+        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+
+    def lookup_ext_ids(self, doi):
+        """
+        Return dictionary of identifiers refering to the same things as the given DOI.
+        """
+        if self.extid_map_db is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            [doi.lower()]).fetchone()
+        if row is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = [str(cell or '') or None for cell in row]
+        return dict(
+            core_id=row[0],
+            pmid=row[1],
+            pmcid=row[2],
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
 
     def parse_record(self, obj):
         """
@@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter):
 
         attributes = obj['attributes']
 
-        # Contributors. Many nameIdentifierSchemes, we do not use yet:
-        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
-        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
-        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
-            if not c.get('nameType') == 'Personal':
+            if 'nameType' in c and not c.get('nameType') == 'Personal':
                 continue
             creator_id = None
             for nid in c.get('nameIdentifiers', []):
@@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter):
                 if not orcid:
                     continue
                 creator_id = self.lookup_orcid(orcid)
-                # If creator_id is None, should we create creators?
+                # TODO(martin): If creator_id is None, should we create creators?
             contribs.append(fatcat_openapi_client.ReleaseContrib(
                 creator_id=creator_id,
                 index=i,
@@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter):
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
         title, subtitle = None, None
 
-        for entry in attributes.get('titles', []):
-            if not title and 'titleType' not in entry:
-                title = entry.get('title').strip()
-            if entry.get('titleType') == 'Subtitle':
-                subtitle = entry.get('title').strip()
+        titles = attributes.get('titles', []) or []
+        if len(titles) == 0:
+            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+            return False
+        elif len(titles) == 1:
+            # We do not care about the type then.
+            title = titles[0].get('title', '') or ''
+            title = title.strip()
+        else:
+            for entry in titles:
+                if not title and ('titleType' not in entry or not entry.get('titleType')):
+                    title = entry.get('title').strip()
+                if entry.get('titleType') == 'Subtitle':
+                    subtitle = entry.get('title', '').strip()
+
+        if not title:
+            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+            return False
+
+        if not subtitle:
+            subtitle = None
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter):
         # "Updated", "Valid".
         release_year, release_date = None, None
 
+        # Ignore: Collected, Issued.
         date_type_prio = (
             'Valid',
-            'Issued',
             'Available',
             'Accepted',
             'Submitted',
             'Copyrighted',
-            'Collected',
             'Created',
             'Updated',
         )
@@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter):
             for item in dates:
                 if not item.get('dateType') == prio:
                     continue
-                try:
-                    result = dateparser.parse(item.get('date'))
-                except TypeError as err:
-                    print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
-                    continue
+
+                # Parse out date, use common patterns first, fallback to dateparser.
+                result, value, year_only = None, item.get('date', ''), False
+
+                # Before using (expensive) dateparser, try a few common patterns.
+                common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')
+
+                for pattern in common_patterns:
+                    try:
+                        result = datetime.datetime.strptime(value, pattern)
+                    except ValueError:
+                        continue
+                    else:
+                        if pattern == '%Y':
+                            year_only = True
+                        break
+
+                if result is None:
+                    print('fallback for {}'.format(value), file=sys.stderr)
+                    try:
+                        result = dateparser.parse(value)
+                    except TypeError as err:
+                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+                        continue
+
                 if result is None:
                     # Unparsable date.
                     continue
-                release_date = result
+                if not year_only:
+                    release_date = result.date()
                 release_year = result.year
                 if 1000 < release_year < datetime.date.today().year + 5:
                     # Skip possibly bogus dates.
@@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter):
                     container_id = self.lookup_issnl(issnl)
 
                     if container_id is None and container.get('title'):
+                        container_title = container.get('title')
+                        if isinstance(container_title, list):
+                            if len(container_title) > 0:
+                                print('too many container titles: {}'.format(len(container_title)))
+                                container_title = container_title[0]
+                        assert isinstance(container_title, str)
                         ce = fatcat_openapi_client.ContainerEntity(
                             issnl=issnl,
                             container_type=container_type,
-                            name=container.get('title'),
+                            name=container_title,
                         )
                         ce_edit = self.create_container(ce)
                         container_id = ce_edit.ident
@@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter):
         # closest, but not always supplied.
         for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
             value = attributes.get('types', {}).get(typeType)
-            release_type = DATACITE_TYPE_MAP.get(value)
+            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
                 break
 
         if release_type is None:
-            print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
+            print("no mapped type: {}".format(value), file=sys.stderr)
 
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter):
             try:
                 language = langcodes.get(value).language
             except langcodes.tag_parser.LanguageTagError:
-                print('could not determine language: {}'.format(value), file=sys.stderr)
+                pass
 
         # Abstracts appear in "attributes.descriptions[].descriptionType", some
         # of the observed values: "Methods", "TechnicalInfo",
@@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter):
         # "Other" fields might contain references or related articles (with
         # DOI). TODO(martin): maybe try to parse out some of those refs.
         abstracts = []
-
-        for desc in attributes.get('descriptions', []):
+        descs = attributes.get('descriptions', []) or []
+        for desc in descs:
             if not desc.get('descriptionType') == 'Abstract':
                 continue
             if len(desc.get('description', '')) < 10:
@@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter):
             text = desc.get('description')
             sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
             lang = None
-            try:
-                lang = langdetect.detect(text)
-            except langdetect.lang_detect_exception.LangDetectException:
-                pass
+            if self.lang_detect:
+                try:
+                    lang = langdetect.detect(text)
+                except langdetect.lang_detect_exception.LangDetectException as err:
+                    print('language detection failed: {}'.format(err), file=sys.stderr)
             abstracts.append(fatcat_openapi_client.ReleaseAbstract(
                 mimetype="text/plain",
                 content=text,
@@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter):
         # For the moment, we only care about References.
         refs, ref_index = [], 0
 
-        for rel in attributes.get('relatedIdentifiers', []):
+        relIds = attributes.get('relatedIdentifiers', []) or []
+        for rel in relIds:
             if not rel.get('relationType') == 'References':
                 continue
             ref_extra = dict()
@@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter):
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
+        doi = attributes.get('doi', '').lower()
+        extids = self.lookup_ext_ids(doi=doi)
+
         # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter):
             release_date=release_date,
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=attributes.get('doi'),
+                doi=doi,
+                pmid=extids['pmid'],
+                pmcid=extids['pmcid'],
+                wikidata_qid=extids['wikidata_qid'],
+                core=extids['core_id'],
+                arxiv=extids['arxiv_id'],
+                jstor=extids['jstor_id'],
             ),
             contribs=contribs,
             volume=volume,
@@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter):
         )
         return re
 
-    def try_update(self, re, debug=True):
+    def try_update(self, re):
         """
-        When debug is true, write the RE to stdout.
+        When debug is true, write the RE to stdout, not to the database. Might
+        hide schema mismatch bugs.
         """
-        if debug is True:
+        if self.debug is True:
             print(json.dumps(re.to_dict(), default=extended_json_encoder))
             return False
 
@@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter):
         return True
 
     def insert_batch(self, batch):
+        print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+        if self.insert_log_file:
+            with open(self.insert_log_file, 'a') as f:
+                for doc in batch:
+                    json.dump(doc.to_dict(), f, default=extended_json_encoder)
+                    f.write('\n')
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
+            description=self.editgroup_description,
+            extra=self.editgroup_extra),
             entity_list=batch))
 
 def extended_json_encoder(value):
@@ -491,6 +588,7 @@ def extended_json_encoder(value):
         return value.isoformat()
     if isinstance(value, set):
         return list(value)
+    raise TypeError('cannot encode type: {}'.format(type(value)))
 
 def lookup_license_slug(raw):
     """
diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz
new file mode 100644
index 00000000..28ea6e37
Binary files /dev/null and b/python/tests/files/datacite_1k_records.jsonl.gz differ
diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl
new file mode 100644
index 00000000..dba3e267
--- /dev/null
+++ b/python/tests/files/datacite_sample.jsonl
@@ -0,0 +1 @@
+{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) 
Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 0bbaba2e..9c542fc6 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -1,25 +1,99 @@
 """
 Test datacite importer.
+"""
 
-Datacite is a aggregator, hence inputs are quite varied.
+import datetime
+import pytest
+import gzip
+from fatcat_tools.importers import DataciteImporter, JsonLinePusher
+from fixtures import api
+import json
 
-Here is small sample of ID types taken from a sample:
 
-    497344 "DOI"
-     65013 "URL"
-     22210 "CCDC"
-     17853 "GBIF"
-     17635 "Other"
-     11474 "uri"
-      9170 "Publisher ID"
-      7775 "URN"
-      6196 "DUCHAS"
-      5624 "Handle"
-      5056 "publisherId"
+@pytest.fixture(scope="function")
+def datacite_importer(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+                               bezerk_mode=True)
 
-A nice tool, not yet existing tool (maybe named indigo) would do the following:
+@pytest.fixture(scope="function")
+def datacite_importer_existing(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+                               bezerk_mode=False)
 
-    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
 
-TODO(martin): Write tests.
-"""
+@pytest.mark.skip(reason="larger datacite import slows tests down")
+def test_datacite_importer_huge(datacite_importer):
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+        datacite_importer.bezerk_mode = True
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 998
+    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 3
+
+
+def test_datacite_importer(datacite_importer):
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        datacite_importer.bezerk_mode = True
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 1
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "datacite" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        datacite_importer.bezerk_mode = False
+        datacite_importer.reset()
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 1
+    assert counts['skip'] == 0
+    assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
+
+def test_datacite_dict_parse(datacite_importer):
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        raw = json.load(f)
+        r = datacite_importer.parse_record(raw)
+        # ensure the API server is ok with format
+        JsonLinePusher(datacite_importer, [json.dumps(raw)]).run()
+
+        print(r.extra)
+        assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+        assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+        assert r.release_type == "article"
+        assert r.release_stage == "published"
+        assert r.license_slug == None
+        assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+        assert r.ext_ids.doi == "10.18730/8dym9"
+        assert r.ext_ids.isbn13 == None
+        assert r.language == "enc"
+        assert r.subtitle == None
+        assert r.release_date == None
+        assert r.release_year == 1986
+        assert 'subtitle' not in r.extra
+        assert 'subtitle' not in r.extra['datacite']
+        assert 'funder' not in r.extra
+        assert 'funder' not in r.extra['datacite']
+        # matched by ISSN, so shouldn't be in there
+        #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+        assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9'
+        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+        assert len(r.abstracts) == 1
+        assert len(r.abstracts[0].content) == 421
+        assert len(r.contribs) == 1
+        assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
+        assert r.contribs[0].given_name == None
+        assert r.contribs[0].surname == None
+        assert len(r.refs) == 0
-- 
cgit v1.2.3


From 52eabd48658a676ac4577d1c8da31df1fe58093e Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 20 Dec 2019 17:43:08 +0100
Subject: datacite: move common date patterns out of the loop

Additionally, try the least specific pattern (%Y) last.
---
 python/fatcat_tools/importers/datacite.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 9774e334..77ce1012 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -280,6 +280,10 @@ class DataciteImporter(EntityImporter):
             'Created',
             'Updated',
         )
+
+        # Before using (expensive) dateparser, try a few common patterns.
+        common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y')
+
         for prio in date_type_prio:
             dates = attributes.get('dates', []) or [] # Never be None.
             for item in dates:
@@ -289,9 +293,6 @@ class DataciteImporter(EntityImporter):
                 # Parse out date, use common patterns first, fallback to dateparser.
                 result, value, year_only = None, item.get('date', ''), False
 
-                # Before using (expensive) dateparser, try a few common patterns.
-                common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')
-
                 for pattern in common_patterns:
                     try:
                         result = datetime.datetime.strptime(value, pattern)
-- 
cgit v1.2.3


From a196435a0e88f85785742cdd089344f97401b43a Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 21 Dec 2019 23:30:56 +0100
Subject: address first round of MR14 comments

* add missing langdetect
* use entity_to_dict for json debug output
* factor out per-field code into functions and add table-driven tests
* update citeproc types
* add author as default role
* add raw_affiliation
* include relations from datacite
* remove url (covered by doi already)

Using yapf for python formatting.
---
 python/Pipfile                            |   1 +
 python/Pipfile.lock                       |   7 +
 python/fatcat_tools/importers/datacite.py | 467 ++++++++++++++++++++----------
 python/tests/import_datacite.py           | 178 +++++++++++-
 4 files changed, 503 insertions(+), 150 deletions(-)

(limited to 'python')

diff --git a/python/Pipfile b/python/Pipfile
index dfb87514..6325c180 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -49,6 +49,7 @@ elasticsearch-dsl = ">=6.0.0,<7.0.0"
 elasticsearch = ">=6.0.0,<7.0.0"
 langcodes = ">=1.4"
 dateparser = ">=0.7"
+langdetect = "*"
 
 [requires]
 # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index b6e066b5..f0f60aa8 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -306,6 +306,13 @@
             "index": "pypi",
             "version": "==1.4.1"
         },
+        "langdetect": {
+            "hashes": [
+                "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
+            ],
+            "index": "pypi",
+            "version": "==1.0.7"
+        },
         "loginpass": {
             "hashes": [
                 "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 77ce1012..19b89edf 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -14,6 +14,7 @@ import langcodes
 import langdetect
 import sqlite3
 import sys
+from fatcat_tools.transforms import entity_to_dict
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -55,16 +56,42 @@ DATACITE_TYPE_MAP = {
         'Thesis': 'thesis',
     },
     'citeproc': {
-        'dataset': 'dataset',
-        'chapter': 'chapter',
-        'article-journal': 'article-journal',
-        'song': 'song',
         'article': 'article',
-        'report': 'report',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
         'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
         'thesis': 'thesis',
-        'book': 'book',
-    },
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
     'bibtex': {
         'phdthesis': 'thesis',
         'inbook': 'chapter',
@@ -88,7 +115,6 @@ DATACITE_TYPE_MAP = {
     }
 }
 
-
 # TODO(martin): merge this with other maps, maybe.
 LICENSE_SLUG_MAP = {
     "//creativecommons.org/licenses/by/2.0/": "CC-BY",
@@ -124,7 +150,8 @@ LICENSE_SLUG_MAP = {
     "//www.karger.com/Services/SiteLicenses": "KARGER",
     "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
     "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
-    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/EUPL-1.1":
+    "EUPL-1.1",  # redirects to EUPL-1.2
     "//www.opensource.org/licenses/MIT": "MIT",
     # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
     # "http://rsc.li/journals-terms-of-use": "RSC",
@@ -146,23 +173,31 @@ LICENSE_SLUG_MAP = {
     # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
 }
 
+
 class DataciteImporter(EntityImporter):
     """
     Importer for datacite records.
     """
-
-    def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
-                 insert_log_file=None, **kwargs):
-
-        eg_desc = kwargs.get('editgroup_description',
-            "Automated import of Datacite DOI metadata, harvested from REST API")
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 debug=False,
+                 lang_detect=False,
+                 insert_log_file=None,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of Datacite DOI metadata, harvested from REST API"
+        )
         eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
+        eg_extra['agent'] = eg_extra.get('agent',
+                                         'fatcat_tools.DataciteImporter')
         super().__init__(api,
-            issn_map_file=issn_map_file,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+                         issn_map_file=issn_map_file,
+                         editgroup_description=eg_desc,
+                         editgroup_extra=eg_extra,
+                         **kwargs)
 
         self.create_containers = kwargs.get('create_containers', True)
         extid_map_file = kwargs.get('extid_map_file')
@@ -179,18 +214,31 @@ class DataciteImporter(EntityImporter):
         self.lang_detect = lang_detect
         self.insert_log_file = insert_log_file
 
-        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+        print('datacite with debug={}, lang_detect={}'.format(
+            self.debug, self.lang_detect),
+              file=sys.stderr)
 
     def lookup_ext_ids(self, doi):
         """
         Return dictionary of identifiers refering to the same things as the given DOI.
         """
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
+        row = self.extid_map_db.execute(
+            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
@@ -206,6 +254,8 @@ class DataciteImporter(EntityImporter):
         """
         Mapping datacite JSON to ReleaseEntity.
         """
+        if not obj or not isinstance(obj, dict):
+            return None
         if 'attributes' not in obj:
             return None
 
@@ -218,43 +268,54 @@ class DataciteImporter(EntityImporter):
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
-            if 'nameType' in c and not c.get('nameType') == 'Personal':
-                continue
-            creator_id = None
-            for nid in c.get('nameIdentifiers', []):
-                if not nid.get('nameIdentifierScheme').lower() == "orcid":
+            nameType = c.get('nameType', '') or ''
+            if nameType == 'Personal':
+                creator_id = None
+                for nid in c.get('nameIdentifiers', []):
+                    if not nid.get('nameIdentifierScheme').lower() == "orcid":
+                        continue
+                    orcid = nid.get('nameIdentifier',
+                                    '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = None
+                if len(affiliations) == 0:
+                    raw_affiliation = None
+                else:
+                    raw_affiliation = affiliations[0]
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=c.get('name'),
+                        given_name=c.get('givenName'),
+                        surname=c.get('familyName'),
+                        role='author',
+                        raw_affiliation=raw_affiliation,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name == 'NN':
                     continue
-                orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
-                if not orcid:
+                if len(name) < 3:
                     continue
-                creator_id = self.lookup_orcid(orcid)
-                # TODO(martin): If creator_id is None, should we create creators?
-            contribs.append(fatcat_openapi_client.ReleaseContrib(
-                creator_id=creator_id,
-                index=i,
-                raw_name=c.get('name'),
-                given_name=c.get('givenName'),
-                surname=c.get('familyName'),
-            ))
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                print('unknown name type: {}'.format(nameType), file=sys.stderr)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
-        title, subtitle = None, None
-
         titles = attributes.get('titles', []) or []
-        if len(titles) == 0:
-            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
-            return False
-        elif len(titles) == 1:
-            # We do not care about the type then.
-            title = titles[0].get('title', '') or ''
-            title = title.strip()
-        else:
-            for entry in titles:
-                if not title and ('titleType' not in entry or not entry.get('titleType')):
-                    title = entry.get('title').strip()
-                if entry.get('titleType') == 'Subtitle':
-                    subtitle = entry.get('title', '').strip()
+        title, original_language_title, subtitle = parse_datacite_titles(
+            titles)
 
         if not title:
             print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
@@ -268,67 +329,14 @@ class DataciteImporter(EntityImporter):
         # "attributes.dates[].dateType", values: "Accepted", "Available"
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
-        release_year, release_date = None, None
-
-        # Ignore: Collected, Issued.
-        date_type_prio = (
-            'Valid',
-            'Available',
-            'Accepted',
-            'Submitted',
-            'Copyrighted',
-            'Created',
-            'Updated',
-        )
-
-        # Before using (expensive) dateparser, try a few common patterns.
-        common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y')
-
-        for prio in date_type_prio:
-            dates = attributes.get('dates', []) or [] # Never be None.
-            for item in dates:
-                if not item.get('dateType') == prio:
-                    continue
-
-                # Parse out date, use common patterns first, fallback to dateparser.
-                result, value, year_only = None, item.get('date', ''), False
-
-                for pattern in common_patterns:
-                    try:
-                        result = datetime.datetime.strptime(value, pattern)
-                    except ValueError:
-                        continue
-                    else:
-                        if pattern == '%Y':
-                            year_only = True
-                        break
-
-                if result is None:
-                    print('fallback for {}'.format(value), file=sys.stderr)
-                    try:
-                        result = dateparser.parse(value)
-                    except TypeError as err:
-                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
-                        continue
-
-                if result is None:
-                    # Unparsable date.
-                    continue
-                if not year_only:
-                    release_date = result.date()
-                release_year = result.year
-                if 1000 < release_year < datetime.date.today().year + 5:
-                    # Skip possibly bogus dates.
-                    continue
-                break
-            else:
-                continue
-            break
+        release_date, release_year = parse_datacite_dates(
+            attributes.get('dates', []))
 
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
+                         '(:none)'):
             publisher = None
         if publisher is not None and len(publisher) > 80:
             # Arbitrary magic value max length. TODO(martin): better heuristic,
@@ -345,7 +353,8 @@ class DataciteImporter(EntityImporter):
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
-            if container.get('identifier') and container.get('identifierType') == 'ISSN':
+            if container.get('identifier') and container.get(
+                    'identifierType') == 'ISSN':
                 issn = container.get('identifier')
                 if len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
@@ -357,7 +366,8 @@ class DataciteImporter(EntityImporter):
                         container_title = container.get('title')
                         if isinstance(container_title, list):
                             if len(container_title) > 0:
-                                print('too many container titles: {}'.format(len(container_title)))
+                                print('too many container titles: {}'.format(
+                                    len(container_title)))
                                 container_title = container_title[0]
                         assert isinstance(container_title, str)
                         ce = fatcat_openapi_client.ContainerEntity(
@@ -404,7 +414,8 @@ class DataciteImporter(EntityImporter):
         # types supplied in datacite. The "attributes.types.resourceType"
         # contains too many (176 in sample) things for now; citeproc may be the
         # closest, but not always supplied.
-        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
+                         'bibtex', 'ris'):
             value = attributes.get('types', {}).get(typeType)
             release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
@@ -442,19 +453,19 @@ class DataciteImporter(EntityImporter):
             if len(desc.get('description', '')) < 10:
                 continue
             text = desc.get('description')
-            sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
             lang = None
             if self.lang_detect:
                 try:
                     lang = langdetect.detect(text)
                 except langdetect.lang_detect_exception.LangDetectException as err:
-                    print('language detection failed: {}'.format(err), file=sys.stderr)
-            abstracts.append(fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain",
-                content=text,
-                sha1=sha1,
-                lang=lang,
-            ))
+                    print('language detection failed: {}'.format(err),
+                          file=sys.stderr)
+            abstracts.append(
+                fatcat_openapi_client.ReleaseAbstract(
+                    mimetype="text/plain",
+                    content=text,
+                    lang=lang,
+                ))
 
         # References and relations. Datacite include many relation types in
         # "attributes.relatedIdentifiers[].relationType", e.g.
@@ -476,17 +487,19 @@ class DataciteImporter(EntityImporter):
                 ref_extra['doi'] = rel.get('relatedIdentifier')
             if not ref_extra:
                 ref_extra = None
-            refs.append(fatcat_openapi_client.ReleaseRef(
-                index=ref_index,
-                extra=ref_extra,
-            ))
+            refs.append(
+                fatcat_openapi_client.ReleaseRef(
+                    index=ref_index,
+                    extra=ref_extra,
+                ))
             ref_index += 1
 
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
         release_stage = None
-        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+        if attributes.get(
+                'state') == 'findable' or attributes.get('isActive') is True:
             release_stage = 'published'
 
         # Extra information.
@@ -496,8 +509,22 @@ class DataciteImporter(EntityImporter):
             extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
-        if attributes.get('url'):
-            extra_datacite['url'] = attributes['url']
+
+        # Include certain relations from relatedIdentifiers. Keeping the
+        # original structure of data here, which is a list of dicts, with
+        # relation type, identifier and identifier type (mostly).
+        relations = []
+        for rel in relIds:
+            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
+                                           'IsVariantFormOf', 'IsSupplementTo',
+                                           'HasVersion', 'IsMetadataFor',
+                                           'IsNewVersionOf', 'IsIdenticalTo',
+                                           'IsVersionOf', 'IsDerivedFrom',
+                                           'IsSourceOf'):
+                relations.append(rel)
+
+        if relations:
+            extra_datacite['relations'] = relations
 
         extra = dict()
 
@@ -515,7 +542,7 @@ class DataciteImporter(EntityImporter):
             release_stage=release_stage,
             title=title,
             subtitle=subtitle,
-            original_title=title,
+            original_title=original_language_title,
             release_year=release_year,
             release_date=release_date,
             publisher=publisher,
@@ -546,7 +573,7 @@ class DataciteImporter(EntityImporter):
         hide schema mismatch bugs.
         """
         if self.debug is True:
-            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            print(json.dumps(entity_to_dict(re, api_client=None)))
             return False
 
         # lookup existing DOI (don't need to try other ext idents for crossref)
@@ -572,24 +599,15 @@ class DataciteImporter(EntityImporter):
         if self.insert_log_file:
             with open(self.insert_log_file, 'a') as f:
                 for doc in batch:
-                    json.dump(doc.to_dict(), f, default=extended_json_encoder)
+                    json.dump(entity_to_dict(doc, api_client=None), f)
                     f.write('\n')
-        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-            description=self.editgroup_description,
-            extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_release_auto_batch(
+            fatcat_openapi_client.ReleaseAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description,
+                    extra=self.editgroup_extra),
+                entity_list=batch))
 
-def extended_json_encoder(value):
-    """
-    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
-    value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
-    """
-    if isinstance(value, (datetime.datetime, datetime.date)):
-        return value.isoformat()
-    if isinstance(value, set):
-        return list(value)
-    raise TypeError('cannot encode type: {}'.format(type(value)))
 
 def lookup_license_slug(raw):
     """
@@ -604,3 +622,156 @@ def lookup_license_slug(raw):
         if not raw.endswith('/'):
             raw = raw + '/'
     return LICENSE_SLUG_MAP.get(raw)
+
+
+def find_original_language_title(item, min_length=4, max_questionmarks=3):
+    """
+    Return a plausible original language title for a datacite title item, or
+    None. Candidates equal to the title, shorter than min_length (plain string
+    form only) or with more than max_questionmarks "?" are rejected.
+    """
+    if 'original_language_title' not in item:
+        return None
+    title = item.get('title')
+    if not title:
+        return None
+    candidate = item.get('original_language_title')
+    if isinstance(candidate, str) and candidate != title:
+        too_short = len(candidate) < min_length
+        if too_short or candidate.count('?') > max_questionmarks:
+            return None
+        return candidate
+    if isinstance(candidate, dict):
+        content = candidate.get('__content__', '') or ''
+        if content and content != title:
+            if content.count('?') <= max_questionmarks:
+                return content
+    return None
+
+
+def parse_datacite_titles(titles):
+    """
+    Given a list of title items from datacite, return 3-tuple (title,
+    original_language_title, subtitle). Missing values are None.
+
+    With several items, the first non-empty title without a "titleType" is
+    used as title, the first non-empty "Subtitle" as subtitle. Example input:
+
+        [
+            {
+                 "title": "Meeting Heterogeneity in Consumer Demand"
+            }
+        ]
+    """
+    title, original_language_title, subtitle = None, None, None
+
+    # None and empty lists carry no title information.
+    if not titles:
+        return title, original_language_title, subtitle
+
+    if len(titles) == 1:
+        # A single item is used as the title regardless of its type.
+        original_language_title = find_original_language_title(titles[0])
+        title = (titles[0].get('title') or '').strip() or None
+        return title, original_language_title, subtitle
+
+    for entry in titles:
+        # Items may lack "title" or carry None; treat both as empty text.
+        text = (entry.get('title') or '').strip()
+        if not title and not entry.get('titleType'):
+            title = text or None
+        if not subtitle and entry.get('titleType') == 'Subtitle':
+            subtitle = text or None
+        if not original_language_title:
+            original_language_title = find_original_language_title(entry)
+
+    return title, original_language_title, subtitle
+
+
+def parse_datacite_dates(dates):
+    """
+    Given a list of date fields (under .dates), return tuple, (release_date,
+    release_year).
+
+    Items are tried by "dateType" priority first; if none of those yields a
+    usable value, every item is tried in order, regardless of its type. A
+    year-only value yields (None, year); implausible years (before 1000 or
+    more than five years ahead) are skipped. Returns (None, None) if nothing
+    usable is found; a non-empty value that is not a list raises ValueError.
+    """
+    release_date, release_year = None, None
+
+    if not dates:
+        return release_date, release_year
+
+    if not isinstance(dates, list):
+        raise ValueError('expected a list of date items')
+
+    # Ignored: Collected, Issued.
+    date_type_prio = (
+        'Valid',
+        'Available',
+        'Accepted',
+        'Submitted',
+        'Copyrighted',
+        'Created',
+        'Updated',
+    )
+
+    # Before using (expensive) dateparser, try a few common patterns.
+    common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
+                       '%Y-%m-%dT%H:%M:%S', '%Y')
+
+    def parse_item(item):
+        """
+        Parse a single date item into (release_date, release_year); both
+        are None, if the value is missing, unparsable or implausible.
+        """
+        result, value, year_only = None, item.get('date') or '', False
+
+        for pattern in common_patterns:
+            try:
+                result = datetime.datetime.strptime(value, pattern)
+            except (TypeError, ValueError):
+                continue
+            else:
+                year_only = pattern == '%Y'
+                break
+
+        if result is None:
+            print('fallback for {}'.format(value), file=sys.stderr)
+            try:
+                result = dateparser.parse(value)
+            except TypeError as err:
+                print("{} date parsing failed with: {}".format(value, err),
+                      file=sys.stderr)
+                return None, None
+
+        if result is None:
+            # Unparsable date.
+            return None, None
+
+        if not 1000 <= result.year <= datetime.date.today().year + 5:
+            # Skip possibly bogus dates.
+            return None, None
+
+        if year_only:
+            return None, result.year
+        return result.date(), result.year
+
+    # First usable date of the highest priority type wins.
+    for prio in date_type_prio:
+        for item in dates:
+            if item.get('dateType') != prio:
+                continue
+            release_date, release_year = parse_item(item)
+            if release_year is not None:
+                return release_date, release_year
+
+    # No usable date with a prioritized type, try any item, in order.
+    for item in dates:
+        release_date, release_year = parse_item(item)
+        if release_year is not None:
+            break
+
+    return release_date, release_year
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 9c542fc6..ab67a310 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -2,10 +2,12 @@
 Test datacite importer.
 """
 
+import collections
 import datetime
 import pytest
 import gzip
 from fatcat_tools.importers import DataciteImporter, JsonLinePusher
+from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates
 from fixtures import api
 import json
 
@@ -22,7 +24,6 @@ def datacite_importer_existing(api):
         yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
                                bezerk_mode=False)
 
-
 @pytest.mark.skip(reason="larger datacite import slows tests down")
 def test_datacite_importer_huge(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
@@ -35,6 +36,179 @@ def test_datacite_importer_huge(datacite_importer):
     assert len(release.contribs) == 3
 
 
+def test_find_original_language_title():
+    """
+    Original language title may be a plain string or a dict; table of cases.
+    """
+    Case = collections.namedtuple('Case', 'about input result')
+    cases = [
+        Case('defaults to None', {}, None),
+        Case('ignore unknown keys', {'broken': 'kv'}, None),
+        Case('just a title', {'title': 'Noise Reduction'}, None),
+        Case('same title should be ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': 'Noise Reduction'
+        }, None),
+        Case('empty subdict is ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': {},
+        }, None),
+        Case('unknown subdict keys are ignored', {
+            'title': 'Noise Reduction',
+            'original_language_title': {'broken': 'kv'},
+        }, None),
+        Case('original string', {
+            'title': 'Noise Reduction',
+            'original_language_title': 'Подавление шума',
+        }, 'Подавление шума'),
+        Case('language tag is ignored, since its broken', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': 'Noise Reduction'
+            },
+        }, None),
+        Case('do not care about language', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': 'Rauschunterdrückung',
+            },
+        }, 'Rauschunterdrückung'),
+        Case('ignore excessive questionmarks', {
+            'title': 'Noise Reduction',
+            'original_language_title': {
+                'language': 'ja',
+                '__content__': '???? However',
+            },
+        }, None),
+    ]
+
+    for case in cases:
+        result = find_original_language_title(case.input)
+        assert result == case.result, case.about
+
+def test_parse_datacite_titles():
+    """
+    Table-driven check of parse_datacite_titles: every case maps a list of
+    title items to the expected (title, original_language_title, subtitle).
+    """
+    TitleCase = collections.namedtuple('Case', 'about input result')
+    table = [
+        TitleCase('handle None', None, (None, None, None)),
+        TitleCase('empty list', [], (None, None, None)),
+        TitleCase('empty item', [{}], (None, None, None)),
+        TitleCase('broken keys', [{'broken': 'kv'}], (None, None, None)),
+        TitleCase('title only', [{'title': 'Total carbon dioxide'}], (
+            'Total carbon dioxide', None, None,
+        )),
+        TitleCase('title and subtitle', [
+            {'title': 'Total carbon dioxide'},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', None, 'Station TT043_7-9',
+        )),
+        TitleCase('title, subtitle order does not matter', [
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+            {'title': 'Total carbon dioxide'},
+        ], (
+            'Total carbon dioxide', None, 'Station TT043_7-9',
+        )),
+        TitleCase('multiple titles, first wins', [
+            {'title': 'Total carbon dioxide'},
+            {'title': 'Meeting Heterogeneity'},
+        ], (
+            'Total carbon dioxide', None, None,
+        )),
+        TitleCase('multiple titles, plus sub', [
+            {'title': 'Total carbon dioxide'},
+            {'title': 'Meeting Heterogeneity'},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', None, 'Station TT043_7-9',
+        )),
+        TitleCase('multiple titles, multiple subs', [
+            {'title': 'Total carbon dioxide'},
+            {'title': 'Meeting Heterogeneity'},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+            {'title': 'Some other subtitle', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', None, 'Station TT043_7-9',
+        )),
+        TitleCase('title, original, sub', [
+            {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9',
+        )),
+        TitleCase('title, original same as title, sub', [
+            {'title': 'Total carbon dioxide', 'original_language_title': {
+                '__content__': 'Total carbon dioxide',
+            }},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', None, 'Station TT043_7-9',
+        )),
+        TitleCase('title, original dict, sub', [
+            {'title': 'Total carbon dioxide', 'original_language_title': {
+                '__content__': 'Всего углекислого газа',
+            }},
+            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
+        ], (
+            'Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9',
+        )),
+    ]
+
+    # The case description doubles as the assertion message.
+    for about, titles, expected in table:
+        assert parse_datacite_titles(titles) == expected, about
+
+def test_parse_datacite_dates():
+    """
+    Test datacite date parsing; result is (release_date, release_year).
+    """
+    Case = collections.namedtuple('Case', 'about input result')
+    cases = [
+        Case('None is None', None, (None, None)),
+        Case('empty list is None', [], (None, None)),
+        Case('empty item is None', [{}], (None, None)),
+        Case('year only', [{'date': '2019'}], (None, 2019)),
+        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)),
+        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)),
+        Case('first with type', [
+            {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
+        ], (None, 2019)),
+        Case('full date', [
+            {'date': '2019-12-01', 'dateType': 'Valid'},
+        ], (datetime.date(2019, 12, 1), 2019)),
+        Case('date type prio', [
+            {'date': '2000-12-01', 'dateType': 'Valid'},
+            {'date': '2010-01-01', 'dateType': 'Updated'},
+        ], (datetime.date(2000, 12, 1), 2000)),
+        Case('date type prio, Available > Updated', [
+            {'date': '2010-01-01', 'dateType': 'Updated'},
+            {'date': '2000-12-01', 'dateType': 'Available'},
+        ], (datetime.date(2000, 12, 1), 2000)),
+        Case('allow timestamp without zone, Available > Updated', [
+            {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
+            {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
+        ], (datetime.date(2000, 12, 1), 2000)),
+        Case('allow timestamp with Z suffix, Available > Updated', [
+            {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
+            {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
+        ], (datetime.date(2000, 12, 1), 2000)),
+        Case('allow fuzzy date formats, Available > Updated', [
+            {'date': '2010', 'dateType': 'Updated'},
+            {'date': '2000 Dec 01', 'dateType': 'Available'},
+        ], (datetime.date(2000, 12, 1), 2000)),
+        Case('ignore broken date', [
+            {'date': 'Febrrr 45', 'dateType': 'Updated'},
+        ], (None, None)),
+    ]
+    for case in cases:
+        result = parse_datacite_dates(case.input)
+        assert result == case.result, case.about
+
 def test_datacite_importer(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
     with open('tests/files/datacite_sample.jsonl', 'r') as f:
@@ -75,7 +249,7 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.release_type == "article"
         assert r.release_stage == "published"
         assert r.license_slug == None
-        assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+        assert r.original_title == None
         assert r.ext_ids.doi == "10.18730/8dym9"
         assert r.ext_ids.isbn13 == None
         assert r.language == "enc"
-- 
cgit v1.2.3


From 013d873c73f374f968559b6b70d9c2575b6dc47e Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 15:36:04 +0100
Subject: datacite: add missing --extid-map-file flag

---
 python/fatcat_import.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 90bb01a1..c6c74bc2 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -173,6 +173,7 @@ def run_datacite(args):
         bezerk_mode=args.bezerk_mode,
         debug=args.debug,
         lang_detect=args.lang_detect,
+        extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file)
     if args.kafka_mode:
         KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",
@@ -461,6 +462,9 @@ def main():
     sub_datacite.add_argument('issn_map_file',
         help="ISSN to ISSN-L mapping file",
         default=None, type=argparse.FileType('r'))
+    sub_datacite.add_argument('--extid-map-file',
+        help="DOI-to-other-identifiers sqlite3 database",
+        default=None, type=str)
     sub_datacite.add_argument('--kafka-mode',
         action='store_true',
         help="consume from kafka topic (not stdin)")
-- 
cgit v1.2.3


From 91bd7b82608e5e27a10c649cf8205243b8ba96c6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 15:37:13 +0100
Subject: datacite: use specific auth var

---
 python/fatcat_import.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index c6c74bc2..a17029cc 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -483,7 +483,7 @@ def main():
         help="write inserted documents into file (for debugging)")
     sub_datacite.set_defaults(
         func=run_datacite,
-        auth_var="FATCAT_API_AUTH_TOKEN",
+        auth_var="FATCAT_AUTH_WORKER_DATACITE",
     )
 
     args = parser.parse_args()
-- 
cgit v1.2.3


From a57919b05d8b1f24041713e85b7fa4322c0591c6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:24:50 +0100
Subject: datacite: use iso 639-1 codes

---
 python/fatcat_tools/importers/datacite.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 19b89edf..26520164 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -10,7 +10,7 @@ import datetime
 import fatcat_openapi_client
 import hashlib
 import json
-import langcodes
+import pycountry
 import langdetect
 import sqlite3
 import sys
@@ -433,12 +433,9 @@ class DataciteImporter(EntityImporter):
 
         value = attributes.get('language', '') or ''
         try:
-            language = langcodes.find(value).language
-        except LookupError:
-            try:
-                language = langcodes.get(value).language
-            except langcodes.tag_parser.LanguageTagError:
-                pass
+            language = pycountry.languages.lookup(value).alpha_2
+        except (LookupError, AttributeError) as err:
+            print('language lookup miss for {}: {}'.format(value, err), file=sys.stderr)
 
         # Abstracts appear in "attributes.descriptions[].descriptionType", some
         # of the observed values: "Methods", "TechnicalInfo",
-- 
cgit v1.2.3


From 097fa7660c60e6c52ac2adbdd82fe64c122b1e42 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:25:09 +0100
Subject: datacite: limit abstract length

---
 python/fatcat_tools/importers/datacite.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 26520164..66f812e2 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -16,6 +16,10 @@ import sqlite3
 import sys
 from fatcat_tools.transforms import entity_to_dict
 
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
     'Journal': 'journal',
@@ -450,6 +454,8 @@ class DataciteImporter(EntityImporter):
             if len(desc.get('description', '')) < 10:
                 continue
             text = desc.get('description')
+            if len(text) > MAX_ABSTRACT_LENGTH:
+                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
             lang = None
             if self.lang_detect:
                 try:
-- 
cgit v1.2.3


From a4cd65ed4897987e70520d81c7caa27cd28ed5a3 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:30:54 +0100
Subject: remove langcodes dependency

---
 python/Pipfile      |  1 -
 python/Pipfile.lock | 14 --------------
 2 files changed, 15 deletions(-)

(limited to 'python')

diff --git a/python/Pipfile b/python/Pipfile
index 6325c180..5d50b37c 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -47,7 +47,6 @@ pylatexenc = "*"
 pygal = "*"
 elasticsearch-dsl = ">=6.0.0,<7.0.0"
 elasticsearch = ">=6.0.0,<7.0.0"
-langcodes = ">=1.4"
 dateparser = ">=0.7"
 langdetect = "*"
 
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index f0f60aa8..25606b3c 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -299,13 +299,6 @@
             ],
             "version": "==2.5.0"
         },
-        "langcodes": {
-            "hashes": [
-                "sha256:22cff103b6c0f3c9907d9a51c2d634177a50a189672ad16d959a3e2cd48eadab"
-            ],
-            "index": "pypi",
-            "version": "==1.4.1"
-        },
         "langdetect": {
             "hashes": [
                 "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
@@ -352,13 +345,6 @@
             ],
             "version": "==4.4.2"
         },
-        "marisa-trie": {
-            "hashes": [
-                "sha256:4419abb6b603c97e863fad994abe57ed247fb12491f4bbacb2d762bd2e8958b6",
-                "sha256:c73bc25d868e8c4ea7aa7f1e19892db07bba2463351269b05340ccfa06eb2baf"
-            ],
-            "version": "==0.7.5"
-        },
         "markupsafe": {
             "hashes": [
                 "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
-- 
cgit v1.2.3


From 1f7bbc5a582db45fcd6034800959e158d35a2297 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:36:18 +0100
Subject: datacite: include doi in error messages

---
 python/fatcat_tools/importers/datacite.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 66f812e2..a4a3ef8b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -264,6 +264,7 @@ class DataciteImporter(EntityImporter):
             return None
 
         attributes = obj['attributes']
+        doi = attributes.get('doi', '').lower()
 
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
@@ -313,7 +314,7 @@ class DataciteImporter(EntityImporter):
                 contribs.append(fatcat_openapi_client.ReleaseContrib(
                     index=i, extra=extra))
             else:
-                print('unknown name type: {}'.format(nameType), file=sys.stderr)
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -322,7 +323,7 @@ class DataciteImporter(EntityImporter):
             titles)
 
         if not title:
-            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
 
         if not subtitle:
@@ -370,7 +371,7 @@ class DataciteImporter(EntityImporter):
                         container_title = container.get('title')
                         if isinstance(container_title, list):
                             if len(container_title) > 0:
-                                print('too many container titles: {}'.format(
+                                print('[{}] too many container titles: {}'.format(doi,
                                     len(container_title)))
                                 container_title = container_title[0]
                         assert isinstance(container_title, str)
@@ -398,7 +399,7 @@ class DataciteImporter(EntityImporter):
                 int(first_page) < int(last_page)
                 pages = '{}-{}'.format(first_page, last_page)
             except ValueError as err:
-                print(err, file=sys.stderr)
+                print('[{}] {}'.format(doi, err), file=sys.stderr)
                 pass
 
         if not pages and first_page:
@@ -426,7 +427,7 @@ class DataciteImporter(EntityImporter):
                 break
 
         if release_type is None:
-            print("no mapped type: {}".format(value), file=sys.stderr)
+            print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
 
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -439,7 +440,7 @@ class DataciteImporter(EntityImporter):
         try:
             language = pycountry.languages.lookup(value).alpha_2
         except (LookupError, AttributeError) as err:
-            print('language lookup miss for {}: {}'.format(value, err), file=sys.stderr)
+            print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr)
 
         # Abstracts appear in "attributes.descriptions[].descriptionType", some
         # of the observed values: "Methods", "TechnicalInfo",
@@ -461,7 +462,7 @@ class DataciteImporter(EntityImporter):
                 try:
                     lang = langdetect.detect(text)
                 except langdetect.lang_detect_exception.LangDetectException as err:
-                    print('language detection failed: {}'.format(err),
+                    print('[{}] language detection failed: {}'.format(doi, err),
                           file=sys.stderr)
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
@@ -534,7 +535,6 @@ class DataciteImporter(EntityImporter):
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
-        doi = attributes.get('doi', '').lower()
         extids = self.lookup_ext_ids(doi=doi)
 
         # Assemble release.
-- 
cgit v1.2.3


From 13430af9e8c2e39ba90a7db2135496503fb020b2 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:43:00 +0100
Subject: datacite: use clean on field values

---
 python/fatcat_tools/importers/datacite.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a4a3ef8b..16431928 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -4,7 +4,7 @@ Prototype Importer for datacite.org data.
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
 
-from .common import EntityImporter
+from .common import EntityImporter, clean
 import dateparser
 import datetime
 import fatcat_openapi_client
@@ -292,7 +292,20 @@ class DataciteImporter(EntityImporter):
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
-                    raw_affiliation = affiliations[0]
+                    raw_affiliation = clean(affiliations[0])
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+
+                if surname:
+                    surname = clean(surname)
 
                 contribs.append(
                     fatcat_openapi_client.ReleaseContrib(
@@ -325,9 +338,13 @@ class DataciteImporter(EntityImporter):
         if not title:
             print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
+        else:
+            title = clean(title)
 
         if not subtitle:
             subtitle = None
+        else:
+            subtitle = clean(subtitle)
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -352,6 +369,9 @@ class DataciteImporter(EntityImporter):
             # werden"
             publisher = None
 
+        if publisher:
+            publisher = clean(publisher)
+
         # Container. For the moment, only ISSN as container.
         container_id = None
 
@@ -388,6 +408,12 @@ class DataciteImporter(EntityImporter):
         volume = container.get('volume')
         issue = container.get('issue')
 
+        if volume:
+            volume = clean(volume)
+
+        if issue:
+            issue = clean(issue)
+
         # Pages.
         pages = None
 
-- 
cgit v1.2.3


From 133cc910c07187a349a336b1b7107d67dc3aed3e Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 17:53:26 +0100
Subject: datacite: include container_name top level key in extra

---
 python/fatcat_tools/importers/datacite.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 16431928..322e3db9 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -374,6 +374,7 @@ class DataciteImporter(EntityImporter):
 
         # Container. For the moment, only ISSN as container.
         container_id = None
+        container_name = None
 
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
@@ -388,21 +389,30 @@ class DataciteImporter(EntityImporter):
                     container_id = self.lookup_issnl(issnl)
 
                     if container_id is None and container.get('title'):
-                        container_title = container.get('title')
-                        if isinstance(container_title, list):
-                            if len(container_title) > 0:
+                        container_name = container.get('title')
+                        if isinstance(container_name, list):
+                            if len(container_name) > 0:
                                 print('[{}] too many container titles: {}'.format(doi,
-                                    len(container_title)))
-                                container_title = container_title[0]
-                        assert isinstance(container_title, str)
+                                    len(container_name)))
+                                container_name = container_name[0]
+                        assert isinstance(container_name, str)
                         ce = fatcat_openapi_client.ContainerEntity(
                             issnl=issnl,
                             container_type=container_type,
-                            name=container_title,
+                            name=container_name,
                         )
                         ce_edit = self.create_container(ce)
                         container_id = ce_edit.ident
                         self._issnl_id_map[issnl] = container_id
+                else:
+                    # TODO(martin): factor this out into a testable function.
+                    # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
+                    container_name = container.get('title')
+                    if isinstance(container_name, list):
+                        if len(container_name) > 0:
+                            print('[{}] too many container titles: {}'.format(doi,
+                                len(container_name)))
+                            container_name = container_name[0]
 
         # Volume and issue.
         volume = container.get('volume')
@@ -558,6 +568,10 @@ class DataciteImporter(EntityImporter):
 
         extra = dict()
 
+        # top-level extra keys
+        if not container_id and container_name:
+            extra['container_name'] = container_name
+
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
-- 
cgit v1.2.3


From d0fea0bb56c80caa1bb32a725d1ed4424df5ac39 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 23:52:25 +0100
Subject: datacite: treat untyped names as people

---
 python/fatcat_tools/importers/datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 322e3db9..0de15a18 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -274,7 +274,7 @@ class DataciteImporter(EntityImporter):
 
         for i, c in enumerate(attributes['creators']):
             nameType = c.get('nameType', '') or ''
-            if nameType == 'Personal':
+            if nameType == 'Personal' or nameType == '':
                 creator_id = None
                 for nid in c.get('nameIdentifiers', []):
                     if not nid.get('nameIdentifierScheme').lower() == "orcid":
-- 
cgit v1.2.3


From 9a2a7e35948e350aaf40b07d4d4427d288970d3f Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 26 Dec 2019 23:52:40 +0100
Subject: datacite: adjust tests

---
 python/tests/import_datacite.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index ab67a310..bc47a185 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -252,7 +252,7 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.original_title == None
         assert r.ext_ids.doi == "10.18730/8dym9"
         assert r.ext_ids.isbn13 == None
-        assert r.language == "enc"
+        assert r.language == "en"
         assert r.subtitle == None
         assert r.release_date == None
         assert r.release_year == 1986
@@ -262,7 +262,6 @@ def test_datacite_dict_parse(datacite_importer):
         assert 'funder' not in r.extra['datacite']
         # matched by ISSN, so shouldn't be in there
         #assert extra['container_name'] == "International Journal of Quantum Chemistry"
-        assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9'
         assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
         assert len(r.abstracts) == 1
         assert len(r.abstracts[0].content) == 421
-- 
cgit v1.2.3


From 161103859c556593922682f2bb24e18cc5a48cae Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 27 Dec 2019 00:09:34 +0100
Subject: datacite: suppress debug-like language lookup miss message

---
 python/fatcat_tools/importers/datacite.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 0de15a18..a651b9fe 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -476,7 +476,9 @@ class DataciteImporter(EntityImporter):
         try:
             language = pycountry.languages.lookup(value).alpha_2
         except (LookupError, AttributeError) as err:
-            print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr)
+            pass
+            # TODO(martin): Print this on debug level, only.
+            # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr)
 
         # Abstracts appear in "attributes.descriptions[].descriptionType", some
         # of the observed values: "Methods", "TechnicalInfo",
-- 
cgit v1.2.3


From fb3739aa74cc8ddc29222231b581fea5a2e16196 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 27 Dec 2019 00:28:15 +0100
Subject: datacite: page number misses are too common

These messages should be logged at debug level, not info.

Examples: E675, n/a, 15D.2.1, 15D.2.1, A.1E.1, A.1E.1, ...
---
 python/fatcat_tools/importers/datacite.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a651b9fe..a0eb92ad 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -435,7 +435,8 @@ class DataciteImporter(EntityImporter):
                 int(first_page) < int(last_page)
                 pages = '{}-{}'.format(first_page, last_page)
             except ValueError as err:
-                print('[{}] {}'.format(doi, err), file=sys.stderr)
+                # TODO(martin): This is more debug than info.
+                # print('[{}] {}'.format(doi, err), file=sys.stderr)
                 pass
 
         if not pages and first_page:
-- 
cgit v1.2.3


From b49cbfd528f3aa02beb2e0d1f76658e4dff0aa22 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 28 Dec 2019 23:19:41 +0100
Subject: datacite: update docs with observed values

---
 python/fatcat_tools/importers/datacite.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a0eb92ad..e1a746c7 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -747,7 +747,9 @@ def parse_datacite_dates(dates):
     if not isinstance(dates, list):
         raise ValueError('expected a list of date items')
 
-    # Ignored: Collected, Issued.
+    # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",
+    # "Collected", "Updated", "Copyrighted", "Created"
+    # Ignored for now: "Collected", "Issued"
     date_type_prio = (
         'Valid',
         'Available',
-- 
cgit v1.2.3


From e92f003ff38001e1611e5df2753bc6f122dd14f2 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sun, 29 Dec 2019 00:39:13 +0100
Subject: datacite: check for empty title after clean

---
 python/fatcat_tools/importers/datacite.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e1a746c7..fb945ba6 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -335,11 +335,14 @@ class DataciteImporter(EntityImporter):
         title, original_language_title, subtitle = parse_datacite_titles(
             titles)
 
+        if title is None:
+            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+            return False
+
+        title = clean(title)
         if not title:
             print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
-        else:
-            title = clean(title)
 
         if not subtitle:
             subtitle = None
-- 
cgit v1.2.3


From eb383a232a64270e26457e17e74adf26934b541c Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 30 Dec 2019 18:03:07 +0100
Subject: datacite: perform additional checks on contrib

---
 python/fatcat_tools/importers/datacite.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fb945ba6..f681b51b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -307,13 +307,19 @@ class DataciteImporter(EntityImporter):
                 if surname:
                     surname = clean(surname)
 
+                if not name:
+                    continue
+
+                if raw_affiliation is not None and not raw_affiliation:
+                    continue
+
                 contribs.append(
                     fatcat_openapi_client.ReleaseContrib(
                         creator_id=creator_id,
                         index=i,
-                        raw_name=c.get('name'),
-                        given_name=c.get('givenName'),
-                        surname=c.get('familyName'),
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
                         role='author',
                         raw_affiliation=raw_affiliation,
                     ))
-- 
cgit v1.2.3


From 1f27a42ac56d7b986905097fba662c2b18d5e8f8 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 01:13:12 +0100
Subject: datacite: update docs

---
 python/fatcat_tools/importers/datacite.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f681b51b..7f0482b4 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,7 +1,11 @@
 """
-Prototype Importer for datacite.org data.
+Prototype importer for datacite.org data.
 
-Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
+Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+
+Datacite being a aggregator, the data is varied and exposes a couple of
+problems in content and structure. A few fields habe their own parsing
+functions (parse_datacite_...), which can be tested more easily.
 """
 
 from .common import EntityImporter, clean
@@ -682,6 +686,8 @@ def lookup_license_slug(raw):
 def find_original_language_title(item, min_length=4, max_questionmarks=3):
     """
     Perform a few checks before returning a potential original language title.
+
+    Example input: {'title': 'Some title', 'original_language_title': 'Some title'}
     """
     if not 'original_language_title' in item:
         return None
@@ -709,13 +715,7 @@ def parse_datacite_titles(titles):
     Given a list of title items from datacite, return 3-tuple (title,
     original_language_title, subtitle).
 
-    Example input:
-
-        [
-            {
-                 "title": "Meeting Heterogeneity in Consumer Demand"
-            }
-        ]
+    Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}]
     """
     title, original_language_title, subtitle = None, None, None
 
-- 
cgit v1.2.3


From 5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 01:27:13 +0100
Subject: datacite: clean doi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address an issue with DOIs that contain an EN DASH (U+2013) instead of a HYPHEN-MINUS.

> "external identifier doesn't match required pattern for a DOI (expected,
eg, '10.1234/aksjdfh'): 10.25513/1812-3996.2017.1.34–42"
---
 python/fatcat_tools/importers/datacite.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7f0482b4..5b3065aa 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -268,7 +268,7 @@ class DataciteImporter(EntityImporter):
             return None
 
         attributes = obj['attributes']
-        doi = attributes.get('doi', '').lower()
+        doi = clean_doi(attributes.get('doi', '').lower())
 
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
@@ -832,3 +832,15 @@ def parse_datacite_dates(dates):
                 break
 
     return release_date, release_year
+
+def clean_doi(doi):
+    """
+    10.25513/1812-3996.2017.1.34–42 // 8211, Hex 2013, Octal 20023
+    See also: https://github.com/miku/throwaway-check-doi
+
+    Replace unicode HYPHEN..HORIZONTAL BAR with HYPHEN-MINUS.
+    """
+    for c in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015'):
+        doi = doi.replace(c, "-")
+    return doi
+
-- 
cgit v1.2.3


From d951c59c1086f0cdda8683e1dd9083d9512886f3 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 03:10:30 +0100
Subject: datacite: skip non-ascii doi for now
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Example of a non-ascii doi:

* https://doi.org/10.13125/américacrítica/3017
---
 python/fatcat_tools/importers/datacite.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 5b3065aa..b16f333a 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -270,6 +270,10 @@ class DataciteImporter(EntityImporter):
         attributes = obj['attributes']
         doi = clean_doi(attributes.get('doi', '').lower())
 
+        if not doi.isascii():
+            print('[{}] skipping non-ascii doi for now'.format(doi))
+            return None
+
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
         # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
-- 
cgit v1.2.3


From 0c6332792d23f7bd5dd9508f28ffb7ddaee741f6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 03:17:10 +0100
Subject: datacite: isascii was added in 3.7, only

---
 python/fatcat_tools/importers/datacite.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index b16f333a..fc97b1e3 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -181,6 +181,12 @@ LICENSE_SLUG_MAP = {
     # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
 }
 
+# TODO(martin): drop this after 3.7 upgrade
+try:
+    isascii = str.isascii # new in 3.7, https://docs.python.org/3/library/stdtypes.html#str.isascii
+except AttributeError:
+    isascii = lambda s: len(s) == len(s.encode())
+
 
 class DataciteImporter(EntityImporter):
     """
@@ -270,7 +276,7 @@ class DataciteImporter(EntityImporter):
         attributes = obj['attributes']
         doi = clean_doi(attributes.get('doi', '').lower())
 
-        if not doi.isascii():
+        if not isascii(doi):
             print('[{}] skipping non-ascii doi for now'.format(doi))
             return None
 
-- 
cgit v1.2.3


From f51fe4e6572e30214003e7ec4b7444c07663543b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 14:23:32 +0100
Subject: datacite: fix typo

---
 python/fatcat_tools/importers/datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fc97b1e3..f65482e7 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -3,7 +3,7 @@ Prototype importer for datacite.org data.
 
 Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
 
-Datacite being a aggregator, the data is varied and exposes a couple of
+Datacite being an aggregator, the data is varied and exposes a couple of
 problems in content and structure. A few fields habe their own parsing
 functions (parse_datacite_...), which can be tested more easily.
 """
-- 
cgit v1.2.3


From 02f5298be0da677e52621e7e6be682e07b9fce7e Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 14:23:51 +0100
Subject: datacite: ensure name schema is defined

---
 python/fatcat_tools/importers/datacite.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f65482e7..f8080c10 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -291,7 +291,8 @@ class DataciteImporter(EntityImporter):
             if nameType == 'Personal' or nameType == '':
                 creator_id = None
                 for nid in c.get('nameIdentifiers', []):
-                    if not nid.get('nameIdentifierScheme').lower() == "orcid":
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if not name_scheme.lower() == "orcid":
                         continue
                     orcid = nid.get('nameIdentifier',
                                     '').replace('https://orcid.org/', '')
-- 
cgit v1.2.3


From 90eb8a70796230b29ec19142482f2503bae55252 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 31 Dec 2019 23:42:30 +0100
Subject: datacite: address 'Unpublished' publisher

---
 python/fatcat_tools/importers/datacite.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f8080c10..854085b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,12 +378,21 @@ class DataciteImporter(EntityImporter):
         release_date, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
+        # Start with clear stages, e.g. published. TODO(martin): we could
+        # probably infer a bit more from the relations, e.g.
+        # "IsPreviousVersionOf" or "IsNewVersionOf".
+        release_stage = None
+        if attributes.get(
+                'state') == 'findable' or attributes.get('isActive') is True:
+            release_stage = 'published'
+
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
         if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
-                         '(:none)'):
+                         '(:none)', 'Unpublished'):
             publisher = None
+            release_stage = None
         if publisher is not None and len(publisher) > 80:
             # Arbitrary magic value max length. TODO(martin): better heuristic,
             # but factored out; first we have to log misses. Example:
@@ -561,14 +570,6 @@ class DataciteImporter(EntityImporter):
                 ))
             ref_index += 1
 
-        # Start with clear stages, e.g. published. TODO(martin): we could
-        # probably infer a bit more from the relations, e.g.
-        # "IsPreviousVersionOf" or "IsNewVersionOf".
-        release_stage = None
-        if attributes.get(
-                'state') == 'findable' or attributes.get('isActive') is True:
-            release_stage = 'published'
-
         # Extra information.
         extra_datacite = dict()
 
-- 
cgit v1.2.3


From 1ca0d81c0d9be268aa952a3dd42fa51e2be7c88c Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 1 Jan 2020 12:58:07 +0100
Subject: datacite: avoid more None values

---
 python/fatcat_tools/importers/datacite.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 854085b8..37fceb1c 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -524,9 +524,9 @@ class DataciteImporter(EntityImporter):
         for desc in descs:
             if not desc.get('descriptionType') == 'Abstract':
                 continue
-            if len(desc.get('description', '')) < 10:
+            if len(desc.get('description', '') or '') < 10:
                 continue
-            text = desc.get('description')
+            text = desc.get('description', '')
             if len(text) > MAX_ABSTRACT_LENGTH:
                 text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
             lang = None
@@ -556,10 +556,10 @@ class DataciteImporter(EntityImporter):
 
         relIds = attributes.get('relatedIdentifiers', []) or []
         for rel in relIds:
-            if not rel.get('relationType') == 'References':
+            if not rel.get('relationType', '') == 'References':
                 continue
             ref_extra = dict()
-            if rel.get('relatedIdentifierType') == 'DOI':
+            if rel.get('relatedIdentifierType', '') == 'DOI':
                 ref_extra['doi'] = rel.get('relatedIdentifier')
             if not ref_extra:
                 ref_extra = None
-- 
cgit v1.2.3


From b87ba235c0a7da15d70c5ab7fa367d7b9c1fb981 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 17:35:39 +0100
Subject: datacite: names can be 'Unav', too

---
 python/fatcat_tools/importers/datacite.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 37fceb1c..19c71d24 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -325,7 +325,10 @@ class DataciteImporter(EntityImporter):
                 if not name:
                     continue
 
-                if raw_affiliation is not None and not raw_affiliation:
+                if raw_affiliation == '':
+                    continue
+
+                if name in ('(:Unav)', 'NA'):
                     continue
 
                 contribs.append(
-- 
cgit v1.2.3


From 96e38edde79735b4080ec08d57e9f54759e97b61 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 17:35:54 +0100
Subject: datacite: add conversion fixtures

The `test_datacite_conversions` function compares each input (datacite)
document to its expected output (release entity as JSON). To add a
case, add a `datacite_doc_NN.json` input and a matching
`datacite_result_NN.json` output, then increase the counter in the
range loop within the test.

To view input and result side by side with vim, change into the test
directory and run:

    tests/files/datacite $ ./caseview.sh 18
---
 python/tests/files/datacite/caseview.sh            |  17 +
 python/tests/files/datacite/datacite_doc_00.json   | 140 +++++
 python/tests/files/datacite/datacite_doc_01.json   |  81 +++
 python/tests/files/datacite/datacite_doc_02.json   |  85 +++
 python/tests/files/datacite/datacite_doc_03.json   |  70 +++
 python/tests/files/datacite/datacite_doc_04.json   |  80 +++
 python/tests/files/datacite/datacite_doc_05.json   | 598 +++++++++++++++++++++
 python/tests/files/datacite/datacite_doc_06.json   |  83 +++
 python/tests/files/datacite/datacite_doc_07.json   | 120 +++++
 python/tests/files/datacite/datacite_doc_08.json   | 105 ++++
 python/tests/files/datacite/datacite_doc_09.json   | 130 +++++
 python/tests/files/datacite/datacite_doc_10.json   |  83 +++
 python/tests/files/datacite/datacite_doc_11.json   |  86 +++
 python/tests/files/datacite/datacite_doc_12.json   | 103 ++++
 python/tests/files/datacite/datacite_doc_13.json   |  86 +++
 python/tests/files/datacite/datacite_doc_14.json   | 166 ++++++
 python/tests/files/datacite/datacite_doc_15.json   |  79 +++
 python/tests/files/datacite/datacite_doc_16.json   |  80 +++
 python/tests/files/datacite/datacite_doc_17.json   |  72 +++
 python/tests/files/datacite/datacite_doc_18.json   |  79 +++
 python/tests/files/datacite/datacite_doc_19.json   |  79 +++
 python/tests/files/datacite/datacite_doc_20.json   |  42 ++
 python/tests/files/datacite/datacite_doc_21.json   |  42 ++
 python/tests/files/datacite/datacite_doc_22.json   |  44 ++
 python/tests/files/datacite/datacite_doc_23.json   |  44 ++
 .../tests/files/datacite/datacite_result_00.json   |  87 +++
 .../tests/files/datacite/datacite_result_01.json   |  32 ++
 .../tests/files/datacite/datacite_result_02.json   |  36 ++
 .../tests/files/datacite/datacite_result_03.json   |  19 +
 .../tests/files/datacite/datacite_result_04.json   |  28 +
 .../tests/files/datacite/datacite_result_05.json   | 530 ++++++++++++++++++
 .../tests/files/datacite/datacite_result_06.json   |  26 +
 .../tests/files/datacite/datacite_result_07.json   |  73 +++
 .../tests/files/datacite/datacite_result_08.json   |  53 ++
 .../tests/files/datacite/datacite_result_09.json   |  35 ++
 .../tests/files/datacite/datacite_result_10.json   |  32 ++
 .../tests/files/datacite/datacite_result_11.json   |  21 +
 .../tests/files/datacite/datacite_result_12.json   |  44 ++
 .../tests/files/datacite/datacite_result_13.json   |  28 +
 .../tests/files/datacite/datacite_result_14.json   | 110 ++++
 .../tests/files/datacite/datacite_result_15.json   |  22 +
 .../tests/files/datacite/datacite_result_16.json   |  31 ++
 .../tests/files/datacite/datacite_result_17.json   |  20 +
 .../tests/files/datacite/datacite_result_18.json   |  15 +
 .../tests/files/datacite/datacite_result_19.json   |  15 +
 .../tests/files/datacite/datacite_result_20.json   |  14 +
 .../tests/files/datacite/datacite_result_21.json   |  15 +
 .../tests/files/datacite/datacite_result_22.json   |  22 +
 .../tests/files/datacite/datacite_result_23.json   |  22 +
 python/tests/import_datacite.py                    |  26 +-
 50 files changed, 3949 insertions(+), 1 deletion(-)
 create mode 100755 python/tests/files/datacite/caseview.sh
 create mode 100644 python/tests/files/datacite/datacite_doc_00.json
 create mode 100644 python/tests/files/datacite/datacite_doc_01.json
 create mode 100644 python/tests/files/datacite/datacite_doc_02.json
 create mode 100644 python/tests/files/datacite/datacite_doc_03.json
 create mode 100644 python/tests/files/datacite/datacite_doc_04.json
 create mode 100644 python/tests/files/datacite/datacite_doc_05.json
 create mode 100644 python/tests/files/datacite/datacite_doc_06.json
 create mode 100644 python/tests/files/datacite/datacite_doc_07.json
 create mode 100644 python/tests/files/datacite/datacite_doc_08.json
 create mode 100644 python/tests/files/datacite/datacite_doc_09.json
 create mode 100644 python/tests/files/datacite/datacite_doc_10.json
 create mode 100644 python/tests/files/datacite/datacite_doc_11.json
 create mode 100644 python/tests/files/datacite/datacite_doc_12.json
 create mode 100644 python/tests/files/datacite/datacite_doc_13.json
 create mode 100644 python/tests/files/datacite/datacite_doc_14.json
 create mode 100644 python/tests/files/datacite/datacite_doc_15.json
 create mode 100644 python/tests/files/datacite/datacite_doc_16.json
 create mode 100644 python/tests/files/datacite/datacite_doc_17.json
 create mode 100644 python/tests/files/datacite/datacite_doc_18.json
 create mode 100644 python/tests/files/datacite/datacite_doc_19.json
 create mode 100644 python/tests/files/datacite/datacite_doc_20.json
 create mode 100644 python/tests/files/datacite/datacite_doc_21.json
 create mode 100644 python/tests/files/datacite/datacite_doc_22.json
 create mode 100644 python/tests/files/datacite/datacite_doc_23.json
 create mode 100644 python/tests/files/datacite/datacite_result_00.json
 create mode 100644 python/tests/files/datacite/datacite_result_01.json
 create mode 100644 python/tests/files/datacite/datacite_result_02.json
 create mode 100644 python/tests/files/datacite/datacite_result_03.json
 create mode 100644 python/tests/files/datacite/datacite_result_04.json
 create mode 100644 python/tests/files/datacite/datacite_result_05.json
 create mode 100644 python/tests/files/datacite/datacite_result_06.json
 create mode 100644 python/tests/files/datacite/datacite_result_07.json
 create mode 100644 python/tests/files/datacite/datacite_result_08.json
 create mode 100644 python/tests/files/datacite/datacite_result_09.json
 create mode 100644 python/tests/files/datacite/datacite_result_10.json
 create mode 100644 python/tests/files/datacite/datacite_result_11.json
 create mode 100644 python/tests/files/datacite/datacite_result_12.json
 create mode 100644 python/tests/files/datacite/datacite_result_13.json
 create mode 100644 python/tests/files/datacite/datacite_result_14.json
 create mode 100644 python/tests/files/datacite/datacite_result_15.json
 create mode 100644 python/tests/files/datacite/datacite_result_16.json
 create mode 100644 python/tests/files/datacite/datacite_result_17.json
 create mode 100644 python/tests/files/datacite/datacite_result_18.json
 create mode 100644 python/tests/files/datacite/datacite_result_19.json
 create mode 100644 python/tests/files/datacite/datacite_result_20.json
 create mode 100644 python/tests/files/datacite/datacite_result_21.json
 create mode 100644 python/tests/files/datacite/datacite_result_22.json
 create mode 100644 python/tests/files/datacite/datacite_result_23.json

(limited to 'python')

diff --git a/python/tests/files/datacite/caseview.sh b/python/tests/files/datacite/caseview.sh
new file mode 100755
index 00000000..d1e98c04
--- /dev/null
+++ b/python/tests/files/datacite/caseview.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Open input and output in vertical vim split.
+#
+# $ caseview 13
+#
+view() {
+    if [ -z "$1" ]; then
+        echo usage: "$0" CASE-NUMBER
+        exit 1
+    else
+        padded=$(printf "%02d\n" "$1")
+        vim -O "datacite_doc_$padded.json" "datacite_result_$padded.json"
+    fi
+}
+
+view "$@"
diff --git a/python/tests/files/datacite/datacite_doc_00.json b/python/tests/files/datacite/datacite_doc_00.json
new file mode 100644
index 00000000..248f525f
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_00.json
@@ -0,0 +1,140 @@
+{
+  "id": "10.1007/s10870-008-9413-z",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.1007/s10870-008-9413-z",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.1007/s10870-008-9413-z",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "s10870-008-9413-z",
+        "identifierType": "Publisher ID"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Li, Qian-Jin",
+        "nameType": "Personal",
+        "givenName": "Qian-Jin",
+        "familyName": "Li",
+        "affiliation": []
+      },
+      {
+        "name": "Yang, Chun-Long",
+        "nameType": "Personal",
+        "givenName": "Chun-Long",
+        "familyName": "Yang",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea"
+      }
+    ],
+    "publisher": "Springer Science and Business Media LLC",
+    "container": {
+      "type": "Journal",
+      "issue": "12",
+      "title": "Journal of Chemical Crystallography",
+      "volume": "38",
+      "lastPage": "930",
+      "firstPage": "927",
+      "identifier": "1074-1542",
+      "identifierType": "ISSN"
+    },
+    "publicationYear": 2008,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2008-05-30",
+        "dateType": "Issued"
+      },
+      {
+        "date": "2019-05-31T04:04:23Z",
+        "dateType": "Updated"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "JOUR",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "JournalArticle",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [
+      {
+        "relationType": "IsPartOf",
+        "relatedIdentifier": "1074-1542",
+        "resourceTypeGeneral": "Collection",
+        "relatedIdentifierType": "ISSN"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1016/j.bmcl.2005.09.033",
+        "relatedIdentifierType": "DOI"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1016/s0022-1139(02)00330-5",
+        "relatedIdentifierType": "DOI"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1016/s0010-8545(01)00337-x",
+        "relatedIdentifierType": "DOI"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1016/j.tetlet.2005.06.135",
+        "relatedIdentifierType": "DOI"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1039/p298700000s1",
+        "relatedIdentifierType": "DOI"
+      },
+      {
+        "relationType": "References",
+        "relatedIdentifier": "10.1002/anie.199515551",
+        "relatedIdentifierType": "DOI"
+      }
+    ],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [
+      {
+        "rightsUri": "http://www.springer.com/tdm"
+      }
+    ],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://link.springer.com/10.1007/s10870-008-9413-z",
+    "contentUrl": null,
+    "metadataVersion": 1,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": "levriero",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-06-18T14:52:19.000Z",
+    "registered": null,
+    "published": "2008",
+    "updated": "2019-08-03T00:03:40.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "crossref.citations",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_01.json b/python/tests/files/datacite/datacite_doc_01.json
new file mode 100644
index 00000000..c4ef6e45
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_01.json
@@ -0,0 +1,81 @@
+{
+  "id": "10.11588/diglit.25558.39",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.11588/diglit.25558.39",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.11588/diglit.25558.39",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Dargenty, G.",
+        "nameType": "Personal",
+        "givenName": "G.",
+        "familyName": "Dargenty",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "lang": "de",
+        "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887"
+      }
+    ],
+    "publisher": "University Library Heidelberg",
+    "container": {},
+    "publicationYear": 1887,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1887",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "fre",
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [
+      {
+        "lang": "de",
+        "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html"
+      }
+    ],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://digi.ub.uni-heidelberg.de/diglit/art1887_1/0172",
+    "contentUrl": null,
+    "metadataVersion": 4,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2016-12-08T07:43:15.000Z",
+    "registered": "2016-12-08T07:43:15.000Z",
+    "published": "1887",
+    "updated": "2019-08-02T14:27:33.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "gesis.ubhd",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_02.json b/python/tests/files/datacite/datacite_doc_02.json
new file mode 100644
index 00000000..8b9a594e
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_02.json
@@ -0,0 +1,85 @@
+{
+  "id": "10.11588/diglit.37715.57",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.11588/diglit.37715.57",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.11588/diglit.37715.57",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Weyersberg, Albert",
+        "nameType": "Personal",
+        "givenName": "Albert",
+        "familyName": "Weyersberg",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "lang": "de",
+        "title": "Solinger Schwertschmiede-Familien, [4]"
+      }
+    ],
+    "publisher": "University Library Heidelberg",
+    "container": {},
+    "publicationYear": 1897,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1897",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "ger",
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [
+      {
+        "lang": "de",
+        "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/"
+      },
+      {
+        "lang": "en",
+        "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/"
+      }
+    ],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://digi.ub.uni-heidelberg.de/diglit/zhwk1897_1899/0131",
+    "contentUrl": null,
+    "metadataVersion": 2,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2018-11-29T12:04:12.000Z",
+    "registered": "2018-11-29T12:04:13.000Z",
+    "published": "1897",
+    "updated": "2019-08-02T21:31:04.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "gesis.ubhd",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_03.json b/python/tests/files/datacite/datacite_doc_03.json
new file mode 100644
index 00000000..e77a359c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_03.json
@@ -0,0 +1,70 @@
+{
+  "id": "10.13140/rg.2.2.30434.53446",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.13140/rg.2.2.30434.53446",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.13140/rg.2.2.30434.53446",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Mastura Yahya",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "midterm ah30903"
+      }
+    ],
+    "publisher": "Unpublished",
+    "container": {},
+    "publicationYear": 2016,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2016",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "ms",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://rgdoi.net/10.13140/RG.2.2.30434.53446",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2016-11-03T09:07:08.000Z",
+    "registered": "2016-11-03T09:07:09.000Z",
+    "published": "2016",
+    "updated": "2019-08-02T12:51:15.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "rg.rg",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_04.json b/python/tests/files/datacite/datacite_doc_04.json
new file mode 100644
index 00000000..8655a26a
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_04.json
@@ -0,0 +1,80 @@
+{
+  "id": "10.14288/1.0080520",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.14288/1.0080520",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.14288/1.0080520",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Nicollerat, Marc Andre",
+        "nameType": "Personal",
+        "givenName": "Marc Andre",
+        "familyName": "Nicollerat",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "On chain maps inducing isomorphisms in homology"
+      }
+    ],
+    "publisher": "University of British Columbia",
+    "container": {},
+    "publicationYear": 1973,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1973",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "en",
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) → K(I) and a natural transformation [formula omitted]  (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://doi.library.ubc.ca/10.14288/1.0080520",
+    "contentUrl": null,
+    "metadataVersion": 5,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2015-11-11T11:12:34.000Z",
+    "registered": "2015-11-11T11:12:35.000Z",
+    "published": "1973",
+    "updated": "2019-08-02T09:43:14.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "cisti.ubc",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_05.json b/python/tests/files/datacite/datacite_doc_05.json
new file mode 100644
index 00000000..75e68e9d
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_05.json
@@ -0,0 +1,598 @@
+{
+  "id": "10.15156/bio/sh409843.07fu",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.15156/bio/sh409843.07fu",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.15156/bio/sh409843.07fu",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Kõljalg, Urmas",
+        "nameType": "Personal",
+        "givenName": "Urmas",
+        "familyName": "Kõljalg",
+        "affiliation": []
+      },
+      {
+        "name": "Abarenkov, Kessy",
+        "nameType": "Personal",
+        "givenName": "Kessy",
+        "familyName": "Abarenkov",
+        "affiliation": []
+      },
+      {
+        "name": "Nilsson, R. Henrik",
+        "nameType": "Personal",
+        "givenName": "R. Henrik",
+        "familyName": "Nilsson",
+        "affiliation": []
+      },
+      {
+        "name": "Larsson, Karl-Henrik",
+        "nameType": "Personal",
+        "givenName": "Karl-Henrik",
+        "familyName": "Larsson",
+        "affiliation": []
+      },
+      {
+        "name": "Aas, Anders Bjørnsgard",
+        "nameType": "Personal",
+        "givenName": "Anders Bjørnsgard",
+        "familyName": "Aas",
+        "affiliation": []
+      },
+      {
+        "name": "Adams, Rachel",
+        "nameType": "Personal",
+        "givenName": "Rachel",
+        "familyName": "Adams",
+        "affiliation": []
+      },
+      {
+        "name": "Alves, Artur",
+        "nameType": "Personal",
+        "givenName": "Artur",
+        "familyName": "Alves",
+        "affiliation": []
+      },
+      {
+        "name": "Ammirati, Joseph F.",
+        "nameType": "Personal",
+        "givenName": "Joseph F.",
+        "familyName": "Ammirati",
+        "affiliation": []
+      },
+      {
+        "name": "Arnold, A. Elizabeth",
+        "nameType": "Personal",
+        "givenName": "A. Elizabeth",
+        "familyName": "Arnold",
+        "affiliation": []
+      },
+      {
+        "name": "Bahram, Mohammad",
+        "nameType": "Personal",
+        "givenName": "Mohammad",
+        "familyName": "Bahram",
+        "affiliation": []
+      },
+      {
+        "name": "Bengtsson-Palme, Johan",
+        "nameType": "Personal",
+        "givenName": "Johan",
+        "familyName": "Bengtsson-Palme",
+        "affiliation": []
+      },
+      {
+        "name": "Berlin, Anna",
+        "nameType": "Personal",
+        "givenName": "Anna",
+        "familyName": "Berlin",
+        "affiliation": []
+      },
+      {
+        "name": "Botnen, Synnøve",
+        "nameType": "Personal",
+        "givenName": "Synnøve",
+        "familyName": "Botnen",
+        "affiliation": []
+      },
+      {
+        "name": "Bourlat, Sarah",
+        "nameType": "Personal",
+        "givenName": "Sarah",
+        "familyName": "Bourlat",
+        "affiliation": []
+      },
+      {
+        "name": "Cheeke, Tanya",
+        "nameType": "Personal",
+        "givenName": "Tanya",
+        "familyName": "Cheeke",
+        "affiliation": []
+      },
+      {
+        "name": "Dima, Bálint",
+        "nameType": "Personal",
+        "givenName": "Bálint",
+        "familyName": "Dima",
+        "affiliation": []
+      },
+      {
+        "name": "Drenkhan, Rein",
+        "nameType": "Personal",
+        "givenName": "Rein",
+        "familyName": "Drenkhan",
+        "affiliation": []
+      },
+      {
+        "name": "Duarte, Camila",
+        "nameType": "Personal",
+        "givenName": "Camila",
+        "familyName": "Duarte",
+        "affiliation": []
+      },
+      {
+        "name": "Dueñas, Margarita",
+        "nameType": "Personal",
+        "givenName": "Margarita",
+        "familyName": "Dueñas",
+        "affiliation": []
+      },
+      {
+        "name": "Eberhardt, Ursula",
+        "nameType": "Personal",
+        "givenName": "Ursula",
+        "familyName": "Eberhardt",
+        "affiliation": []
+      },
+      {
+        "name": "Friberg, Hanna",
+        "nameType": "Personal",
+        "givenName": "Hanna",
+        "familyName": "Friberg",
+        "affiliation": []
+      },
+      {
+        "name": "Frøslev, Tobias G.",
+        "nameType": "Personal",
+        "givenName": "Tobias G.",
+        "familyName": "Frøslev",
+        "affiliation": []
+      },
+      {
+        "name": "Garnica, Sigisfredo",
+        "nameType": "Personal",
+        "givenName": "Sigisfredo",
+        "familyName": "Garnica",
+        "affiliation": []
+      },
+      {
+        "name": "Geml, József",
+        "nameType": "Personal",
+        "givenName": "József",
+        "familyName": "Geml",
+        "affiliation": []
+      },
+      {
+        "name": "Ghobad-Nejhad, Masoomeh",
+        "nameType": "Personal",
+        "givenName": "Masoomeh",
+        "familyName": "Ghobad-Nejhad",
+        "affiliation": []
+      },
+      {
+        "name": "Grebenc, Tine",
+        "nameType": "Personal",
+        "givenName": "Tine",
+        "familyName": "Grebenc",
+        "affiliation": []
+      },
+      {
+        "name": "Griffith, Gareth W.",
+        "nameType": "Personal",
+        "givenName": "Gareth W.",
+        "familyName": "Griffith",
+        "affiliation": []
+      },
+      {
+        "name": "Hampe, Felix",
+        "nameType": "Personal",
+        "givenName": "Felix",
+        "familyName": "Hampe",
+        "affiliation": []
+      },
+      {
+        "name": "Kennedy, Peter",
+        "nameType": "Personal",
+        "givenName": "Peter",
+        "familyName": "Kennedy",
+        "affiliation": []
+      },
+      {
+        "name": "Khomich, Maryia",
+        "nameType": "Personal",
+        "givenName": "Maryia",
+        "familyName": "Khomich",
+        "affiliation": []
+      },
+      {
+        "name": "Kohout, Petr",
+        "nameType": "Personal",
+        "givenName": "Petr",
+        "familyName": "Kohout",
+        "affiliation": []
+      },
+      {
+        "name": "Kollom, Anu",
+        "nameType": "Personal",
+        "givenName": "Anu",
+        "familyName": "Kollom",
+        "affiliation": []
+      },
+      {
+        "name": "Larsson, Ellen",
+        "nameType": "Personal",
+        "givenName": "Ellen",
+        "familyName": "Larsson",
+        "affiliation": []
+      },
+      {
+        "name": "Laszlo, Irinyi",
+        "nameType": "Personal",
+        "givenName": "Irinyi",
+        "familyName": "Laszlo",
+        "affiliation": []
+      },
+      {
+        "name": "Leavitt, Steven",
+        "nameType": "Personal",
+        "givenName": "Steven",
+        "familyName": "Leavitt",
+        "affiliation": []
+      },
+      {
+        "name": "Liimatainen, Kare",
+        "nameType": "Personal",
+        "givenName": "Kare",
+        "familyName": "Liimatainen",
+        "affiliation": []
+      },
+      {
+        "name": "Lindahl, Björn",
+        "nameType": "Personal",
+        "givenName": "Björn",
+        "familyName": "Lindahl",
+        "affiliation": []
+      },
+      {
+        "name": "Lodge, Deborah J.",
+        "nameType": "Personal",
+        "givenName": "Deborah J.",
+        "familyName": "Lodge",
+        "affiliation": []
+      },
+      {
+        "name": "Lumbsch, Helge Thorsten",
+        "nameType": "Personal",
+        "givenName": "Helge Thorsten",
+        "familyName": "Lumbsch",
+        "affiliation": []
+      },
+      {
+        "name": "Martín Esteban, María Paz",
+        "nameType": "Personal",
+        "givenName": "María Paz",
+        "familyName": "Martín Esteban",
+        "affiliation": []
+      },
+      {
+        "name": "Meyer, Wieland",
+        "nameType": "Personal",
+        "givenName": "Wieland",
+        "familyName": "Meyer",
+        "affiliation": []
+      },
+      {
+        "name": "Miettinen, Otto",
+        "nameType": "Personal",
+        "givenName": "Otto",
+        "familyName": "Miettinen",
+        "affiliation": []
+      },
+      {
+        "name": "Nguyen, Nhu",
+        "nameType": "Personal",
+        "givenName": "Nhu",
+        "familyName": "Nguyen",
+        "affiliation": []
+      },
+      {
+        "name": "Niskanen, Tuula",
+        "nameType": "Personal",
+        "givenName": "Tuula",
+        "familyName": "Niskanen",
+        "affiliation": []
+      },
+      {
+        "name": "Oono, Ryoko",
+        "nameType": "Personal",
+        "givenName": "Ryoko",
+        "familyName": "Oono",
+        "affiliation": []
+      },
+      {
+        "name": "Öpik, Maarja",
+        "nameType": "Personal",
+        "givenName": "Maarja",
+        "familyName": "Öpik",
+        "affiliation": []
+      },
+      {
+        "name": "Ordynets, Alexander",
+        "nameType": "Personal",
+        "givenName": "Alexander",
+        "familyName": "Ordynets",
+        "affiliation": []
+      },
+      {
+        "name": "Pawłowska, Julia",
+        "nameType": "Personal",
+        "givenName": "Julia",
+        "familyName": "Pawłowska",
+        "affiliation": []
+      },
+      {
+        "name": "Peintner, Ursula",
+        "nameType": "Personal",
+        "givenName": "Ursula",
+        "familyName": "Peintner",
+        "affiliation": []
+      },
+      {
+        "name": "Pereira, Olinto Liparini",
+        "nameType": "Personal",
+        "givenName": "Olinto Liparini",
+        "familyName": "Pereira",
+        "affiliation": []
+      },
+      {
+        "name": "Pinho, Danilo Batista",
+        "nameType": "Personal",
+        "givenName": "Danilo Batista",
+        "familyName": "Pinho",
+        "affiliation": []
+      },
+      {
+        "name": "Põldmaa, Kadri",
+        "nameType": "Personal",
+        "givenName": "Kadri",
+        "familyName": "Põldmaa",
+        "affiliation": []
+      },
+      {
+        "name": "Runnel, Kadri",
+        "nameType": "Personal",
+        "givenName": "Kadri",
+        "familyName": "Runnel",
+        "affiliation": []
+      },
+      {
+        "name": "Ryberg, Martin",
+        "nameType": "Personal",
+        "givenName": "Martin",
+        "familyName": "Ryberg",
+        "affiliation": []
+      },
+      {
+        "name": "Saar, Irja",
+        "nameType": "Personal",
+        "givenName": "Irja",
+        "familyName": "Saar",
+        "affiliation": []
+      },
+      {
+        "name": "Sanli, Kemal",
+        "nameType": "Personal",
+        "givenName": "Kemal",
+        "familyName": "Sanli",
+        "affiliation": []
+      },
+      {
+        "name": "Scott, James",
+        "nameType": "Personal",
+        "givenName": "James",
+        "familyName": "Scott",
+        "affiliation": []
+      },
+      {
+        "name": "Spirin, Viacheslav",
+        "nameType": "Personal",
+        "givenName": "Viacheslav",
+        "familyName": "Spirin",
+        "affiliation": []
+      },
+      {
+        "name": "Suija, Ave",
+        "nameType": "Personal",
+        "givenName": "Ave",
+        "familyName": "Suija",
+        "affiliation": []
+      },
+      {
+        "name": "Svantesson, Sten",
+        "nameType": "Personal",
+        "givenName": "Sten",
+        "familyName": "Svantesson",
+        "affiliation": []
+      },
+      {
+        "name": "Tadych, Mariusz",
+        "nameType": "Personal",
+        "givenName": "Mariusz",
+        "familyName": "Tadych",
+        "affiliation": []
+      },
+      {
+        "name": "Takamatsu, Susumu",
+        "nameType": "Personal",
+        "givenName": "Susumu",
+        "familyName": "Takamatsu",
+        "affiliation": []
+      },
+      {
+        "name": "Tamm, Heidi",
+        "nameType": "Personal",
+        "givenName": "Heidi",
+        "familyName": "Tamm",
+        "affiliation": []
+      },
+      {
+        "name": "Taylor, AFS.",
+        "nameType": "Personal",
+        "givenName": "AFS.",
+        "familyName": "Taylor",
+        "affiliation": []
+      },
+      {
+        "name": "Tedersoo, Leho",
+        "nameType": "Personal",
+        "givenName": "Leho",
+        "familyName": "Tedersoo",
+        "affiliation": []
+      },
+      {
+        "name": "Telleria, M.T.",
+        "nameType": "Personal",
+        "givenName": "M.T.",
+        "familyName": "Telleria",
+        "affiliation": []
+      },
+      {
+        "name": "Udayanga, Dhanushka",
+        "nameType": "Personal",
+        "givenName": "Dhanushka",
+        "familyName": "Udayanga",
+        "affiliation": []
+      },
+      {
+        "name": "Unterseher, Martin",
+        "nameType": "Personal",
+        "givenName": "Martin",
+        "familyName": "Unterseher",
+        "affiliation": []
+      },
+      {
+        "name": "Volobuev, Sergey",
+        "nameType": "Personal",
+        "givenName": "Sergey",
+        "familyName": "Volobuev",
+        "affiliation": []
+      },
+      {
+        "name": "Weiss, Michael",
+        "nameType": "Personal",
+        "givenName": "Michael",
+        "familyName": "Weiss",
+        "affiliation": []
+      },
+      {
+        "name": "Wurzbacher, Christian",
+        "nameType": "Personal",
+        "givenName": "Christian",
+        "familyName": "Wurzbacher",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "SH409843.07FU"
+      },
+      {
+        "title": "Gomphales",
+        "titleType": "Subtitle"
+      }
+    ],
+    "publisher": "UNITE Community",
+    "container": {},
+    "publicationYear": 2015,
+    "subjects": [],
+    "contributors": [
+      {
+        "name": "Kessy Abarenkov",
+        "affiliation": []
+      },
+      {
+        "name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden",
+        "affiliation": []
+      }
+    ],
+    "dates": [
+      {
+        "date": "2016-04-22",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2014-10-05",
+        "dateType": "Created"
+      },
+      {
+        "date": "2015",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "eng",
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceType": "Dataset/UNITE Species Hypothesis",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [
+      "application/json"
+    ],
+    "version": null,
+    "rightsList": [
+      {
+        "rights": "Attribution-NonCommercial (CC BY-NC)",
+        "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
+      }
+    ],
+    "descriptions": [
+      {
+        "description": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://plutof.ut.ee/#/datacite/10.15156/BIO/SH409843.07FU",
+    "contentUrl": null,
+    "metadataVersion": 1,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2015-06-05T10:23:18.000Z",
+    "registered": "2015-06-05T10:23:19.000Z",
+    "published": "2015",
+    "updated": "2019-08-02T07:45:28.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "estdoi.bio",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_06.json b/python/tests/files/datacite/datacite_doc_06.json
new file mode 100644
index 00000000..a7f3ee70
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_06.json
@@ -0,0 +1,83 @@
+{
+  "id": "10.16903/ethz-grs-d_006220",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.16903/ethz-grs-d_006220",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.16903/ethz-grs-d_006220",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Crispijn De Passe (Der Ältere) (1564-1637)",
+        "nameType": "Personal",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\""
+      }
+    ],
+    "publisher": "n.a.",
+    "container": {},
+    "publicationYear": 1590,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1590",
+        "dateType": "Available"
+      },
+      {
+        "date": "1590",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork",
+      "resourceTypeGeneral": "InteractiveResource"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [
+      "Blattgrösse: 21.0 x 14.4 x 0.0 cm (beschnitten)",
+      "Kupferstich"
+    ],
+    "version": null,
+    "rightsList": [
+      {
+        "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0"
+      }
+    ],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ExternalInterface&module=collection&objectId=29469&viewType=detailView",
+    "contentUrl": null,
+    "metadataVersion": 1,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2017-12-13T12:03:09.000Z",
+    "registered": "2017-12-13T12:03:09.000Z",
+    "published": "1590",
+    "updated": "2019-08-02T17:20:02.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "ethz.gs",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_07.json b/python/tests/files/datacite/datacite_doc_07.json
new file mode 100644
index 00000000..c70695b6
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_07.json
@@ -0,0 +1,120 @@
+{
+  "id": "10.18462/iir.icr.2015.0926",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.18462/iir.icr.2015.0926",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.18462/iir.icr.2015.0926",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "ROTHUIZEN, E.",
+        "nameType": "Personal",
+        "givenName": "E.",
+        "familyName": "ROTHUIZEN",
+        "affiliation": []
+      },
+      {
+        "name": "ELMEGAARD, B.",
+        "nameType": "Personal",
+        "givenName": "B.",
+        "familyName": "ELMEGAARD",
+        "affiliation": []
+      },
+      {
+        "name": "MARKUSSEN W., B.",
+        "nameType": "Personal",
+        "givenName": "B.",
+        "familyName": "MARKUSSEN W.",
+        "affiliation": []
+      },
+      {
+        "name": "Et Al.",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation."
+      }
+    ],
+    "publisher": "International Institute of Refrigeration (IIR)",
+    "container": {},
+    "publicationYear": 2015,
+    "subjects": [
+      {
+        "subject": "HEAT PUMP"
+      },
+      {
+        "subject": "HOT WATER"
+      },
+      {
+        "subject": "HEAT TRANSFER"
+      },
+      {
+        "subject": "PERFORMANCE"
+      },
+      {
+        "subject": "THERMAL STORAGE"
+      },
+      {
+        "subject": "TANK"
+      },
+      {
+        "subject": "MODEL"
+      }
+    ],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2015",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "eng",
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://www.iifiir.org/clientBookline/service/reference.asp?INSTANCE=EXPLOITATION&OUTPUT=PORTAL&DOCID=IFD_REFDOC_0015008&DOCBASE=IFD_REFDOC_EN&SETLANGUAGE=EN",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": null,
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2016-11-21T13:08:14.000Z",
+    "registered": "2016-11-21T13:08:14.000Z",
+    "published": "2015",
+    "updated": "2019-08-16T18:00:59.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "inist.iif",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_08.json b/python/tests/files/datacite/datacite_doc_08.json
new file mode 100644
index 00000000..e9170788
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_08.json
@@ -0,0 +1,105 @@
+{
+  "id": "10.22004/ag.econ.284864",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.22004/ag.econ.284864",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.22004/ag.econ.284864",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Kajisa, Kei",
+        "nameType": "Personal",
+        "givenName": "Kei",
+        "familyName": "Kajisa",
+        "affiliation": [],
+        "nameIdentifiers": []
+      },
+      {
+        "name": "Kajisa, Kei",
+        "nameType": "Personal",
+        "givenName": "Kei",
+        "familyName": "Kajisa",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India"
+      }
+    ],
+    "publisher": "Unknown",
+    "container": {},
+    "publicationYear": 2017,
+    "subjects": [
+      {
+        "subject": "Land Economics/Use"
+      },
+      {
+        "subject": "irrigation",
+        "subjectScheme": "keyword"
+      },
+      {
+        "subject": "industrialization",
+        "subjectScheme": "keyword"
+      },
+      {
+        "subject": "collective action",
+        "subjectScheme": "keyword"
+      }
+    ],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "eng",
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan’s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://ageconsearch.umn.edu/record/284864",
+    "contentUrl": null,
+    "metadataVersion": 1,
+    "schemaVersion": null,
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-08-24T07:46:47.000Z",
+    "registered": "2019-08-24T07:46:47.000Z",
+    "published": "2017",
+    "updated": "2019-08-25T09:38:33.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "tind.agecon",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_09.json b/python/tests/files/datacite/datacite_doc_09.json
new file mode 100644
index 00000000..d09af545
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_09.json
@@ -0,0 +1,130 @@
+{
+  "id": "10.2314/gbv:880813733",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.2314/gbv:880813733",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.2314/gbv:880813733",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "880813733",
+        "identifierType": "ppn"
+      },
+      {
+        "identifier": "03WKCF3C",
+        "identifierType": "contract"
+      },
+      {
+        "identifier": "01132105",
+        "identifierType": "contract"
+      },
+      {
+        "identifier": "GBV:880813733",
+        "identifierType": "firstid"
+      },
+      {
+        "identifier": "TIBKAT:880813733",
+        "identifierType": "ftx-id"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Kirstaedter, Nils",
+        "nameType": "Personal",
+        "givenName": "Nils",
+        "familyName": "Kirstaedter",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht"
+      },
+      {
+        "title": "Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul)",
+        "titleType": "AlternativeTitle"
+      },
+      {
+        "title": "Direktdiodenlaseranlagen und -systeme (VP3)",
+        "titleType": "AlternativeTitle"
+      }
+    ],
+    "publisher": "[Lumics GmbH]",
+    "container": {},
+    "publicationYear": 2016,
+    "subjects": [
+      {
+        "subject": "Direktdiodenlasersysteme"
+      },
+      {
+        "subject": "Physics",
+        "subjectScheme": "linsearch"
+      }
+    ],
+    "contributors": [
+      {
+        "name": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover",
+        "nameType": "Organizational",
+        "affiliation": [],
+        "contributorType": "HostingInstitution",
+        "nameIdentifiers": []
+      },
+      {
+        "name": "Technische Informationsbibliothek (TIB)",
+        "affiliation": [],
+        "contributorType": "DataManager",
+        "nameIdentifiers": []
+      }
+    ],
+    "dates": [
+      {
+        "date": "2016",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "de",
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "report",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "Report",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [
+      "1 Online-Ressource (10 Seiten, 1,40 MB)"
+    ],
+    "formats": [
+      "application/pdf"
+    ],
+    "version": "1.0",
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.tib.eu/suchen/id/TIBKAT:880813733/",
+    "contentUrl": null,
+    "metadataVersion": 9,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2017-02-25T00:00:18.000Z",
+    "registered": "2017-02-25T00:00:19.000Z",
+    "published": "2016",
+    "updated": "2019-08-03T05:53:51.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "tib.tib",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_10.json b/python/tests/files/datacite/datacite_doc_10.json
new file mode 100644
index 00000000..d40fc272
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_10.json
@@ -0,0 +1,83 @@
+{
+  "id": "10.25549/wpacards-m6171",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.25549/wpacards-m6171",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.25549/wpacards-m6171",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Unknown",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "WPA household census for 210 E VERNON, Los Angeles"
+      }
+    ],
+    "publisher": "University of Southern California Digital Library (USC.DL)",
+    "container": {},
+    "publicationYear": 2012,
+    "subjects": [
+      {
+        "subject": "housing areas"
+      },
+      {
+        "subject": "Dwellings"
+      }
+    ],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2012",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "eng",
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://digitallibrary.usc.edu/cdm/ref/collection/p15799coll8/id/2608",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2018-09-09T08:32:09.000Z",
+    "registered": "2018-09-09T08:33:10.000Z",
+    "published": "2012",
+    "updated": "2019-08-02T20:03:32.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "usc.dl",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_11.json b/python/tests/files/datacite/datacite_doc_11.json
new file mode 100644
index 00000000..50fe8363
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_11.json
@@ -0,0 +1,86 @@
+{
+  "id": "10.3932/ethz-a-000055869",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.3932/ethz-a-000055869",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.3932/ethz-a-000055869",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Comet Photo AG (Zürich)",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "N1 bei Safenwil"
+      }
+    ],
+    "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
+    "container": {},
+    "publicationYear": 1965,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1965",
+        "dateType": "Available"
+      },
+      {
+        "date": "1965",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "de",
+    "types": {
+      "ris": "FIGURE",
+      "bibtex": "misc",
+      "citeproc": "graphic",
+      "schemaOrg": "ImageObject",
+      "resourceTypeGeneral": "Image"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [
+      "TIFF-Bild"
+    ],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "Download und Nutzung frei",
+        "descriptionType": "Other"
+      },
+      {
+        "description": "10, N1, Genève, Bern, Zürich, Sankt Gallen, Sankt Margrethen, Strassen, Strassenbau, 2.",
+        "descriptionType": "Other"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://ba.e-pics.ethz.ch/link.jsp?id=44861",
+    "contentUrl": null,
+    "metadataVersion": 6,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-03-04T23:56:42.000Z",
+    "registered": "2019-07-30T13:17:45.000Z",
+    "published": "1965",
+    "updated": "2019-08-02T22:08:26.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "ethz.epics-ba",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_12.json b/python/tests/files/datacite/datacite_doc_12.json
new file mode 100644
index 00000000..31c0f0ca
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_12.json
@@ -0,0 +1,103 @@
+{
+  "id": "10.5167/uzh-171449",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.5167/uzh-171449",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5167/uzh-171449",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Spanias, Charalampos",
+        "nameType": "Personal",
+        "givenName": "Charalampos",
+        "familyName": "Spanias",
+        "affiliation": [],
+        "nameIdentifiers": []
+      },
+      {
+        "name": "Nikolaidis, Pantelis T",
+        "nameType": "Personal",
+        "givenName": "Pantelis T",
+        "familyName": "Nikolaidis",
+        "affiliation": [],
+        "nameIdentifiers": []
+      },
+      {
+        "name": "Rosemann, Thomas",
+        "nameType": "Personal",
+        "givenName": "Thomas",
+        "familyName": "Rosemann",
+        "affiliation": [],
+        "nameIdentifiers": []
+      },
+      {
+        "name": "Knechtle, Beat",
+        "nameType": "Personal",
+        "givenName": "Beat",
+        "familyName": "Knechtle",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review"
+      }
+    ],
+    "publisher": "MDPI Publishing",
+    "container": {},
+    "publicationYear": 2019,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2019-06-14",
+        "dateType": "Available"
+      },
+      {
+        "date": "2019",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "RPRT",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.zora.uzh.ch/id/eprint/171449",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": null,
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-06-27T01:01:35.000Z",
+    "registered": "2019-06-27T01:01:36.000Z",
+    "published": "2019",
+    "updated": "2019-09-26T16:44:24.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "ethz.zora",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_13.json b/python/tests/files/datacite/datacite_doc_13.json
new file mode 100644
index 00000000..ff6eb229
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_13.json
@@ -0,0 +1,86 @@
+{
+  "id": "10.5169/seals-314104",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.5169/seals-314104",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5169/seals-314104",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "O.M.",
+        "affiliation": []
+      },
+      {
+        "name": "Hiltbrunner, Hermann",
+        "nameType": "Personal",
+        "givenName": "Hermann",
+        "familyName": "Hiltbrunner",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "[Müssen wir des Glücks uns schämen?]"
+      }
+    ],
+    "publisher": "Buchdruckerei Büchler & Co.",
+    "container": {},
+    "publicationYear": 1940,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "1940-10-05",
+        "dateType": "Available"
+      },
+      {
+        "date": "1940",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "JOUR",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle",
+      "resourceType": "Journal Article",
+      "resourceTypeGeneral": "Text"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [
+      "text/html",
+      "application/pdf"
+    ],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.e-periodica.ch/digbib/view?pid=sle-001:1940-1941:45::13",
+    "contentUrl": null,
+    "metadataVersion": 17,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2013-03-22T14:02:08.000Z",
+    "registered": "2013-03-22T13:58:11.000Z",
+    "published": "1940",
+    "updated": "2019-08-02T02:22:55.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "ethz.seals",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_14.json b/python/tests/files/datacite/datacite_doc_14.json
new file mode 100644
index 00000000..b1e1ebf2
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_14.json
@@ -0,0 +1,166 @@
+{
+  "id": "10.5517/cc7gns3",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.5517/cc7gns3",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5517/cc7gns3",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "222635",
+        "identifierType": "CCDC"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Stulz, E.",
+        "nameType": "Personal",
+        "givenName": "E.",
+        "familyName": "Stulz",
+        "affiliation": []
+      },
+      {
+        "name": "Scott, S.M.",
+        "nameType": "Personal",
+        "givenName": "S.M.",
+        "familyName": "Scott",
+        "affiliation": []
+      },
+      {
+        "name": "Ng, Yiu-Fai",
+        "nameType": "Personal",
+        "givenName": "Yiu-Fai",
+        "familyName": "Ng",
+        "affiliation": []
+      },
+      {
+        "name": "Bond, A.D.",
+        "nameType": "Personal",
+        "givenName": "A.D.",
+        "familyName": "Bond",
+        "affiliation": []
+      },
+      {
+        "name": "Teat, S.J.",
+        "nameType": "Personal",
+        "givenName": "S.J.",
+        "familyName": "Teat",
+        "affiliation": []
+      },
+      {
+        "name": "Darling, S.L.",
+        "nameType": "Personal",
+        "givenName": "S.L.",
+        "familyName": "Darling",
+        "affiliation": []
+      },
+      {
+        "name": "Feeder, N.",
+        "nameType": "Personal",
+        "givenName": "N.",
+        "familyName": "Feeder",
+        "affiliation": []
+      },
+      {
+        "name": "Sanders, J.K.M.",
+        "nameType": "Personal",
+        "givenName": "J.K.M.",
+        "familyName": "Sanders",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "CCDC 222635: Experimental Crystal Structure Determination"
+      }
+    ],
+    "publisher": "Cambridge Crystallographic Data Centre",
+    "container": {},
+    "publicationYear": 2004,
+    "subjects": [
+      {
+        "subject": "Crystal Structure"
+      },
+      {
+        "subject": "Experimental 3D Coordinates"
+      },
+      {
+        "subject": "Crystal System"
+      },
+      {
+        "subject": "Space Group"
+      },
+      {
+        "subject": "Cell Parameters"
+      },
+      {
+        "subject": "Crystallography"
+      },
+      {
+        "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
+      }
+    ],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2004",
+        "dateType": "Issued"
+      }
+    ],
+    "language": "eng",
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [
+      {
+        "relationType": "IsSupplementTo",
+        "relatedIdentifier": "10.1021/ic034699w",
+        "relatedIdentifierType": "DOI"
+      }
+    ],
+    "sizes": [],
+    "formats": [
+      "CIF"
+    ],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "Related Article: E.Stulz, S.M.Scott, Yiu-Fai Ng, A.D.Bond, S.J.Teat, S.L.Darling, N.Feeder, J.K.M.Sanders|2003|Inorg.Chem.|42|6564|doi:10.1021/ic034699w",
+        "descriptionType": "Other"
+      },
+      {
+        "description": "An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/cc7gns3&sid=DataCite",
+    "contentUrl": null,
+    "metadataVersion": 2,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2014-03-18T07:28:28.000Z",
+    "registered": "2014-03-18T07:28:29.000Z",
+    "published": "2004",
+    "updated": "2019-08-02T03:38:32.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "ccdc.csd",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_15.json b/python/tests/files/datacite/datacite_doc_15.json
new file mode 100644
index 00000000..5b4ee8ec
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_15.json
@@ -0,0 +1,79 @@
+{
+  "id": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "https://pasta.lternet.edu/package/eml/knb-lter-vcr/102/16",
+        "identifierType": "URL"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Richardson, David",
+        "nameType": "Personal",
+        "givenName": "David",
+        "familyName": "Richardson",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997"
+      }
+    ],
+    "publisher": "Environmental Data Initiative",
+    "container": {},
+    "publicationYear": 2017,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceType": "dataPackage",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://portal.lternet.edu/nis/mapbrowse?packageid=knb-lter-vcr.102.16",
+    "contentUrl": null,
+    "metadataVersion": 1,
+    "schemaVersion": "http://datacite.org/schema/kernel-2.2",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2017-02-01T18:20:04.000Z",
+    "registered": "2017-02-01T18:20:05.000Z",
+    "published": "2017",
+    "updated": "2019-08-02T14:16:49.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "edi.edi",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_16.json b/python/tests/files/datacite/datacite_doc_16.json
new file mode 100644
index 00000000..5af7fbe1
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_16.json
@@ -0,0 +1,80 @@
+{
+  "id": "10.6084/m9.figshare.1282478",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.6084/m9.figshare.1282478",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.6084/m9.figshare.1282478",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Sochi, Taha",
+        "nameType": "Personal",
+        "givenName": "Taha",
+        "familyName": "Sochi",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Testing the Connectivity of Networks"
+      }
+    ],
+    "publisher": "Figshare",
+    "container": {},
+    "publicationYear": 2014,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2014",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceType": "Paper",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [
+      {
+        "rights": "CC-BY",
+        "rightsUri": "http://creativecommons.org/licenses/by/3.0/us"
+      }
+    ],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "http://figshare.com/articles/Testing_the_Connectivity_of_Networks/1282478",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2014-12-31T15:38:16.000Z",
+    "registered": "2014-12-31T15:38:18.000Z",
+    "published": "2014",
+    "updated": "2019-08-02T04:52:11.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "figshare.ars",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_17.json b/python/tests/files/datacite/datacite_doc_17.json
new file mode 100644
index 00000000..f1363a61
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_17.json
@@ -0,0 +1,72 @@
+{
+  "id": "10.7910/dvn/tsqfwc/yytj22",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.7910/dvn/tsqfwc/yytj22",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.7910/dvn/tsqfwc/yytj22",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "Di Giovanna, Antonino Paolo (University Of Florence)",
+        "nameType": "Personal",
+        "affiliation": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "gel_BSA-FITC_Markov_segmntation0343.tif"
+      }
+    ],
+    "publisher": "Harvard Dataverse",
+    "container": {},
+    "publicationYear": 2018,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2018",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "DATA",
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "schemaOrg": "Dataset",
+      "resourceTypeGeneral": "Dataset"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TSQFWC/YYTJ22",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "source": "mds",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2018-08-22T17:36:10.000Z",
+    "registered": "2018-08-22T17:37:30.000Z",
+    "published": "2018",
+    "updated": "2019-08-02T19:43:20.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "gdcc.harvard-dv",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_18.json b/python/tests/files/datacite/datacite_doc_18.json
new file mode 100644
index 00000000..f6bc81a6
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_18.json
@@ -0,0 +1,79 @@
+{
+  "id": "10.7916/d81z522m",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.7916/d81z522m",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.7916/d81z522m",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "(:Unav)",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064"
+      }
+    ],
+    "publisher": "Columbia University",
+    "container": {},
+    "publicationYear": 2017,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2017-08-21",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://dlc.library.columbia.edu/lcaaj/cul:k3j9kd52d6",
+    "contentUrl": null,
+    "metadataVersion": 2,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": "ez",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2017-11-29T02:15:31.000Z",
+    "registered": "2017-11-29T02:15:32.000Z",
+    "published": "2017",
+    "updated": "2019-08-04T13:17:58.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "cul.columbia",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_19.json b/python/tests/files/datacite/datacite_doc_19.json
new file mode 100644
index 00000000..c0bc25ba
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_19.json
@@ -0,0 +1,79 @@
+{
+  "id": "10.7916/d86x0cg1",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.7916/d86x0cg1",
+        "identifierType": "DOI"
+      }
+    ],
+    "creators": [
+      {
+        "name": "(:Unav)",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092"
+      }
+    ],
+    "publisher": "Columbia University",
+    "container": {},
+    "publicationYear": 2017,
+    "subjects": [],
+    "contributors": [],
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://dlc.library.columbia.edu/lcaaj/cul:44j0zpc98s",
+    "contentUrl": null,
+    "metadataVersion": 3,
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "source": "ez",
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2017-11-29T09:29:33.000Z",
+    "registered": "2017-11-29T09:29:34.000Z",
+    "published": "2017",
+    "updated": "2019-08-04T23:43:40.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "cul.columbia",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_20.json b/python/tests/files/datacite/datacite_doc_20.json
new file mode 100644
index 00000000..964e2cbb
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_20.json
@@ -0,0 +1,42 @@
+{
+    "attributes": {
+      "doi": "10.7916/d86x0cg1",
+      "creators": [
+        {
+          "name": "(:Unav)",
+          "affiliation": [],
+          "nameIdentifiers": []
+        }
+      ],
+      "titles": [
+        {
+          "title": "<h1>Eastern questionnaire</h1>"
+        }
+      ],
+      "publicationYear": 2017,
+      "dates": [
+        {
+          "date": "2017-08-24",
+          "dateType": "Created"
+        },
+        {
+          "date": "2019-08-04",
+          "dateType": "Updated"
+        },
+        {
+          "date": "2017",
+          "dateType": "Issued"
+        }
+      ],
+      "language": null,
+      "types": {
+        "ris": "GEN",
+        "bibtex": "misc",
+        "citeproc": "article",
+        "schemaOrg": "CreativeWork"
+      },
+      "isActive": true,
+      "state": "findable"
+    }
+  }
+  
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_doc_21.json b/python/tests/files/datacite/datacite_doc_21.json
new file mode 100644
index 00000000..cae7f40f
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_21.json
@@ -0,0 +1,42 @@
+{
+    "attributes": {
+      "doi": "10.7916/d86x0cg1",
+      "creators": [
+        {
+          "name": "(:Unav)",
+          "affiliation": [],
+          "nameIdentifiers": []
+        }
+      ],
+      "titles": [
+        {
+          "title": "ABC"
+        }
+      ],
+      "publicationYear": 2017,
+      "language": "GERMAN",
+      "types": {
+        "ris": "GEN",
+        "bibtex": "misc",
+        "citeproc": "article",
+        "schemaOrg": "CreativeWork"
+      },
+      "dates": [
+        {
+          "date": "2017-08-24",
+          "dateType": "Created"
+        },
+        {
+          "date": "2019-08-04",
+          "dateType": "Updated"
+        },
+        {
+          "date": "2017",
+          "dateType": "Issued"
+        }
+      ],
+      "isActive": true,
+      "state": "findable"
+    }
+  }
+  
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_doc_22.json b/python/tests/files/datacite/datacite_doc_22.json
new file mode 100644
index 00000000..42448ddf
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_22.json
@@ -0,0 +1,44 @@
+{
+    "attributes": {
+      "doi": "10.7916/d86x0cg1",
+      "creators": [
+        {
+          "name": "Anton Welch",
+          "affiliation": [
+            "Department of pataphysics"
+          ],
+          "nameIdentifiers": []
+        }
+      ],
+      "titles": [
+        {
+          "title": "ABC"
+        }
+      ],
+      "publicationYear": 2017,
+      "language": "GERMAN",
+      "types": {
+        "ris": "GEN",
+        "bibtex": "misc",
+        "citeproc": "article",
+        "schemaOrg": "CreativeWork"
+      },
+      "dates": [
+        {
+          "date": "2017-08-24",
+          "dateType": "Created"
+        },
+        {
+          "date": "2019-08-04",
+          "dateType": "Updated"
+        },
+        {
+          "date": "2017",
+          "dateType": "Issued"
+        }
+      ],
+      "isActive": true,
+      "state": "findable"
+    }
+  }
+
diff --git a/python/tests/files/datacite/datacite_doc_23.json b/python/tests/files/datacite/datacite_doc_23.json
new file mode 100644
index 00000000..1e5bcc3f
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_23.json
@@ -0,0 +1,44 @@
+{
+    "attributes": {
+      "doi": "10.7916/d86x0cg1\u2013xxx",
+      "creators": [
+        {
+          "name": "Anton Welch",
+          "affiliation": [
+            "Department of pataphysics"
+          ],
+          "nameIdentifiers": []
+        }
+      ],
+      "titles": [
+        {
+          "title": "ABC"
+        }
+      ],
+      "publicationYear": 2017,
+      "language": "GERMAN",
+      "types": {
+        "ris": "GEN",
+        "bibtex": "misc",
+        "citeproc": "article",
+        "schemaOrg": "CreativeWork"
+      },
+      "dates": [
+        {
+          "date": "2017-08-24",
+          "dateType": "Created"
+        },
+        {
+          "date": "2019-08-04",
+          "dateType": "Updated"
+        },
+        {
+          "date": "2017",
+          "dateType": "Issued"
+        }
+      ],
+      "isActive": true,
+      "state": "findable"
+    }
+  }
+
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
new file mode 100644
index 00000000..085e23f3
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -0,0 +1,87 @@
+{
+    "extra": {
+        "container_name": "Journal of Chemical Crystallography",
+        "datacite": {
+            "license": [
+                {
+                    "rightsUri": "http://www.springer.com/tdm"
+                }
+            ],
+            "relations": [
+                {
+                    "relationType": "IsPartOf",
+                    "relatedIdentifier": "1074-1542",
+                    "resourceTypeGeneral": "Collection",
+                    "relatedIdentifierType": "ISSN"
+                }
+            ]
+        }
+    },
+    "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N\u2032-(4-nitrophenyl)thiourea",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_date": "2019-05-31",
+    "release_year": 2019,
+    "ext_ids": {
+        "doi": "10.1007/s10870-008-9413-z"
+    },
+    "volume": "38",
+    "issue": "12",
+    "pages": "927-930",
+    "publisher": "Springer Science and Business Media LLC",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Li, Qian-Jin",
+            "given_name": "Qian-Jin",
+            "surname": "Li",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Yang, Chun-Long",
+            "given_name": "Chun-Long",
+            "surname": "Yang",
+            "role": "author"
+        }
+    ],
+    "refs": [
+        {
+            "index": 0,
+            "extra": {
+                "doi": "10.1016/j.bmcl.2005.09.033"
+            }
+        },
+        {
+            "index": 1,
+            "extra": {
+                "doi": "10.1016/s0022-1139(02)00330-5"
+            }
+        },
+        {
+            "index": 2,
+            "extra": {
+                "doi": "10.1016/s0010-8545(01)00337-x"
+            }
+        },
+        {
+            "index": 3,
+            "extra": {
+                "doi": "10.1016/j.tetlet.2005.06.135"
+            }
+        },
+        {
+            "index": 4,
+            "extra": {
+                "doi": "10.1039/p298700000s1"
+            }
+        },
+        {
+            "index": 5,
+            "extra": {
+                "doi": "10.1002/anie.199515551"
+            }
+        }
+    ],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json
new file mode 100644
index 00000000..f8c6b930
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_01.json
@@ -0,0 +1,32 @@
+{
+    "extra": {
+        "datacite": {
+            "license": [
+                {
+                    "lang": "de",
+                    "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html"
+                }
+            ]
+        }
+    },
+    "title": "Ferdinand Gaillard, [1]: n\u00e9 \u00e0 Paris le 16 janvier 1834, mort \u00e0 Paris le 19 janvier 1887",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_year": 1887,
+    "ext_ids": {
+        "doi": "10.11588/diglit.25558.39"
+    },
+    "publisher": "University Library Heidelberg",
+    "language": "fr",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Dargenty, G.",
+            "given_name": "G.",
+            "surname": "Dargenty",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json
new file mode 100644
index 00000000..f8b85f38
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_02.json
@@ -0,0 +1,36 @@
+{
+    "extra": {
+        "datacite": {
+            "license": [
+                {
+                    "lang": "de",
+                    "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/"
+                },
+                {
+                    "lang": "en",
+                    "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/"
+                }
+            ]
+        }
+    },
+    "title": "Solinger Schwertschmiede-Familien, [4]",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_year": 1897,
+    "ext_ids": {
+        "doi": "10.11588/diglit.37715.57"
+    },
+    "publisher": "University Library Heidelberg",
+    "language": "de",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Weyersberg, Albert",
+            "given_name": "Albert",
+            "surname": "Weyersberg",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_03.json b/python/tests/files/datacite/datacite_result_03.json
new file mode 100644
index 00000000..3e3c2bd5
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_03.json
@@ -0,0 +1,19 @@
+{
+    "extra": {},
+    "title": "midterm ah30903",
+    "release_type": "article",
+    "release_year": 2016,
+    "ext_ids": {
+        "doi": "10.13140/rg.2.2.30434.53446"
+    },
+    "language": "ms",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Mastura Yahya",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
new file mode 100644
index 00000000..7ca70d6c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -0,0 +1,28 @@
+{
+    "extra": {},
+    "title": "On chain maps inducing isomorphisms in homology",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_year": 1973,
+    "ext_ids": {
+        "doi": "10.14288/1.0080520"
+    },
+    "publisher": "University of British Columbia",
+    "language": "en",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Nicollerat, Marc Andre",
+            "given_name": "Marc Andre",
+            "surname": "Nicollerat",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [
+        {
+            "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) \u2192 K(I) and a natural transformation [formula omitted]  (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
+            "mimetype": "text/plain"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
new file mode 100644
index 00000000..e61769de
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -0,0 +1,530 @@
+{
+    "extra": {
+        "datacite": {
+            "license": [
+                {
+                    "rights": "Attribution-NonCommercial (CC BY-NC)",
+                    "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
+                }
+            ]
+        }
+    },
+    "title": "SH409843.07FU",
+    "subtitle": "Gomphales",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_date": "2014-10-05",
+    "release_year": 2014,
+    "ext_ids": {
+        "doi": "10.15156/bio/sh409843.07fu"
+    },
+    "publisher": "UNITE Community",
+    "language": "en",
+    "license_slug": "CC-BY-NC",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "K\u00f5ljalg, Urmas",
+            "given_name": "Urmas",
+            "surname": "K\u00f5ljalg",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Abarenkov, Kessy",
+            "given_name": "Kessy",
+            "surname": "Abarenkov",
+            "role": "author"
+        },
+        {
+            "index": 2,
+            "raw_name": "Nilsson, R. Henrik",
+            "given_name": "R. Henrik",
+            "surname": "Nilsson",
+            "role": "author"
+        },
+        {
+            "index": 3,
+            "raw_name": "Larsson, Karl-Henrik",
+            "given_name": "Karl-Henrik",
+            "surname": "Larsson",
+            "role": "author"
+        },
+        {
+            "index": 4,
+            "raw_name": "Aas, Anders Bj\u00f8rnsgard",
+            "given_name": "Anders Bj\u00f8rnsgard",
+            "surname": "Aas",
+            "role": "author"
+        },
+        {
+            "index": 5,
+            "raw_name": "Adams, Rachel",
+            "given_name": "Rachel",
+            "surname": "Adams",
+            "role": "author"
+        },
+        {
+            "index": 6,
+            "raw_name": "Alves, Artur",
+            "given_name": "Artur",
+            "surname": "Alves",
+            "role": "author"
+        },
+        {
+            "index": 7,
+            "raw_name": "Ammirati, Joseph F.",
+            "given_name": "Joseph F.",
+            "surname": "Ammirati",
+            "role": "author"
+        },
+        {
+            "index": 8,
+            "raw_name": "Arnold, A. Elizabeth",
+            "given_name": "A. Elizabeth",
+            "surname": "Arnold",
+            "role": "author"
+        },
+        {
+            "index": 9,
+            "raw_name": "Bahram, Mohammad",
+            "given_name": "Mohammad",
+            "surname": "Bahram",
+            "role": "author"
+        },
+        {
+            "index": 10,
+            "raw_name": "Bengtsson-Palme, Johan",
+            "given_name": "Johan",
+            "surname": "Bengtsson-Palme",
+            "role": "author"
+        },
+        {
+            "index": 11,
+            "raw_name": "Berlin, Anna",
+            "given_name": "Anna",
+            "surname": "Berlin",
+            "role": "author"
+        },
+        {
+            "index": 12,
+            "raw_name": "Botnen, Synn\u00f8ve",
+            "given_name": "Synn\u00f8ve",
+            "surname": "Botnen",
+            "role": "author"
+        },
+        {
+            "index": 13,
+            "raw_name": "Bourlat, Sarah",
+            "given_name": "Sarah",
+            "surname": "Bourlat",
+            "role": "author"
+        },
+        {
+            "index": 14,
+            "raw_name": "Cheeke, Tanya",
+            "given_name": "Tanya",
+            "surname": "Cheeke",
+            "role": "author"
+        },
+        {
+            "index": 15,
+            "raw_name": "Dima, B\u00e1lint",
+            "given_name": "B\u00e1lint",
+            "surname": "Dima",
+            "role": "author"
+        },
+        {
+            "index": 16,
+            "raw_name": "Drenkhan, Rein",
+            "given_name": "Rein",
+            "surname": "Drenkhan",
+            "role": "author"
+        },
+        {
+            "index": 17,
+            "raw_name": "Duarte, Camila",
+            "given_name": "Camila",
+            "surname": "Duarte",
+            "role": "author"
+        },
+        {
+            "index": 18,
+            "raw_name": "Due\u00f1as, Margarita",
+            "given_name": "Margarita",
+            "surname": "Due\u00f1as",
+            "role": "author"
+        },
+        {
+            "index": 19,
+            "raw_name": "Eberhardt, Ursula",
+            "given_name": "Ursula",
+            "surname": "Eberhardt",
+            "role": "author"
+        },
+        {
+            "index": 20,
+            "raw_name": "Friberg, Hanna",
+            "given_name": "Hanna",
+            "surname": "Friberg",
+            "role": "author"
+        },
+        {
+            "index": 21,
+            "raw_name": "Fr\u00f8slev, Tobias G.",
+            "given_name": "Tobias G.",
+            "surname": "Fr\u00f8slev",
+            "role": "author"
+        },
+        {
+            "index": 22,
+            "raw_name": "Garnica, Sigisfredo",
+            "given_name": "Sigisfredo",
+            "surname": "Garnica",
+            "role": "author"
+        },
+        {
+            "index": 23,
+            "raw_name": "Geml, J\u00f3zsef",
+            "given_name": "J\u00f3zsef",
+            "surname": "Geml",
+            "role": "author"
+        },
+        {
+            "index": 24,
+            "raw_name": "Ghobad-Nejhad, Masoomeh",
+            "given_name": "Masoomeh",
+            "surname": "Ghobad-Nejhad",
+            "role": "author"
+        },
+        {
+            "index": 25,
+            "raw_name": "Grebenc, Tine",
+            "given_name": "Tine",
+            "surname": "Grebenc",
+            "role": "author"
+        },
+        {
+            "index": 26,
+            "raw_name": "Griffith, Gareth W.",
+            "given_name": "Gareth W.",
+            "surname": "Griffith",
+            "role": "author"
+        },
+        {
+            "index": 27,
+            "raw_name": "Hampe, Felix",
+            "given_name": "Felix",
+            "surname": "Hampe",
+            "role": "author"
+        },
+        {
+            "index": 28,
+            "raw_name": "Kennedy, Peter",
+            "given_name": "Peter",
+            "surname": "Kennedy",
+            "role": "author"
+        },
+        {
+            "index": 29,
+            "raw_name": "Khomich, Maryia",
+            "given_name": "Maryia",
+            "surname": "Khomich",
+            "role": "author"
+        },
+        {
+            "index": 30,
+            "raw_name": "Kohout, Petr",
+            "given_name": "Petr",
+            "surname": "Kohout",
+            "role": "author"
+        },
+        {
+            "index": 31,
+            "raw_name": "Kollom, Anu",
+            "given_name": "Anu",
+            "surname": "Kollom",
+            "role": "author"
+        },
+        {
+            "index": 32,
+            "raw_name": "Larsson, Ellen",
+            "given_name": "Ellen",
+            "surname": "Larsson",
+            "role": "author"
+        },
+        {
+            "index": 33,
+            "raw_name": "Laszlo, Irinyi",
+            "given_name": "Irinyi",
+            "surname": "Laszlo",
+            "role": "author"
+        },
+        {
+            "index": 34,
+            "raw_name": "Leavitt, Steven",
+            "given_name": "Steven",
+            "surname": "Leavitt",
+            "role": "author"
+        },
+        {
+            "index": 35,
+            "raw_name": "Liimatainen, Kare",
+            "given_name": "Kare",
+            "surname": "Liimatainen",
+            "role": "author"
+        },
+        {
+            "index": 36,
+            "raw_name": "Lindahl, Bj\u00f6rn",
+            "given_name": "Bj\u00f6rn",
+            "surname": "Lindahl",
+            "role": "author"
+        },
+        {
+            "index": 37,
+            "raw_name": "Lodge, Deborah J.",
+            "given_name": "Deborah J.",
+            "surname": "Lodge",
+            "role": "author"
+        },
+        {
+            "index": 38,
+            "raw_name": "Lumbsch, Helge Thorsten",
+            "given_name": "Helge Thorsten",
+            "surname": "Lumbsch",
+            "role": "author"
+        },
+        {
+            "index": 39,
+            "raw_name": "Mart\u00edn Esteban, Mar\u00eda Paz",
+            "given_name": "Mar\u00eda Paz",
+            "surname": "Mart\u00edn Esteban",
+            "role": "author"
+        },
+        {
+            "index": 40,
+            "raw_name": "Meyer, Wieland",
+            "given_name": "Wieland",
+            "surname": "Meyer",
+            "role": "author"
+        },
+        {
+            "index": 41,
+            "raw_name": "Miettinen, Otto",
+            "given_name": "Otto",
+            "surname": "Miettinen",
+            "role": "author"
+        },
+        {
+            "index": 42,
+            "raw_name": "Nguyen, Nhu",
+            "given_name": "Nhu",
+            "surname": "Nguyen",
+            "role": "author"
+        },
+        {
+            "index": 43,
+            "raw_name": "Niskanen, Tuula",
+            "given_name": "Tuula",
+            "surname": "Niskanen",
+            "role": "author"
+        },
+        {
+            "index": 44,
+            "raw_name": "Oono, Ryoko",
+            "given_name": "Ryoko",
+            "surname": "Oono",
+            "role": "author"
+        },
+        {
+            "index": 45,
+            "raw_name": "\u00d6pik, Maarja",
+            "given_name": "Maarja",
+            "surname": "\u00d6pik",
+            "role": "author"
+        },
+        {
+            "index": 46,
+            "raw_name": "Ordynets, Alexander",
+            "given_name": "Alexander",
+            "surname": "Ordynets",
+            "role": "author"
+        },
+        {
+            "index": 47,
+            "raw_name": "Paw\u0142owska, Julia",
+            "given_name": "Julia",
+            "surname": "Paw\u0142owska",
+            "role": "author"
+        },
+        {
+            "index": 48,
+            "raw_name": "Peintner, Ursula",
+            "given_name": "Ursula",
+            "surname": "Peintner",
+            "role": "author"
+        },
+        {
+            "index": 49,
+            "raw_name": "Pereira, Olinto Liparini",
+            "given_name": "Olinto Liparini",
+            "surname": "Pereira",
+            "role": "author"
+        },
+        {
+            "index": 50,
+            "raw_name": "Pinho, Danilo Batista",
+            "given_name": "Danilo Batista",
+            "surname": "Pinho",
+            "role": "author"
+        },
+        {
+            "index": 51,
+            "raw_name": "P\u00f5ldmaa, Kadri",
+            "given_name": "Kadri",
+            "surname": "P\u00f5ldmaa",
+            "role": "author"
+        },
+        {
+            "index": 52,
+            "raw_name": "Runnel, Kadri",
+            "given_name": "Kadri",
+            "surname": "Runnel",
+            "role": "author"
+        },
+        {
+            "index": 53,
+            "raw_name": "Ryberg, Martin",
+            "given_name": "Martin",
+            "surname": "Ryberg",
+            "role": "author"
+        },
+        {
+            "index": 54,
+            "raw_name": "Saar, Irja",
+            "given_name": "Irja",
+            "surname": "Saar",
+            "role": "author"
+        },
+        {
+            "index": 55,
+            "raw_name": "Sanli, Kemal",
+            "given_name": "Kemal",
+            "surname": "Sanli",
+            "role": "author"
+        },
+        {
+            "index": 56,
+            "raw_name": "Scott, James",
+            "given_name": "James",
+            "surname": "Scott",
+            "role": "author"
+        },
+        {
+            "index": 57,
+            "raw_name": "Spirin, Viacheslav",
+            "given_name": "Viacheslav",
+            "surname": "Spirin",
+            "role": "author"
+        },
+        {
+            "index": 58,
+            "raw_name": "Suija, Ave",
+            "given_name": "Ave",
+            "surname": "Suija",
+            "role": "author"
+        },
+        {
+            "index": 59,
+            "raw_name": "Svantesson, Sten",
+            "given_name": "Sten",
+            "surname": "Svantesson",
+            "role": "author"
+        },
+        {
+            "index": 60,
+            "raw_name": "Tadych, Mariusz",
+            "given_name": "Mariusz",
+            "surname": "Tadych",
+            "role": "author"
+        },
+        {
+            "index": 61,
+            "raw_name": "Takamatsu, Susumu",
+            "given_name": "Susumu",
+            "surname": "Takamatsu",
+            "role": "author"
+        },
+        {
+            "index": 62,
+            "raw_name": "Tamm, Heidi",
+            "given_name": "Heidi",
+            "surname": "Tamm",
+            "role": "author"
+        },
+        {
+            "index": 63,
+            "raw_name": "Taylor, AFS.",
+            "given_name": "AFS.",
+            "surname": "Taylor",
+            "role": "author"
+        },
+        {
+            "index": 64,
+            "raw_name": "Tedersoo, Leho",
+            "given_name": "Leho",
+            "surname": "Tedersoo",
+            "role": "author"
+        },
+        {
+            "index": 65,
+            "raw_name": "Telleria, M.T.",
+            "given_name": "M.T.",
+            "surname": "Telleria",
+            "role": "author"
+        },
+        {
+            "index": 66,
+            "raw_name": "Udayanga, Dhanushka",
+            "given_name": "Dhanushka",
+            "surname": "Udayanga",
+            "role": "author"
+        },
+        {
+            "index": 67,
+            "raw_name": "Unterseher, Martin",
+            "given_name": "Martin",
+            "surname": "Unterseher",
+            "role": "author"
+        },
+        {
+            "index": 68,
+            "raw_name": "Volobuev, Sergey",
+            "given_name": "Sergey",
+            "surname": "Volobuev",
+            "role": "author"
+        },
+        {
+            "index": 69,
+            "raw_name": "Weiss, Michael",
+            "given_name": "Michael",
+            "surname": "Weiss",
+            "role": "author"
+        },
+        {
+            "index": 70,
+            "raw_name": "Wurzbacher, Christian",
+            "given_name": "Christian",
+            "surname": "Wurzbacher",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [
+        {
+            "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+            "mimetype": "text/plain"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_06.json b/python/tests/files/datacite/datacite_result_06.json
new file mode 100644
index 00000000..61f2549d
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_06.json
@@ -0,0 +1,26 @@
+{
+    "extra": {
+        "datacite": {
+            "license": [
+                {
+                    "rights": "ETH-Bibliothek Z\u00fcrich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0"
+                }
+            ]
+        }
+    },
+    "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"",
+    "release_type": "article",
+    "release_year": 1590,
+    "ext_ids": {
+        "doi": "10.16903/ethz-grs-d_006220"
+    },
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Crispijn De Passe (Der \u00c4ltere) (1564-1637)",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
new file mode 100644
index 00000000..324bb663
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -0,0 +1,73 @@
+{
+    "extra": {
+        "datacite": {
+            "subjects": [
+                {
+                    "subject": "HEAT PUMP"
+                },
+                {
+                    "subject": "HOT WATER"
+                },
+                {
+                    "subject": "HEAT TRANSFER"
+                },
+                {
+                    "subject": "PERFORMANCE"
+                },
+                {
+                    "subject": "THERMAL STORAGE"
+                },
+                {
+                    "subject": "TANK"
+                },
+                {
+                    "subject": "MODEL"
+                }
+            ]
+        }
+    },
+    "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation.",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2015,
+    "ext_ids": {
+        "doi": "10.18462/iir.icr.2015.0926"
+    },
+    "publisher": "International Institute of Refrigeration (IIR)",
+    "language": "en",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "ROTHUIZEN, E.",
+            "given_name": "E.",
+            "surname": "ROTHUIZEN",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "ELMEGAARD, B.",
+            "given_name": "B.",
+            "surname": "ELMEGAARD",
+            "role": "author"
+        },
+        {
+            "index": 2,
+            "raw_name": "MARKUSSEN W., B.",
+            "given_name": "B.",
+            "surname": "MARKUSSEN W.",
+            "role": "author"
+        },
+        {
+            "index": 3,
+            "raw_name": "Et Al.",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [
+        {
+            "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
+            "mimetype": "text/plain"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
new file mode 100644
index 00000000..281c3679
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -0,0 +1,53 @@
+{
+    "extra": {
+        "datacite": {
+            "subjects": [
+                {
+                    "subject": "Land Economics/Use"
+                },
+                {
+                    "subject": "irrigation",
+                    "subjectScheme": "keyword"
+                },
+                {
+                    "subject": "industrialization",
+                    "subjectScheme": "keyword"
+                },
+                {
+                    "subject": "collective action",
+                    "subjectScheme": "keyword"
+                }
+            ]
+        }
+    },
+    "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India",
+    "release_type": "article-journal",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.22004/ag.econ.284864"
+    },
+    "language": "en",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Kajisa, Kei",
+            "given_name": "Kei",
+            "surname": "Kajisa",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Kajisa, Kei",
+            "given_name": "Kei",
+            "surname": "Kajisa",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [
+        {
+            "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+            "mimetype": "text/plain"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
new file mode 100644
index 00000000..01f92f85
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -0,0 +1,35 @@
+{
+    "extra": {
+        "datacite": {
+            "subjects": [
+                {
+                    "subject": "Direktdiodenlasersysteme"
+                },
+                {
+                    "subject": "Physics",
+                    "subjectScheme": "linsearch"
+                }
+            ]
+        }
+    },
+    "title": "BrightLas : TP3.3. Module f\u00fcr Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im F\u00f6rderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht",
+    "release_type": "report",
+    "release_stage": "published",
+    "release_year": 2016,
+    "ext_ids": {
+        "doi": "10.2314/gbv:880813733"
+    },
+    "publisher": "[Lumics GmbH]",
+    "language": "de",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Kirstaedter, Nils",
+            "given_name": "Nils",
+            "surname": "Kirstaedter",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json
new file mode 100644
index 00000000..325facf7
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_10.json
@@ -0,0 +1,32 @@
+{
+    "extra": {
+        "datacite": {
+            "subjects": [
+                {
+                    "subject": "housing areas"
+                },
+                {
+                    "subject": "Dwellings"
+                }
+            ]
+        }
+    },
+    "title": "WPA household census for 210 E VERNON, Los Angeles",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2012,
+    "ext_ids": {
+        "doi": "10.25549/wpacards-m6171"
+    },
+    "publisher": "University of Southern California Digital Library (USC.DL)",
+    "language": "en",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Unknown",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json
new file mode 100644
index 00000000..037c5ac2
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_11.json
@@ -0,0 +1,21 @@
+{
+    "extra": {},
+    "title": "N1 bei Safenwil",
+    "release_type": "graphic",
+    "release_stage": "published",
+    "release_year": 1965,
+    "ext_ids": {
+        "doi": "10.3932/ethz-a-000055869"
+    },
+    "publisher": "ETH-Bibliothek Z\u00fcrich, Bildarchiv",
+    "language": "de",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Comet Photo AG (Z\u00fcrich)",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
new file mode 100644
index 00000000..6b6cad4a
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -0,0 +1,44 @@
+{
+    "extra": {},
+    "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_date": "2019-06-14",
+    "release_year": 2019,
+    "ext_ids": {
+        "doi": "10.5167/uzh-171449"
+    },
+    "publisher": "MDPI Publishing",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Spanias, Charalampos",
+            "given_name": "Charalampos",
+            "surname": "Spanias",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Nikolaidis, Pantelis T",
+            "given_name": "Pantelis T",
+            "surname": "Nikolaidis",
+            "role": "author"
+        },
+        {
+            "index": 2,
+            "raw_name": "Rosemann, Thomas",
+            "given_name": "Thomas",
+            "surname": "Rosemann",
+            "role": "author"
+        },
+        {
+            "index": 3,
+            "raw_name": "Knechtle, Beat",
+            "given_name": "Beat",
+            "surname": "Knechtle",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
new file mode 100644
index 00000000..3da3816d
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -0,0 +1,28 @@
+{
+    "extra": {},
+    "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]",
+    "release_type": "article-journal",
+    "release_stage": "published",
+    "release_date": "1940-10-05",
+    "release_year": 1940,
+    "ext_ids": {
+        "doi": "10.5169/seals-314104"
+    },
+    "publisher": "Buchdruckerei B\u00fcchler & Co.",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "O.M.",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Hiltbrunner, Hermann",
+            "given_name": "Hermann",
+            "surname": "Hiltbrunner",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
new file mode 100644
index 00000000..94c00472
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -0,0 +1,110 @@
+{
+    "extra": {
+        "datacite": {
+            "subjects": [
+                {
+                    "subject": "Crystal Structure"
+                },
+                {
+                    "subject": "Experimental 3D Coordinates"
+                },
+                {
+                    "subject": "Crystal System"
+                },
+                {
+                    "subject": "Space Group"
+                },
+                {
+                    "subject": "Cell Parameters"
+                },
+                {
+                    "subject": "Crystallography"
+                },
+                {
+                    "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
+                }
+            ],
+            "relations": [
+                {
+                    "relationType": "IsSupplementTo",
+                    "relatedIdentifier": "10.1021/ic034699w",
+                    "relatedIdentifierType": "DOI"
+                }
+            ]
+        }
+    },
+    "title": "CCDC 222635: Experimental Crystal Structure Determination",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2004,
+    "ext_ids": {
+        "doi": "10.5517/cc7gns3"
+    },
+    "publisher": "Cambridge Crystallographic Data Centre",
+    "language": "en",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Stulz, E.",
+            "given_name": "E.",
+            "surname": "Stulz",
+            "role": "author"
+        },
+        {
+            "index": 1,
+            "raw_name": "Scott, S.M.",
+            "given_name": "S.M.",
+            "surname": "Scott",
+            "role": "author"
+        },
+        {
+            "index": 2,
+            "raw_name": "Ng, Yiu-Fai",
+            "given_name": "Yiu-Fai",
+            "surname": "Ng",
+            "role": "author"
+        },
+        {
+            "index": 3,
+            "raw_name": "Bond, A.D.",
+            "given_name": "A.D.",
+            "surname": "Bond",
+            "role": "author"
+        },
+        {
+            "index": 4,
+            "raw_name": "Teat, S.J.",
+            "given_name": "S.J.",
+            "surname": "Teat",
+            "role": "author"
+        },
+        {
+            "index": 5,
+            "raw_name": "Darling, S.L.",
+            "given_name": "S.L.",
+            "surname": "Darling",
+            "role": "author"
+        },
+        {
+            "index": 6,
+            "raw_name": "Feeder, N.",
+            "given_name": "N.",
+            "surname": "Feeder",
+            "role": "author"
+        },
+        {
+            "index": 7,
+            "raw_name": "Sanders, J.K.M.",
+            "given_name": "J.K.M.",
+            "surname": "Sanders",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [
+        {
+            "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+            "mimetype": "text/plain"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json
new file mode 100644
index 00000000..0614f6ba
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_15.json
@@ -0,0 +1,22 @@
+{
+    "extra": {},
+    "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28"
+    },
+    "publisher": "Environmental Data Initiative",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Richardson, David",
+            "given_name": "David",
+            "surname": "Richardson",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
new file mode 100644
index 00000000..1d861cf6
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -0,0 +1,31 @@
+{
+    "extra": {
+        "datacite": {
+            "license": [
+                {
+                    "rights": "CC-BY",
+                    "rightsUri": "http://creativecommons.org/licenses/by/3.0/us"
+                }
+            ]
+        }
+    },
+    "title": "Testing the Connectivity of Networks",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2014,
+    "ext_ids": {
+        "doi": "10.6084/m9.figshare.1282478"
+    },
+    "publisher": "Figshare",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Sochi, Taha",
+            "given_name": "Taha",
+            "surname": "Sochi",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_17.json b/python/tests/files/datacite/datacite_result_17.json
new file mode 100644
index 00000000..0852a09e
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_17.json
@@ -0,0 +1,20 @@
+{
+    "extra": {},
+    "title": "gel_BSA-FITC_Markov_segmntation0343.tif",
+    "release_type": "dataset",
+    "release_stage": "published",
+    "release_year": 2018,
+    "ext_ids": {
+        "doi": "10.7910/dvn/tsqfwc/yytj22"
+    },
+    "publisher": "Harvard Dataverse",
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Di Giovanna, Antonino Paolo (University Of Florence)",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
new file mode 100644
index 00000000..12ab39fe
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -0,0 +1,15 @@
+{
+    "extra": {},
+    "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-21",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d81z522m"
+    },
+    "publisher": "Columbia University",
+    "contribs": [],
+    "refs": [],
+    "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
new file mode 100644
index 00000000..1505db92
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -0,0 +1,15 @@
+{
+    "extra": {},
+    "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1"
+    },
+    "publisher": "Columbia University",
+    "contribs": [],
+    "refs": [],
+    "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
new file mode 100644
index 00000000..1868eede
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -0,0 +1,14 @@
+{
+    "extra": {},
+    "title": "<h1>Eastern questionnaire</h1>",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1"
+    },
+    "contribs": [],
+    "refs": [],
+    "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
new file mode 100644
index 00000000..9214065a
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -0,0 +1,15 @@
+{
+    "extra": {},
+    "title": "ABC",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1"
+    },
+    "contribs": [],
+    "refs": [],
+    "abstracts": [],
+    "language": "de"
+}
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
new file mode 100644
index 00000000..e9939e09
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -0,0 +1,22 @@
+{
+    "extra": {},
+    "title": "ABC",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1"
+    },
+    "contribs": [
+        {
+            "raw_affiliation": "Department of pataphysics",
+            "index": 0,
+            "raw_name": "Anton Welch",
+            "role": "author"
+        }
+    ],
+    "refs": [],
+    "abstracts": [],
+    "language": "de"
+}
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
new file mode 100644
index 00000000..2bf66eae
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -0,0 +1,22 @@
+{
+    "extra": {},
+    "title": "ABC",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1-xxx"
+    },
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Anton Welch",
+            "role": "author",
+            "raw_affiliation": "Department of pataphysics"
+        }
+    ],
+    "refs": [],
+    "abstracts": [],
+    "language": "de"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index bc47a185..cdc165d7 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -7,7 +7,8 @@ import datetime
 import pytest
 import gzip
 from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates
+from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi
+from fatcat_tools.transforms import entity_to_dict
 from fixtures import api
 import json
 
@@ -270,3 +271,26 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.contribs[0].given_name == None
         assert r.contribs[0].surname == None
         assert len(r.refs) == 0
+
+def test_clean_doi():  # checks clean_doi on one DOI containing U+2013 and on one plain string
+    assert clean_doi("10.25513/1812-3996.2017.1.34\u201342") == "10.25513/1812-3996.2017.1.34-42"  # U+2013 (en dash) is expected to become an ASCII hyphen
+    assert "123" == clean_doi("123")  # a value without such characters is expected back unchanged
+
+def test_datacite_conversions(datacite_importer):
+    """
+    Parse each datacite_doc_NN.json fixture with parse_record and compare its entity_to_dict()
+    output with the matching datacite_result_NN.json. The fixture count (24) is hardcoded for now.
+    """
+    datacite_importer.debug = True
+    for i in range(24):  # fixtures are numbered 00..23; NOTE(review): bump when adding fixtures
+        src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)  # relative path; presumably run from python/ - confirm
+        dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
+        print('testing mapping from {} => {}'.format(src, dst))
+        with open(src, 'r') as f:
+            re = datacite_importer.parse_record(json.load(f))  # NOTE(review): local name `re` would shadow the stdlib module if imported here
+            result = entity_to_dict(re)
+        with open(dst, 'r') as f:
+           expected = json.loads(f.read())
+
+        assert result == expected
+
-- 
cgit v1.2.3


From cb223fccb64500a8e134b9ec721c8a08b1a60f19 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 18:10:55 +0100
Subject: datacite: add two more skippable tokens

---
 python/fatcat_tools/importers/datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 19c71d24..a03587c0 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -328,7 +328,7 @@ class DataciteImporter(EntityImporter):
                 if raw_affiliation == '':
                     continue
 
-                if name in ('(:Unav)', 'NA'):
+                if name in ('(:Unav)', 'NA', 'NN', '(:Null)'):
                     continue
 
                 contribs.append(
-- 
cgit v1.2.3


From be43049db0da2df4343bd5e1392d6c5201fc67d0 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 18:11:35 +0100
Subject: datacite: address raw_name index form comment

> The convention for display_name and raw_name is to be how the name
would normally be printed, not in index form (surname comma given_name).
So we might need to un-encode names like "Tricart, Pierre".

Use an additional `index_form_to_display_name` function to convert index
form to display form, heuristically.
---
 python/fatcat_tools/importers/datacite.py          |  43 +++++++
 .../tests/files/datacite/datacite_result_00.json   |   4 +-
 .../tests/files/datacite/datacite_result_01.json   |   2 +-
 .../tests/files/datacite/datacite_result_02.json   |   2 +-
 .../tests/files/datacite/datacite_result_04.json   |   2 +-
 .../tests/files/datacite/datacite_result_05.json   | 142 ++++++++++-----------
 .../tests/files/datacite/datacite_result_07.json   |   6 +-
 .../tests/files/datacite/datacite_result_08.json   |   4 +-
 .../tests/files/datacite/datacite_result_09.json   |   2 +-
 .../tests/files/datacite/datacite_result_12.json   |   8 +-
 .../tests/files/datacite/datacite_result_13.json   |   2 +-
 .../tests/files/datacite/datacite_result_14.json   |  16 +--
 .../tests/files/datacite/datacite_result_15.json   |   2 +-
 .../tests/files/datacite/datacite_result_16.json   |   2 +-
 .../tests/files/datacite/datacite_result_18.json   |   2 +-
 .../tests/files/datacite/datacite_result_19.json   |   2 +-
 .../tests/files/datacite/datacite_result_20.json   |   2 +-
 .../tests/files/datacite/datacite_result_21.json   |   6 +-
 .../tests/files/datacite/datacite_result_22.json   |  10 +-
 .../tests/files/datacite/datacite_result_23.json   |   6 +-
 python/tests/import_datacite.py                    |  18 ++-
 21 files changed, 171 insertions(+), 112 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a03587c0..bd135569 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -331,6 +331,10 @@ class DataciteImporter(EntityImporter):
                 if name in ('(:Unav)', 'NA', 'NN', '(:Null)'):
                     continue
 
+                # Unpack name, if we have an index form (e.g. 'Razis, Panos A'), into display form ('Panos A Razis').
+                if name:
+                    name = index_form_to_display_name(name)
+
                 contribs.append(
                     fatcat_openapi_client.ReleaseContrib(
                         creator_id=creator_id,
@@ -859,3 +863,42 @@ def clean_doi(doi):
         doi = doi.replace(c, "-")
     return doi
 
+def index_form_to_display_name(s):
+    """
+    Heuristically convert an index form name, like 'Razis, Panos A', into display form, e.g. 'Panos A Razis'.
+    Returns `s` unchanged if it has no comma, more than one comma, any of '(', ')', '*', or contains a stopword.
+    """
+    if ',' not in s:
+        return s
+    skip_on_chars = ['(', ')', '*']  # any of these characters makes the function return the input as-is
+    for char in skip_on_chars:
+        if char in s:
+            return s
+    if s.count(',') > 1:
+        # More than one comma cannot be split into two parts, e.g. "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
+        return s
+    stopwords = [  # substrings, matched case-insensitively below, for which the input is left untouched
+        'Archive',
+        'Collection',
+        'Coordinator',
+        'Department',
+        'Germany',
+        'International',
+        'National',
+        'Netherlands',
+        'Office',
+        'Organisation',
+        'Organization',
+        'Service',
+        'Services',
+        'United States',
+        'University',
+        'Verein',
+        'Volkshochschule',
+    ]
+    for stop in stopwords:
+        if stop.lower() in s.lower():  # substring test, so e.g. 'Services' is already covered by 'Service'
+            return s
+    # Exactly one comma at this point: swap the two parts. NOTE(review): an empty part (e.g. 'Smith,') leaves a stray space.
+    a, b = s.split(',')
+    return '{} {}'.format(b.strip(), a.strip())
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index 085e23f3..a4b28076 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -32,14 +32,14 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Li, Qian-Jin",
+            "raw_name": "Qian-Jin Li",
             "given_name": "Qian-Jin",
             "surname": "Li",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "Yang, Chun-Long",
+            "raw_name": "Chun-Long Yang",
             "given_name": "Chun-Long",
             "surname": "Yang",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json
index f8c6b930..46be2515 100644
--- a/python/tests/files/datacite/datacite_result_01.json
+++ b/python/tests/files/datacite/datacite_result_01.json
@@ -21,7 +21,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Dargenty, G.",
+            "raw_name": "G. Dargenty",
             "given_name": "G.",
             "surname": "Dargenty",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json
index f8b85f38..bdcb4951 100644
--- a/python/tests/files/datacite/datacite_result_02.json
+++ b/python/tests/files/datacite/datacite_result_02.json
@@ -25,7 +25,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Weyersberg, Albert",
+            "raw_name": "Albert Weyersberg",
             "given_name": "Albert",
             "surname": "Weyersberg",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 7ca70d6c..54b19ef9 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -12,7 +12,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Nicollerat, Marc Andre",
+            "raw_name": "Marc Andre Nicollerat",
             "given_name": "Marc Andre",
             "surname": "Nicollerat",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index e61769de..a790c26e 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -24,497 +24,497 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "K\u00f5ljalg, Urmas",
+            "raw_name": "Urmas K\u00f5ljalg",
             "given_name": "Urmas",
             "surname": "K\u00f5ljalg",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "Abarenkov, Kessy",
+            "raw_name": "Kessy Abarenkov",
             "given_name": "Kessy",
             "surname": "Abarenkov",
             "role": "author"
         },
         {
             "index": 2,
-            "raw_name": "Nilsson, R. Henrik",
+            "raw_name": "R. Henrik Nilsson",
             "given_name": "R. Henrik",
             "surname": "Nilsson",
             "role": "author"
         },
         {
             "index": 3,
-            "raw_name": "Larsson, Karl-Henrik",
+            "raw_name": "Karl-Henrik Larsson",
             "given_name": "Karl-Henrik",
             "surname": "Larsson",
             "role": "author"
         },
         {
             "index": 4,
-            "raw_name": "Aas, Anders Bj\u00f8rnsgard",
+            "raw_name": "Anders Bj\u00f8rnsgard Aas",
             "given_name": "Anders Bj\u00f8rnsgard",
             "surname": "Aas",
             "role": "author"
         },
         {
             "index": 5,
-            "raw_name": "Adams, Rachel",
+            "raw_name": "Rachel Adams",
             "given_name": "Rachel",
             "surname": "Adams",
             "role": "author"
         },
         {
             "index": 6,
-            "raw_name": "Alves, Artur",
+            "raw_name": "Artur Alves",
             "given_name": "Artur",
             "surname": "Alves",
             "role": "author"
         },
         {
             "index": 7,
-            "raw_name": "Ammirati, Joseph F.",
+            "raw_name": "Joseph F. Ammirati",
             "given_name": "Joseph F.",
             "surname": "Ammirati",
             "role": "author"
         },
         {
             "index": 8,
-            "raw_name": "Arnold, A. Elizabeth",
+            "raw_name": "A. Elizabeth Arnold",
             "given_name": "A. Elizabeth",
             "surname": "Arnold",
             "role": "author"
         },
         {
             "index": 9,
-            "raw_name": "Bahram, Mohammad",
+            "raw_name": "Mohammad Bahram",
             "given_name": "Mohammad",
             "surname": "Bahram",
             "role": "author"
         },
         {
             "index": 10,
-            "raw_name": "Bengtsson-Palme, Johan",
+            "raw_name": "Johan Bengtsson-Palme",
             "given_name": "Johan",
             "surname": "Bengtsson-Palme",
             "role": "author"
         },
         {
             "index": 11,
-            "raw_name": "Berlin, Anna",
+            "raw_name": "Anna Berlin",
             "given_name": "Anna",
             "surname": "Berlin",
             "role": "author"
         },
         {
             "index": 12,
-            "raw_name": "Botnen, Synn\u00f8ve",
+            "raw_name": "Synn\u00f8ve Botnen",
             "given_name": "Synn\u00f8ve",
             "surname": "Botnen",
             "role": "author"
         },
         {
             "index": 13,
-            "raw_name": "Bourlat, Sarah",
+            "raw_name": "Sarah Bourlat",
             "given_name": "Sarah",
             "surname": "Bourlat",
             "role": "author"
         },
         {
             "index": 14,
-            "raw_name": "Cheeke, Tanya",
+            "raw_name": "Tanya Cheeke",
             "given_name": "Tanya",
             "surname": "Cheeke",
             "role": "author"
         },
         {
             "index": 15,
-            "raw_name": "Dima, B\u00e1lint",
+            "raw_name": "B\u00e1lint Dima",
             "given_name": "B\u00e1lint",
             "surname": "Dima",
             "role": "author"
         },
         {
             "index": 16,
-            "raw_name": "Drenkhan, Rein",
+            "raw_name": "Rein Drenkhan",
             "given_name": "Rein",
             "surname": "Drenkhan",
             "role": "author"
         },
         {
             "index": 17,
-            "raw_name": "Duarte, Camila",
+            "raw_name": "Camila Duarte",
             "given_name": "Camila",
             "surname": "Duarte",
             "role": "author"
         },
         {
             "index": 18,
-            "raw_name": "Due\u00f1as, Margarita",
+            "raw_name": "Margarita Due\u00f1as",
             "given_name": "Margarita",
             "surname": "Due\u00f1as",
             "role": "author"
         },
         {
             "index": 19,
-            "raw_name": "Eberhardt, Ursula",
+            "raw_name": "Ursula Eberhardt",
             "given_name": "Ursula",
             "surname": "Eberhardt",
             "role": "author"
         },
         {
             "index": 20,
-            "raw_name": "Friberg, Hanna",
+            "raw_name": "Hanna Friberg",
             "given_name": "Hanna",
             "surname": "Friberg",
             "role": "author"
         },
         {
             "index": 21,
-            "raw_name": "Fr\u00f8slev, Tobias G.",
+            "raw_name": "Tobias G. Fr\u00f8slev",
             "given_name": "Tobias G.",
             "surname": "Fr\u00f8slev",
             "role": "author"
         },
         {
             "index": 22,
-            "raw_name": "Garnica, Sigisfredo",
+            "raw_name": "Sigisfredo Garnica",
             "given_name": "Sigisfredo",
             "surname": "Garnica",
             "role": "author"
         },
         {
             "index": 23,
-            "raw_name": "Geml, J\u00f3zsef",
+            "raw_name": "J\u00f3zsef Geml",
             "given_name": "J\u00f3zsef",
             "surname": "Geml",
             "role": "author"
         },
         {
             "index": 24,
-            "raw_name": "Ghobad-Nejhad, Masoomeh",
+            "raw_name": "Masoomeh Ghobad-Nejhad",
             "given_name": "Masoomeh",
             "surname": "Ghobad-Nejhad",
             "role": "author"
         },
         {
             "index": 25,
-            "raw_name": "Grebenc, Tine",
+            "raw_name": "Tine Grebenc",
             "given_name": "Tine",
             "surname": "Grebenc",
             "role": "author"
         },
         {
             "index": 26,
-            "raw_name": "Griffith, Gareth W.",
+            "raw_name": "Gareth W. Griffith",
             "given_name": "Gareth W.",
             "surname": "Griffith",
             "role": "author"
         },
         {
             "index": 27,
-            "raw_name": "Hampe, Felix",
+            "raw_name": "Felix Hampe",
             "given_name": "Felix",
             "surname": "Hampe",
             "role": "author"
         },
         {
             "index": 28,
-            "raw_name": "Kennedy, Peter",
+            "raw_name": "Peter Kennedy",
             "given_name": "Peter",
             "surname": "Kennedy",
             "role": "author"
         },
         {
             "index": 29,
-            "raw_name": "Khomich, Maryia",
+            "raw_name": "Maryia Khomich",
             "given_name": "Maryia",
             "surname": "Khomich",
             "role": "author"
         },
         {
             "index": 30,
-            "raw_name": "Kohout, Petr",
+            "raw_name": "Petr Kohout",
             "given_name": "Petr",
             "surname": "Kohout",
             "role": "author"
         },
         {
             "index": 31,
-            "raw_name": "Kollom, Anu",
+            "raw_name": "Anu Kollom",
             "given_name": "Anu",
             "surname": "Kollom",
             "role": "author"
         },
         {
             "index": 32,
-            "raw_name": "Larsson, Ellen",
+            "raw_name": "Ellen Larsson",
             "given_name": "Ellen",
             "surname": "Larsson",
             "role": "author"
         },
         {
             "index": 33,
-            "raw_name": "Laszlo, Irinyi",
+            "raw_name": "Irinyi Laszlo",
             "given_name": "Irinyi",
             "surname": "Laszlo",
             "role": "author"
         },
         {
             "index": 34,
-            "raw_name": "Leavitt, Steven",
+            "raw_name": "Steven Leavitt",
             "given_name": "Steven",
             "surname": "Leavitt",
             "role": "author"
         },
         {
             "index": 35,
-            "raw_name": "Liimatainen, Kare",
+            "raw_name": "Kare Liimatainen",
             "given_name": "Kare",
             "surname": "Liimatainen",
             "role": "author"
         },
         {
             "index": 36,
-            "raw_name": "Lindahl, Bj\u00f6rn",
+            "raw_name": "Bj\u00f6rn Lindahl",
             "given_name": "Bj\u00f6rn",
             "surname": "Lindahl",
             "role": "author"
         },
         {
             "index": 37,
-            "raw_name": "Lodge, Deborah J.",
+            "raw_name": "Deborah J. Lodge",
             "given_name": "Deborah J.",
             "surname": "Lodge",
             "role": "author"
         },
         {
             "index": 38,
-            "raw_name": "Lumbsch, Helge Thorsten",
+            "raw_name": "Helge Thorsten Lumbsch",
             "given_name": "Helge Thorsten",
             "surname": "Lumbsch",
             "role": "author"
         },
         {
             "index": 39,
-            "raw_name": "Mart\u00edn Esteban, Mar\u00eda Paz",
+            "raw_name": "Mar\u00eda Paz Mart\u00edn Esteban",
             "given_name": "Mar\u00eda Paz",
             "surname": "Mart\u00edn Esteban",
             "role": "author"
         },
         {
             "index": 40,
-            "raw_name": "Meyer, Wieland",
+            "raw_name": "Wieland Meyer",
             "given_name": "Wieland",
             "surname": "Meyer",
             "role": "author"
         },
         {
             "index": 41,
-            "raw_name": "Miettinen, Otto",
+            "raw_name": "Otto Miettinen",
             "given_name": "Otto",
             "surname": "Miettinen",
             "role": "author"
         },
         {
             "index": 42,
-            "raw_name": "Nguyen, Nhu",
+            "raw_name": "Nhu Nguyen",
             "given_name": "Nhu",
             "surname": "Nguyen",
             "role": "author"
         },
         {
             "index": 43,
-            "raw_name": "Niskanen, Tuula",
+            "raw_name": "Tuula Niskanen",
             "given_name": "Tuula",
             "surname": "Niskanen",
             "role": "author"
         },
         {
             "index": 44,
-            "raw_name": "Oono, Ryoko",
+            "raw_name": "Ryoko Oono",
             "given_name": "Ryoko",
             "surname": "Oono",
             "role": "author"
         },
         {
             "index": 45,
-            "raw_name": "\u00d6pik, Maarja",
+            "raw_name": "Maarja \u00d6pik",
             "given_name": "Maarja",
             "surname": "\u00d6pik",
             "role": "author"
         },
         {
             "index": 46,
-            "raw_name": "Ordynets, Alexander",
+            "raw_name": "Alexander Ordynets",
             "given_name": "Alexander",
             "surname": "Ordynets",
             "role": "author"
         },
         {
             "index": 47,
-            "raw_name": "Paw\u0142owska, Julia",
+            "raw_name": "Julia Paw\u0142owska",
             "given_name": "Julia",
             "surname": "Paw\u0142owska",
             "role": "author"
         },
         {
             "index": 48,
-            "raw_name": "Peintner, Ursula",
+            "raw_name": "Ursula Peintner",
             "given_name": "Ursula",
             "surname": "Peintner",
             "role": "author"
         },
         {
             "index": 49,
-            "raw_name": "Pereira, Olinto Liparini",
+            "raw_name": "Olinto Liparini Pereira",
             "given_name": "Olinto Liparini",
             "surname": "Pereira",
             "role": "author"
         },
         {
             "index": 50,
-            "raw_name": "Pinho, Danilo Batista",
+            "raw_name": "Danilo Batista Pinho",
             "given_name": "Danilo Batista",
             "surname": "Pinho",
             "role": "author"
         },
         {
             "index": 51,
-            "raw_name": "P\u00f5ldmaa, Kadri",
+            "raw_name": "Kadri P\u00f5ldmaa",
             "given_name": "Kadri",
             "surname": "P\u00f5ldmaa",
             "role": "author"
         },
         {
             "index": 52,
-            "raw_name": "Runnel, Kadri",
+            "raw_name": "Kadri Runnel",
             "given_name": "Kadri",
             "surname": "Runnel",
             "role": "author"
         },
         {
             "index": 53,
-            "raw_name": "Ryberg, Martin",
+            "raw_name": "Martin Ryberg",
             "given_name": "Martin",
             "surname": "Ryberg",
             "role": "author"
         },
         {
             "index": 54,
-            "raw_name": "Saar, Irja",
+            "raw_name": "Irja Saar",
             "given_name": "Irja",
             "surname": "Saar",
             "role": "author"
         },
         {
             "index": 55,
-            "raw_name": "Sanli, Kemal",
+            "raw_name": "Kemal Sanli",
             "given_name": "Kemal",
             "surname": "Sanli",
             "role": "author"
         },
         {
             "index": 56,
-            "raw_name": "Scott, James",
+            "raw_name": "James Scott",
             "given_name": "James",
             "surname": "Scott",
             "role": "author"
         },
         {
             "index": 57,
-            "raw_name": "Spirin, Viacheslav",
+            "raw_name": "Viacheslav Spirin",
             "given_name": "Viacheslav",
             "surname": "Spirin",
             "role": "author"
         },
         {
             "index": 58,
-            "raw_name": "Suija, Ave",
+            "raw_name": "Ave Suija",
             "given_name": "Ave",
             "surname": "Suija",
             "role": "author"
         },
         {
             "index": 59,
-            "raw_name": "Svantesson, Sten",
+            "raw_name": "Sten Svantesson",
             "given_name": "Sten",
             "surname": "Svantesson",
             "role": "author"
         },
         {
             "index": 60,
-            "raw_name": "Tadych, Mariusz",
+            "raw_name": "Mariusz Tadych",
             "given_name": "Mariusz",
             "surname": "Tadych",
             "role": "author"
         },
         {
             "index": 61,
-            "raw_name": "Takamatsu, Susumu",
+            "raw_name": "Susumu Takamatsu",
             "given_name": "Susumu",
             "surname": "Takamatsu",
             "role": "author"
         },
         {
             "index": 62,
-            "raw_name": "Tamm, Heidi",
+            "raw_name": "Heidi Tamm",
             "given_name": "Heidi",
             "surname": "Tamm",
             "role": "author"
         },
         {
             "index": 63,
-            "raw_name": "Taylor, AFS.",
+            "raw_name": "AFS. Taylor",
             "given_name": "AFS.",
             "surname": "Taylor",
             "role": "author"
         },
         {
             "index": 64,
-            "raw_name": "Tedersoo, Leho",
+            "raw_name": "Leho Tedersoo",
             "given_name": "Leho",
             "surname": "Tedersoo",
             "role": "author"
         },
         {
             "index": 65,
-            "raw_name": "Telleria, M.T.",
+            "raw_name": "M.T. Telleria",
             "given_name": "M.T.",
             "surname": "Telleria",
             "role": "author"
         },
         {
             "index": 66,
-            "raw_name": "Udayanga, Dhanushka",
+            "raw_name": "Dhanushka Udayanga",
             "given_name": "Dhanushka",
             "surname": "Udayanga",
             "role": "author"
         },
         {
             "index": 67,
-            "raw_name": "Unterseher, Martin",
+            "raw_name": "Martin Unterseher",
             "given_name": "Martin",
             "surname": "Unterseher",
             "role": "author"
         },
         {
             "index": 68,
-            "raw_name": "Volobuev, Sergey",
+            "raw_name": "Sergey Volobuev",
             "given_name": "Sergey",
             "surname": "Volobuev",
             "role": "author"
         },
         {
             "index": 69,
-            "raw_name": "Weiss, Michael",
+            "raw_name": "Michael Weiss",
             "given_name": "Michael",
             "surname": "Weiss",
             "role": "author"
         },
         {
             "index": 70,
-            "raw_name": "Wurzbacher, Christian",
+            "raw_name": "Christian Wurzbacher",
             "given_name": "Christian",
             "surname": "Wurzbacher",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
index 324bb663..f572263c 100644
--- a/python/tests/files/datacite/datacite_result_07.json
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -38,21 +38,21 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "ROTHUIZEN, E.",
+            "raw_name": "E. ROTHUIZEN",
             "given_name": "E.",
             "surname": "ROTHUIZEN",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "ELMEGAARD, B.",
+            "raw_name": "B. ELMEGAARD",
             "given_name": "B.",
             "surname": "ELMEGAARD",
             "role": "author"
         },
         {
             "index": 2,
-            "raw_name": "MARKUSSEN W., B.",
+            "raw_name": "B. MARKUSSEN W.",
             "given_name": "B.",
             "surname": "MARKUSSEN W.",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 281c3679..581ca1eb 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -30,14 +30,14 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Kajisa, Kei",
+            "raw_name": "Kei Kajisa",
             "given_name": "Kei",
             "surname": "Kajisa",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "Kajisa, Kei",
+            "raw_name": "Kei Kajisa",
             "given_name": "Kei",
             "surname": "Kajisa",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index 01f92f85..db103d2b 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -24,7 +24,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Kirstaedter, Nils",
+            "raw_name": "Nils Kirstaedter",
             "given_name": "Nils",
             "surname": "Kirstaedter",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 6b6cad4a..192062e3 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -12,28 +12,28 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Spanias, Charalampos",
+            "raw_name": "Charalampos Spanias",
             "given_name": "Charalampos",
             "surname": "Spanias",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "Nikolaidis, Pantelis T",
+            "raw_name": "Pantelis T Nikolaidis",
             "given_name": "Pantelis T",
             "surname": "Nikolaidis",
             "role": "author"
         },
         {
             "index": 2,
-            "raw_name": "Rosemann, Thomas",
+            "raw_name": "Thomas Rosemann",
             "given_name": "Thomas",
             "surname": "Rosemann",
             "role": "author"
         },
         {
             "index": 3,
-            "raw_name": "Knechtle, Beat",
+            "raw_name": "Beat Knechtle",
             "given_name": "Beat",
             "surname": "Knechtle",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index 3da3816d..c8971667 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -17,7 +17,7 @@
         },
         {
             "index": 1,
-            "raw_name": "Hiltbrunner, Hermann",
+            "raw_name": "Hermann Hiltbrunner",
             "given_name": "Hermann",
             "surname": "Hiltbrunner",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index 94c00472..94ad000a 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -45,56 +45,56 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Stulz, E.",
+            "raw_name": "E. Stulz",
             "given_name": "E.",
             "surname": "Stulz",
             "role": "author"
         },
         {
             "index": 1,
-            "raw_name": "Scott, S.M.",
+            "raw_name": "S.M. Scott",
             "given_name": "S.M.",
             "surname": "Scott",
             "role": "author"
         },
         {
             "index": 2,
-            "raw_name": "Ng, Yiu-Fai",
+            "raw_name": "Yiu-Fai Ng",
             "given_name": "Yiu-Fai",
             "surname": "Ng",
             "role": "author"
         },
         {
             "index": 3,
-            "raw_name": "Bond, A.D.",
+            "raw_name": "A.D. Bond",
             "given_name": "A.D.",
             "surname": "Bond",
             "role": "author"
         },
         {
             "index": 4,
-            "raw_name": "Teat, S.J.",
+            "raw_name": "S.J. Teat",
             "given_name": "S.J.",
             "surname": "Teat",
             "role": "author"
         },
         {
             "index": 5,
-            "raw_name": "Darling, S.L.",
+            "raw_name": "S.L. Darling",
             "given_name": "S.L.",
             "surname": "Darling",
             "role": "author"
         },
         {
             "index": 6,
-            "raw_name": "Feeder, N.",
+            "raw_name": "N. Feeder",
             "given_name": "N.",
             "surname": "Feeder",
             "role": "author"
         },
         {
             "index": 7,
-            "raw_name": "Sanders, J.K.M.",
+            "raw_name": "J.K.M. Sanders",
             "given_name": "J.K.M.",
             "surname": "Sanders",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json
index 0614f6ba..bdeb8426 100644
--- a/python/tests/files/datacite/datacite_result_15.json
+++ b/python/tests/files/datacite/datacite_result_15.json
@@ -11,7 +11,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Richardson, David",
+            "raw_name": "David Richardson",
             "given_name": "David",
             "surname": "Richardson",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
index 1d861cf6..ea8c2e59 100644
--- a/python/tests/files/datacite/datacite_result_16.json
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -20,7 +20,7 @@
     "contribs": [
         {
             "index": 0,
-            "raw_name": "Sochi, Taha",
+            "raw_name": "Taha Sochi",
             "given_name": "Taha",
             "surname": "Sochi",
             "role": "author"
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index 12ab39fe..274858c3 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -12,4 +12,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 1505db92..8d797268 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -12,4 +12,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index 1868eede..97d7ae75 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -11,4 +11,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 9214065a..0a05a7cd 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -8,8 +8,8 @@
     "ext_ids": {
         "doi": "10.7916/d86x0cg1"
     },
+    "language": "de",
     "contribs": [],
     "refs": [],
-    "abstracts": [],
-    "language": "de"
-}
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index e9939e09..9e4225b5 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -8,15 +8,15 @@
     "ext_ids": {
         "doi": "10.7916/d86x0cg1"
     },
+    "language": "de",
     "contribs": [
         {
-            "raw_affiliation": "Department of pataphysics",
             "index": 0,
             "raw_name": "Anton Welch",
-            "role": "author"
+            "role": "author",
+            "raw_affiliation": "Department of pataphysics"
         }
     ],
     "refs": [],
-    "abstracts": [],
-    "language": "de"
-}
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index 2bf66eae..46f60492 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -8,6 +8,7 @@
     "ext_ids": {
         "doi": "10.7916/d86x0cg1-xxx"
     },
+    "language": "de",
     "contribs": [
         {
             "index": 0,
@@ -17,6 +18,5 @@
         }
     ],
     "refs": [],
-    "abstracts": [],
-    "language": "de"
-}
+    "abstracts": []
+}
\ No newline at end of file
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index cdc165d7..3e47fce8 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -7,7 +7,7 @@ import datetime
 import pytest
 import gzip
 from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi
+from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name
 from fatcat_tools.transforms import entity_to_dict
 from fixtures import api
 import json
@@ -294,3 +294,19 @@ def test_datacite_conversions(datacite_importer):
 
         assert result == expected
 
+def test_index_form_to_display_name():
+    Case = collections.namedtuple('Case', 'input output')
+    cases = [
+        Case('', ''),
+        Case('ABC', 'ABC'),
+        Case('International Space Station', 'International Space Station'),
+        Case('Jin, Shan', 'Shan Jin'),
+        Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'),
+        Case('Solomon, P. M.', 'P. M. Solomon'),
+        Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'),
+        Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'),
+    ]
+
+    for c in cases:
+        assert c.output == index_form_to_display_name(c.input)
+
-- 
cgit v1.2.3


From b33782cabf60ec8b90338abd4986338683c30b72 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 18:52:53 +0100
Subject: datacite: add helper script to create new test case

---
 python/tests/files/datacite/casecreate.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100755 python/tests/files/datacite/casecreate.sh

(limited to 'python')

diff --git a/python/tests/files/datacite/casecreate.sh b/python/tests/files/datacite/casecreate.sh
new file mode 100755
index 00000000..36ea08d2
--- /dev/null
+++ b/python/tests/files/datacite/casecreate.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#
+# casecreate.sh creates a new test case file pair by copying the last one.
+#
+set -eo pipefail
+
+max=$(find . -name 'datacite_doc_*' | sort -n | tail -1 | grep -Eo '[0-9]+')
+if [ -z $max ]; then
+    echo "failed, expected datacite_doc_[NUMBER]..."
+    exit 1
+fi
+new=$((max+1))
+cp "datacite_doc_$max.json" "datacite_doc_$new.json"
+cp "datacite_result_$max.json" "datacite_result_$new.json"
-- 
cgit v1.2.3


From 9d4385210518266d73964f140f47995774656c3f Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 19:01:47 +0100
Subject: datacite: open case for editing after creation

---
 python/tests/files/datacite/casecreate.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'python')

diff --git a/python/tests/files/datacite/casecreate.sh b/python/tests/files/datacite/casecreate.sh
index 36ea08d2..82655dc3 100755
--- a/python/tests/files/datacite/casecreate.sh
+++ b/python/tests/files/datacite/casecreate.sh
@@ -12,3 +12,5 @@ fi
 new=$((max+1))
 cp "datacite_doc_$max.json" "datacite_doc_$new.json"
 cp "datacite_result_$max.json" "datacite_result_$new.json"
+
+[ -f ./caseview.sh ] && ./caseview.sh "$new"
-- 
cgit v1.2.3


From 61f0bbfbfdaf41be799fa41c88077806ef913188 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 2 Jan 2020 19:02:04 +0100
Subject: datacite: add another test case

---
 python/tests/files/datacite/datacite_doc_24.json   | 48 ++++++++++++++++++++++
 .../tests/files/datacite/datacite_result_24.json   | 22 ++++++++++
 python/tests/import_datacite.py                    |  2 +-
 3 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 python/tests/files/datacite/datacite_doc_24.json
 create mode 100644 python/tests/files/datacite/datacite_result_24.json

(limited to 'python')

diff --git a/python/tests/files/datacite/datacite_doc_24.json b/python/tests/files/datacite/datacite_doc_24.json
new file mode 100644
index 00000000..6123350b
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_24.json
@@ -0,0 +1,48 @@
+{
+    "attributes": {
+      "doi": "10.7916/d86x0cg1",
+      "creators": [
+        {
+          "name": "Anton Welch",
+          "affiliation": [
+            "Department of pataphysics"
+          ],
+          "nameIdentifiers": []
+        }
+      ],
+      "titles": [
+        {
+          "title": "ABC"
+        },
+        {
+          "title": "DEF",
+          "titleType": "Subtitle"
+        }
+      ],
+      "publicationYear": 2016,
+      "language": "DE-CH",
+      "types": {
+        "ris": "GEN",
+        "bibtex": "misc",
+        "citeproc": "article",
+        "schemaOrg": "CreativeWork"
+      },
+      "dates": [
+        {
+          "date": "2017-08-24",
+          "dateType": "Created"
+        },
+        {
+          "date": "2019-08-04",
+          "dateType": "Updated"
+        },
+        {
+          "date": "2017",
+          "dateType": "Issued"
+        }
+      ],
+      "isActive": true,
+      "state": "findable"
+    }
+  }
+
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
new file mode 100644
index 00000000..42859275
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -0,0 +1,22 @@
+{
+    "extra": {},
+    "title": "ABC",
+    "subtitle": "DEF",
+    "release_type": "article",
+    "release_stage": "published",
+    "release_date": "2017-08-24",
+    "release_year": 2017,
+    "ext_ids": {
+        "doi": "10.7916/d86x0cg1"
+    },
+    "contribs": [
+        {
+            "index": 0,
+            "raw_name": "Anton Welch",
+            "role": "author",
+            "raw_affiliation": "Department of pataphysics"
+        }
+    ],
+    "refs": [],
+    "abstracts": []
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 3e47fce8..54a529c5 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -282,7 +282,7 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(24):
+    for i in range(25):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
         print('testing mapping from {} => {}'.format(src, dst))
-- 
cgit v1.2.3


From 391565cbbc0ba17ffd8c4f5d88d4dfda8a8b323c Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 13:46:05 +0100
Subject: datacite: remove --lang-detect flag

Language detection for abstracts now always runs; the estimated time for a
single langdetect call is on the order of 50 ms.
---
 python/fatcat_import.py                             |  4 ----
 python/fatcat_tools/importers/datacite.py           | 17 ++++++-----------
 python/tests/files/datacite/datacite_result_04.json |  5 +++--
 python/tests/files/datacite/datacite_result_05.json |  5 +++--
 python/tests/files/datacite/datacite_result_07.json |  5 +++--
 python/tests/files/datacite/datacite_result_08.json |  5 +++--
 python/tests/files/datacite/datacite_result_14.json |  5 +++--
 7 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index a17029cc..6b04d547 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -172,7 +172,6 @@ def run_datacite(args):
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
         debug=args.debug,
-        lang_detect=args.lang_detect,
         extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file)
     if args.kafka_mode:
@@ -474,9 +473,6 @@ def main():
     sub_datacite.add_argument('--debug',
         action='store_true',
         help="write converted JSON to stdout")
-    sub_datacite.add_argument('--lang-detect',
-        action='store_true',
-        help="try to detect language (slow)")
     sub_datacite.add_argument('--insert-log-file',
         default='',
         type=str,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index bd135569..8034a5c1 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -196,7 +196,6 @@ class DataciteImporter(EntityImporter):
                  api,
                  issn_map_file,
                  debug=False,
-                 lang_detect=False,
                  insert_log_file=None,
                  **kwargs):
 
@@ -225,12 +224,9 @@ class DataciteImporter(EntityImporter):
 
         self.read_issn_map_file(issn_map_file)
         self.debug = debug
-        self.lang_detect = lang_detect
         self.insert_log_file = insert_log_file
 
-        print('datacite with debug={}, lang_detect={}'.format(
-            self.debug, self.lang_detect),
-              file=sys.stderr)
+        print('datacite with debug={}'.format(self.debug), file=sys.stderr)
 
     def lookup_ext_ids(self, doi):
         """
@@ -537,12 +533,11 @@ class DataciteImporter(EntityImporter):
             if len(text) > MAX_ABSTRACT_LENGTH:
                 text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
             lang = None
-            if self.lang_detect:
-                try:
-                    lang = langdetect.detect(text)
-                except langdetect.lang_detect_exception.LangDetectException as err:
-                    print('[{}] language detection failed: {}'.format(doi, err),
-                          file=sys.stderr)
+            try:
+                lang = langdetect.detect(text)
+            except langdetect.lang_detect_exception.LangDetectException as err:
+                print('[{}] language detection failed: {}'.format(doi, err),
+                      file=sys.stderr)
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
                     mimetype="text/plain",
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 54b19ef9..94fa1f94 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -22,7 +22,8 @@
     "abstracts": [
         {
             "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) \u2192 K(I) and a natural transformation [formula omitted]  (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
-            "mimetype": "text/plain"
+            "mimetype": "text/plain",
+            "lang": "en"
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index a790c26e..ff998c0f 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -524,7 +524,8 @@
     "abstracts": [
         {
             "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
-            "mimetype": "text/plain"
+            "mimetype": "text/plain",
+            "lang": "en"
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
index f572263c..f694ddef 100644
--- a/python/tests/files/datacite/datacite_result_07.json
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -67,7 +67,8 @@
     "abstracts": [
         {
             "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
-            "mimetype": "text/plain"
+            "mimetype": "text/plain",
+            "lang": "en"
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 581ca1eb..cc0e968b 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -47,7 +47,8 @@
     "abstracts": [
         {
             "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
-            "mimetype": "text/plain"
+            "mimetype": "text/plain",
+            "lang": "en"
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index 94ad000a..4521f891 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -104,7 +104,8 @@
     "abstracts": [
         {
             "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
-            "mimetype": "text/plain"
+            "mimetype": "text/plain",
+            "lang": "en"
         }
     ]
-}
\ No newline at end of file
+}
-- 
cgit v1.2.3


From 7f38b161a1eac016fb230f7ebe4158efefa35568 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 13:55:36 +0100
Subject: update potentially outdated Pipfile.lock

via: $ pipenv lock

CI failed with a slightly cryptic error:

> TypeError: __init__() missing 1 required positional argument: 'self'
---
 python/Pipfile.lock | 182 +++++++++++++++++++++++++---------------------------
 1 file changed, 86 insertions(+), 96 deletions(-)

(limited to 'python')

diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 25606b3c..05bbd488 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "17edea9781496af462798b03388bcf0c86d3924a761168cb5d324dca5916ce46"
+            "sha256": "f6e7c193796101c8592827483aaed50efec5e8c261c5a179aea494f8a24cb4dc"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -25,11 +25,11 @@
         },
         "beautifulsoup4": {
             "hashes": [
-                "sha256:5279c36b4b2ec2cb4298d723791467e3000e5384a43ea0cdf5d45207c7e97169",
-                "sha256:6135db2ba678168c07950f9a16c4031822c6f4aec75a65e0a97bc5ca09789931",
-                "sha256:dcdef580e18a76d54002088602eba453eec38ebbcafafeaabd8cab12b6155d57"
+                "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a",
+                "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887",
+                "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"
             ],
-            "version": "==4.8.1"
+            "version": "==4.8.2"
         },
         "blinker": {
             "hashes": [
@@ -229,7 +229,6 @@
         },
         "flask-misaka": {
             "hashes": [
-                "sha256:bcfdacc0803ccea75d377737e82c83489b2153d922c9d9f9eabc5148d216ed70",
                 "sha256:d0cfb0efd9e5afacda76defd4a605a68390f4fb1bef283c71534fd3ce0d3efb5",
                 "sha256:f423c3beb5502742a57330a272f81d53223f6f99d45cc45b03926e3a3034f589"
             ],
@@ -238,10 +237,10 @@
         },
         "flask-mwoauth": {
             "hashes": [
-                "sha256:6df2e2448ea9251c61d4142da1aa2cd529e9ce22b440b577b1197912a8969be9"
+                "sha256:3723f251712e7cfe000d3ac06c51e970d59ab776ed75300dea910248e39e7de8"
             ],
             "index": "pypi",
-            "version": "==0.4.75"
+            "version": "==0.4.76"
         },
         "flask-uuid": {
             "hashes": [
@@ -422,7 +421,6 @@
         },
         "pykafka": {
             "hashes": [
-                "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459",
                 "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
             ],
             "index": "pypi",
@@ -509,6 +507,9 @@
             "version": "==2019.3"
         },
         "raven": {
+            "extras": [
+                "flask"
+            ],
             "hashes": [
                 "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
                 "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
@@ -518,29 +519,29 @@
         },
         "regex": {
             "hashes": [
-                "sha256:0472acc4b6319801c1bc681d838c88ba1446f9ae199e01f6e41091c701fb3d42",
-                "sha256:16709434c4e2332ee8ba26ae339aceb8ab0b24b8398ebd0f52ebc943f45c4fc2",
-                "sha256:223fb63ec8dcab20b3318e93dcec4aee89e98b062934090bf29ffc374d2000a2",
-                "sha256:23c3ebf05d1cd3adb26723fd598e75724e0cdb7d6a35185ac0caf061cc6edb49",
-                "sha256:2404a50fb48badaf214b700f08822b68d93d79200e0aefd9569d0332d21fbfcb",
-                "sha256:2af3a7a16fed6eff85c25da106effa36f61cbbe801d00ade349b53ce7619eb15",
-                "sha256:37e018d3746baf159aedfc9773c3cafacbd10d354ba15484f5cfc8ed9da5748b",
-                "sha256:3c9c2988d02a9238a1975c70e87c6ce94e6f36dd8e372b66f468990cfe077434",
-                "sha256:47298bc8b89d1c747f0f5974aa528fc0b6b17396f1694136a224d51461279d83",
-                "sha256:4eeb0fe936797ae00a085f99802642bfc722b3b4ea557e9e7849cb621ea10c91",
-                "sha256:6881be0218b47ed76db033f252bab3f912dfe7ed1fe7baa9daebf51de08546a0",
-                "sha256:7ac08cee5055f548eed3889e9aaef15fd00172d037949496f1f0b34acb8a7c3e",
-                "sha256:7c5e2efcf079c35ff266c3f3a6708834f88f9fd04a3c16b855e036b2b7b1b543",
-                "sha256:8355eaa64724a0fdb010a1654b77cb3e375dc08b7f592cc4a1c05ac606aa481c",
-                "sha256:999a885f7f5194464238ad5d74b05982acee54002f3aa775d8e0e8c5fb74c06c",
-                "sha256:9fd2f4813eaa3e421e82819d38e5b634d900faff7ae5a80cd89ccff407175e69",
-                "sha256:a2e1e53df7dd27943da2b512895125b33fb20f81862c9fed7b3bab2a1de684d1",
-                "sha256:ab43bc0836820b7900dfffc025b996784aec26ec87dc1df4f95a40398760223f",
-                "sha256:ba449b56fa419fb19bf2a2438adbd2433f27087a6fe115917eaf9cfca684d5b6",
-                "sha256:d3f632cefad2cf247bd845794002585e3772288bfcb0dbac59fdecd32cd38b67",
-                "sha256:d51311496061863caae2cfe120cf1ef37900019b86c89c2d75f0918e0b4b8bf3"
-            ],
-            "version": "==2019.12.19"
+                "sha256:032fdcc03406e1a6485ec09b826eac78732943840c4b29e503b789716f051d8d",
+                "sha256:0e6cf1e747f383f52a0964452658c04300a9a01e8a89c55ea22813931b580aa8",
+                "sha256:106e25a841921d8259dcef2a42786caae35bc750fb996f830065b3dfaa67b77e",
+                "sha256:1768cf42a78a11dae63152685e7a1d90af7a8d71d2d4f6d2387edea53a9e0588",
+                "sha256:27d1bd20d334f50b7ef078eba0f0756a640fd25f5f1708d3b5bed18a5d6bced9",
+                "sha256:29b20f66f2e044aafba86ecf10a84e611b4667643c42baa004247f5dfef4f90b",
+                "sha256:4850c78b53acf664a6578bba0e9ebeaf2807bb476c14ec7e0f936f2015133cae",
+                "sha256:57eacd38a5ec40ed7b19a968a9d01c0d977bda55664210be713e750dd7b33540",
+                "sha256:724eb24b92fc5fdc1501a1b4df44a68b9c1dda171c8ef8736799e903fb100f63",
+                "sha256:77ae8d926f38700432807ba293d768ba9e7652df0cbe76df2843b12f80f68885",
+                "sha256:78b3712ec529b2a71731fbb10b907b54d9c53a17ca589b42a578bc1e9a2c82ea",
+                "sha256:7bbbdbada3078dc360d4692a9b28479f569db7fc7f304b668787afc9feb38ec8",
+                "sha256:8d9ef7f6c403e35e73b7fc3cde9f6decdc43b1cb2ff8d058c53b9084bfcb553e",
+                "sha256:a83049eb717ae828ced9cf607845929efcb086a001fc8af93ff15c50012a5716",
+                "sha256:adc35d38952e688535980ae2109cad3a109520033642e759f987cf47fe278aa1",
+                "sha256:c29a77ad4463f71a506515d9ec3a899ed026b4b015bf43245c919ff36275444b",
+                "sha256:cfd31b3300fefa5eecb2fe596c6dee1b91b3a05ece9d5cfd2631afebf6c6fadd",
+                "sha256:d3ee0b035816e0520fac928de31b6572106f0d75597f6fa3206969a02baba06f",
+                "sha256:d508875793efdf6bab3d47850df8f40d4040ae9928d9d80864c1768d6aeaf8e3",
+                "sha256:ef0b828a7e22e58e06a1cceddba7b4665c6af8afeb22a0d8083001330572c147",
+                "sha256:faad39fdbe2c2ccda9846cd21581063086330efafa47d87afea4073a08128656"
+            ],
+            "version": "==2019.12.20"
         },
         "requests": {
             "hashes": [
@@ -553,8 +554,7 @@
         "requests-oauthlib": {
             "hashes": [
                 "sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d",
-                "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a",
-                "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc"
+                "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a"
             ],
             "version": "==1.3.0"
         },
@@ -602,10 +602,9 @@
         },
         "wcwidth": {
             "hashes": [
-                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
-                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+                "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"
             ],
-            "version": "==0.1.7"
+            "version": "==0.1.8"
         },
         "werkzeug": {
             "hashes": [
@@ -668,39 +667,39 @@
         },
         "coverage": {
             "hashes": [
-                "sha256:0cd13a6e98c37b510a2d34c8281d5e1a226aaf9b65b7d770ef03c63169965351",
-                "sha256:1a4b6b6a2a3a6612e6361130c2cc3dc4378d8c221752b96167ccbad94b47f3cd",
-                "sha256:2ee55e6dba516ddf6f484aa83ccabbb0adf45a18892204c23486938d12258cde",
-                "sha256:3be5338a2eb4ef03c57f20917e1d12a1fd10e3853fed060b6d6b677cb3745898",
-                "sha256:44b783b02db03c4777d8cf71bae19eadc171a6f2a96777d916b2c30a1eb3d070",
-                "sha256:475bf7c4252af0a56e1abba9606f1e54127cdf122063095c75ab04f6f99cf45e",
-                "sha256:47c81ee687eafc2f1db7f03fbe99aab81330565ebc62fb3b61edfc2216a550c8",
-                "sha256:4a7f8e72b18f2aca288ff02255ce32cc830bc04d993efbc87abf6beddc9e56c0",
-                "sha256:50197163a22fd17f79086e087a787883b3ec9280a509807daf158dfc2a7ded02",
-                "sha256:56b13000acf891f700f5067512b804d1ec8c301d627486c678b903859d07f798",
-                "sha256:79388ae29c896299b3567965dbcd93255f175c17c6c7bca38614d12718c47466",
-                "sha256:79fd5d3d62238c4f583b75d48d53cdae759fe04d4fb18fe8b371d88ad2b6f8be",
-                "sha256:7fe3e2fde2bf1d7ce25ebcd2d3de3650b8d60d9a73ce6dcef36e20191291613d",
-                "sha256:81042a24f67b96e4287774014fa27220d8a4d91af1043389e4d73892efc89ac6",
-                "sha256:81326f1095c53111f8afc95da281e1414185f4a538609a77ca50bdfa39a6c207",
-                "sha256:8873dc0d8f42142ea9f20c27bbdc485190fff93823c6795be661703369e5877d",
-                "sha256:88d2cbcb0a112f47eef71eb95460b6995da18e6f8ca50c264585abc2c473154b",
-                "sha256:91f2491aeab9599956c45a77c5666d323efdec790bfe23fcceafcd91105d585a",
-                "sha256:979daa8655ae5a51e8e7a24e7d34e250ae8309fd9719490df92cbb2fe2b0422b",
-                "sha256:9c871b006c878a890c6e44a5b2f3c6291335324b298c904dc0402ee92ee1f0be",
-                "sha256:a6d092545e5af53e960465f652e00efbf5357adad177b2630d63978d85e46a72",
-                "sha256:b5ed7837b923d1d71c4f587ae1539ccd96bfd6be9788f507dbe94dab5febbb5d",
-                "sha256:ba259f68250f16d2444cbbfaddaa0bb20e1560a4fdaad50bece25c199e6af864",
-                "sha256:be1d89614c6b6c36d7578496dc8625123bda2ff44f224cf8b1c45b810ee7383f",
-                "sha256:c1b030a79749aa8d1f1486885040114ee56933b15ccfc90049ba266e4aa2139f",
-                "sha256:c95bb147fab76f2ecde332d972d8f4138b8f2daee6c466af4ff3b4f29bd4c19e",
-                "sha256:d52c1c2d7e856cecc05aa0526453cb14574f821b7f413cc279b9514750d795c1",
-                "sha256:d609a6d564ad3d327e9509846c2c47f170456344521462b469e5cb39e48ba31c",
-                "sha256:e1bad043c12fb58e8c7d92b3d7f2f49977dcb80a08a6d1e7a5114a11bf819fca",
-                "sha256:e5a675f6829c53c87d79117a8eb656cc4a5f8918185a32fc93ba09778e90f6db",
-                "sha256:fec32646b98baf4a22fdceb08703965bd16dea09051fbeb31a04b5b6e72b846c"
-            ],
-            "version": "==5.0"
+                "sha256:0101888bd1592a20ccadae081ba10e8b204d20235d18d05c6f7d5e904a38fc10",
+                "sha256:04b961862334687549eb91cd5178a6fbe977ad365bddc7c60f2227f2f9880cf4",
+                "sha256:1ca43dbd739c0fc30b0a3637a003a0d2c7edc1dd618359d58cc1e211742f8bd1",
+                "sha256:1cbb88b34187bdb841f2599770b7e6ff8e259dc3bb64fc7893acf44998acf5f8",
+                "sha256:232f0b52a5b978288f0bbc282a6c03fe48cd19a04202df44309919c142b3bb9c",
+                "sha256:24bcfa86fd9ce86b73a8368383c39d919c497a06eebb888b6f0c12f13e920b1a",
+                "sha256:25b8f60b5c7da71e64c18888f3067d5b6f1334b9681876b2fb41eea26de881ae",
+                "sha256:2714160a63da18aed9340c70ed514973971ee7e665e6b336917ff4cca81a25b1",
+                "sha256:2ca2cd5264e84b2cafc73f0045437f70c6378c0d7dbcddc9ee3fe192c1e29e5d",
+                "sha256:2cc707fc9aad2592fc686d63ef72dc0031fc98b6fb921d2f5395d9ab84fbc3ef",
+                "sha256:348630edea485f4228233c2f310a598abf8afa5f8c716c02a9698089687b6085",
+                "sha256:40fbfd6b044c9db13aeec1daf5887d322c710d811f944011757526ef6e323fd9",
+                "sha256:46c9c6a1d1190c0b75ec7c0f339088309952b82ae8d67a79ff1319eb4e749b96",
+                "sha256:591506e088901bdc25620c37aec885e82cc896528f28c57e113751e3471fc314",
+                "sha256:5ac71bba1e07eab403b082c4428f868c1c9e26a21041436b4905c4c3d4e49b08",
+                "sha256:5f622f19abda4e934938e24f1d67599249abc201844933a6f01aaa8663094489",
+                "sha256:65bead1ac8c8930cf92a1ccaedcce19a57298547d5d1db5c9d4d068a0675c38b",
+                "sha256:7362a7f829feda10c7265b553455de596b83d1623b3d436b6d3c51c688c57bf6",
+                "sha256:7f2675750c50151f806070ec11258edf4c328340916c53bac0adbc465abd6b1e",
+                "sha256:960d7f42277391e8b1c0b0ae427a214e1b31a1278de6b73f8807b20c2e913bba",
+                "sha256:a50b0888d8a021a3342d36a6086501e30de7d840ab68fca44913e97d14487dc1",
+                "sha256:b7dbc5e8c39ea3ad3db22715f1b5401cd698a621218680c6daf42c2f9d36e205",
+                "sha256:bb3d29df5d07d5399d58a394d0ef50adf303ab4fbf66dfd25b9ef258effcb692",
+                "sha256:c0fff2733f7c2950f58a4fd09b5db257b00c6fec57bf3f68c5bae004d804b407",
+                "sha256:c792d3707a86c01c02607ae74364854220fb3e82735f631cd0a345dea6b4cee5",
+                "sha256:c90bda74e16bcd03861b09b1d37c0a4158feda5d5a036bb2d6e58de6ff65793e",
+                "sha256:cfce79ce41cc1a1dc7fc85bb41eeeb32d34a4cf39a645c717c0550287e30ff06",
+                "sha256:eeafb646f374988c22c8e6da5ab9fb81367ecfe81c70c292623373d2a021b1a1",
+                "sha256:f425f50a6dd807cb9043d15a4fcfba3b5874a54d9587ccbb748899f70dc18c47",
+                "sha256:fcd4459fe35a400b8f416bc57906862693c9f88b66dc925e7f2a933e77f6b18b",
+                "sha256:ff3936dd5feaefb4f91c8c1f50a06c588b5dc69fba4f7d9c79a6617ad80bb7df"
+            ],
+            "version": "==5.0.1"
         },
         "decorator": {
             "hashes": [
@@ -748,10 +747,10 @@
         },
         "jedi": {
             "hashes": [
-                "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27",
-                "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e"
+                "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064",
+                "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807"
             ],
-            "version": "==0.15.1"
+            "version": "==0.15.2"
         },
         "lazy-object-proxy": {
             "hashes": [
@@ -808,14 +807,6 @@
             ],
             "version": "==0.5.2"
         },
-        "pathlib2": {
-            "hashes": [
-                "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db",
-                "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868"
-            ],
-            "markers": "python_version < '3.6'",
-            "version": "==2.3.5"
-        },
         "pexpect": {
             "hashes": [
                 "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
@@ -880,10 +871,10 @@
         },
         "py": {
             "hashes": [
-                "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
-                "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+                "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa",
+                "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0"
             ],
-            "version": "==1.8.0"
+            "version": "==1.8.1"
         },
         "pygments": {
             "hashes": [
@@ -902,18 +893,18 @@
         },
         "pyparsing": {
             "hashes": [
-                "sha256:20f995ecd72f2a1f4bf6b072b63b22e2eb457836601e76d6e5dfcd75436acc1f",
-                "sha256:4ca62001be367f01bd3e92ecbb79070272a9d4964dce6a48a82ff0b8bc7e683a"
+                "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f",
+                "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"
             ],
-            "version": "==2.4.5"
+            "version": "==2.4.6"
         },
         "pytest": {
             "hashes": [
-                "sha256:65e92898fb5b61d0a1d7319c3e6dcf97e599e331cfdc2b27f20c0d87ece19239",
-                "sha256:9ea149066f566c943d3122f4b1cf1b577cab73189d11f490b54703fa5fa9df50"
+                "sha256:6192875be8af57b694b7c4904e909680102befcb99e610ef3d9f786952f795aa",
+                "sha256:f8447ebf8fd3d362868a5d3f43a9df786dfdfe9608843bd9002a2d47a104808f"
             ],
             "index": "pypi",
-            "version": "==4.6.7"
+            "version": "==4.6.8"
         },
         "pytest-cov": {
             "hashes": [
@@ -957,11 +948,11 @@
         },
         "responses": {
             "hashes": [
-                "sha256:caa5f7afd14666f970ea54a4125a639f6491218b45a013c6dc2544f0473ae2b8",
-                "sha256:f602986c715073b6bcb5d68a3225254aa447a1bd06040c66341816267d327721"
+                "sha256:515fd7c024097e5da76e9c4cf719083d181f1c3ddc09c2e0e49284ce863dd263",
+                "sha256:8ce8cb4e7e1ad89336f8865af152e0563d2e7f0e0b86d2cf75f015f819409243"
             ],
             "index": "pypi",
-            "version": "==0.10.8"
+            "version": "==0.10.9"
         },
         "simplegeneric": {
             "hashes": [
@@ -1018,10 +1009,9 @@
         },
         "wcwidth": {
             "hashes": [
-                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
-                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+                "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"
             ],
-            "version": "==0.1.7"
+            "version": "==0.1.8"
         },
         "wrapt": {
             "hashes": [
-- 
cgit v1.2.3


From 7091fdd4a1eb4aad03776a07392ea752c1cc3c27 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 15:00:27 +0100
Subject: add missing pathlib2 dependency

first seen in CI (jobs/230137), slightly related:
https://github.com/pytest-dev/pytest/issues/3953
---
 python/Pipfile      |  1 +
 python/Pipfile.lock | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/Pipfile b/python/Pipfile
index 5d50b37c..3d546a84 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -49,6 +49,7 @@ elasticsearch-dsl = ">=6.0.0,<7.0.0"
 elasticsearch = ">=6.0.0,<7.0.0"
 dateparser = ">=0.7"
 langdetect = "*"
+pathlib2 = "*"
 
 [requires]
 # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 05bbd488..73dc20f0 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "f6e7c193796101c8592827483aaed50efec5e8c261c5a179aea494f8a24cb4dc"
+            "sha256": "0ac9595590f2f5e28ac95eddbb7d16de69733a5e642663cb00136a4227b53e78"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -398,6 +398,14 @@
             ],
             "version": "==3.1.0"
         },
+        "pathlib2": {
+            "hashes": [
+                "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db",
+                "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868"
+            ],
+            "index": "pypi",
+            "version": "==2.3.5"
+        },
         "pycparser": {
             "hashes": [
                 "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3"
@@ -807,6 +815,14 @@
             ],
             "version": "==0.5.2"
         },
+        "pathlib2": {
+            "hashes": [
+                "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db",
+                "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868"
+            ],
+            "index": "pypi",
+            "version": "==2.3.5"
+        },
         "pexpect": {
             "hashes": [
                 "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
-- 
cgit v1.2.3


From b31955dbae89494735b230a25baa17797874e47e Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 16:12:38 +0100
Subject: add pycountry dependency

---
 python/Pipfile      | 1 +
 python/Pipfile.lock | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/Pipfile b/python/Pipfile
index 3d546a84..01c1eb3d 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -50,6 +50,7 @@ elasticsearch = ">=6.0.0,<7.0.0"
 dateparser = ">=0.7"
 langdetect = "*"
 pathlib2 = "*"
+pycountry = "*"
 
 [requires]
 # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 73dc20f0..35125b67 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "0ac9595590f2f5e28ac95eddbb7d16de69733a5e642663cb00136a4227b53e78"
+            "sha256": "fb9c3d2307483efe01d9c28a306bad319c84a94a4253d5c7c25bcfe2dad20c5d"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -406,6 +406,13 @@
             "index": "pypi",
             "version": "==2.3.5"
         },
+        "pycountry": {
+            "hashes": [
+                "sha256:3c57aa40adcf293d59bebaffbe60d8c39976fba78d846a018dc0c2ec9c6cb3cb"
+            ],
+            "index": "pypi",
+            "version": "==19.8.18"
+        },
         "pycparser": {
             "hashes": [
                 "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3"
-- 
cgit v1.2.3


From 1e5680202fe2bf0348f969ffd0e4b211cc45e1e5 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 17:08:56 +0100
Subject: datacite: lowercase only once

---
 python/fatcat_tools/importers/datacite.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 8034a5c1..d13e855e 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -872,7 +872,7 @@ def index_form_to_display_name(s):
     if s.count(',') > 1:
         # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
         return s
-    stopwords = [
+    stopwords = [s.lower() for s in (
         'Archive',
         'Collection',
         'Coordinator',
@@ -890,9 +890,10 @@ def index_form_to_display_name(s):
         'University',
         'Verein',
         'Volkshochschule',
-    ]
+    )]
+    lower = s.lower()
     for stop in stopwords:
-        if stop.lower() in s.lower():
+        if stop in lower:
             return s
 
     a, b = s.split(',')
-- 
cgit v1.2.3


From e4402d6d4b162d57507d5beb57de88017cea549d Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 19:51:53 +0100
Subject: datacite: prepare release_month (stub)

---
 python/fatcat_tools/importers/datacite.py | 20 ++++++++++----------
 python/tests/import_datacite.py           | 28 ++++++++++++++--------------
 2 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d13e855e..45c8a421 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,7 +378,7 @@ class DataciteImporter(EntityImporter):
         # "attributes.dates[].dateType", values: "Accepted", "Available"
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
-        release_date, release_year = parse_datacite_dates(
+        release_date, release_month, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
         # Start with clear stages, e.g. published. TODO(martin): we could
@@ -762,10 +762,10 @@ def parse_datacite_dates(dates):
     Given a list of date fields (under .dates), return tuple, (release_date,
     release_year).
     """
-    release_date, release_year = None, None
+    release_date, release_month, release_year = None, None, None
 
     if not dates:
-        return release_date, release_year
+        return release_date, release_month, release_year
 
     if not isinstance(dates, list):
         raise ValueError('expected a list of date items')
@@ -789,7 +789,7 @@ def parse_datacite_dates(dates):
 
     def parse_item(item):
         result, value, year_only = None, item.get('date', ''), False
-        release_date, release_year = None, None
+        release_date, release_month, release_year = None, None, None
 
         for pattern in common_patterns:
             try:
@@ -808,24 +808,24 @@ def parse_datacite_dates(dates):
             except TypeError as err:
                 print("{} date parsing failed with: {}".format(value, err),
                       file=sys.stderr)
-                return result_date, result_year
+                return result_date, release_month, result_year
 
         if result is None:
             # Unparsable date.
-            return release_date, release_year
+            return release_date, release_month, release_year
 
         if not year_only:
             release_date = result.date()
         release_year = result.year
 
-        return release_date, release_year
+        return release_date, release_month, release_year
 
     for prio in date_type_prio:
         for item in dates:
             if not item.get('dateType') == prio:
                 continue
 
-            release_date, release_year = parse_item(item)
+            release_date, release_month, release_year = parse_item(item)
             if release_date is None and release_year is None:
                 continue
 
@@ -841,11 +841,11 @@ def parse_datacite_dates(dates):
 
     if release_date is None and release_year is None:
         for item in dates:
-            release_date, release_year = parse_item(item)
+            release_date, release_month, release_year = parse_item(item)
             if release_year or release_date:
                 break
 
-    return release_date, release_year
+    return release_date, release_month, release_year
 
 def clean_doi(doi):
     """
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 54a529c5..29c608ee 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -170,41 +170,41 @@ def test_parse_datacite_dates():
     """
     Case = collections.namedtuple('Case', 'about input result')
     cases = [
-        Case('None is None', None, (None, None)),
-        Case('empty list is None', [], (None, None)),
-        Case('empty item is None', [{}], (None, None)),
-        Case('empty item is None', [{'date': '2019'}], (None, 2019)),
-        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)),
-        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)),
+        Case('None is None', None, (None, None, None)),
+        Case('empty list is None', [], (None, None, None)),
+        Case('empty item is None', [{}], (None, None, None)),
+        Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
+        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
         Case('first with type', [
             {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
-        ], (None, 2019)),
+        ], (None, None, 2019)),
         Case('full date', [
             {'date': '2019-12-01', 'dateType': 'Valid'},
-        ], (datetime.date(2019, 12, 1), 2019)),
+        ], (datetime.date(2019, 12, 1), None, 2019)),
         Case('date type prio', [
             {'date': '2000-12-01', 'dateType': 'Valid'},
             {'date': '2010-01-01', 'dateType': 'Updated'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('date type prio, Available > Updated', [
             {'date': '2010-01-01', 'dateType': 'Updated'},
             {'date': '2000-12-01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow fuzzy date formats, Available > Updated', [
             {'date': '2010', 'dateType': 'Updated'},
             {'date': '2000 Dec 01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('ignore broken date', [
             {'date': 'Febrrr 45', 'dateType': 'Updated'},
-        ], (None, None)),
+        ], (None, None, None)),
     ]
     for case in cases:
         result = parse_datacite_dates(case.input)
-- 
cgit v1.2.3


From 55dcece5a476b1492bf6c7f4597a469b48b41264 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 22:40:53 +0100
Subject: datacite: parse_datacite_dates returns month

As [...] we will soon add support for the release_month field in the release schema.
---
 python/fatcat_tools/importers/datacite.py | 45 ++++++++++++++++++++++++-------
 python/tests/import_datacite.py           | 23 +++++++++++-----
 2 files changed, 51 insertions(+), 17 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 45c8a421..5891f8de 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily.
 """
 
 from .common import EntityImporter, clean
+import collections
 import dateparser
 import datetime
 import fatcat_openapi_client
@@ -783,43 +784,68 @@ def parse_datacite_dates(dates):
         'Updated',
     )
 
+    # We need to note the granularity, since a string like "2019" would be
+    # parsed into "2019-01-01", even though the month is unknown. Use 3
+    # granularity types: 'y', 'm', 'd'.
+    Pattern = collections.namedtuple('Pattern', 'layout granularity')
+
     # Before using (expensive) dateparser, try a few common patterns.
-    common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
-                       '%Y-%m-%dT%H:%M:%S', '%Y')
+    common_patterns = (
+        Pattern('%Y-%m-%d', 'd'),
+        Pattern('%Y-%m', 'm'),
+        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
+        Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
+        Pattern('%Y', 'y'),
+    )
 
     def parse_item(item):
         result, value, year_only = None, item.get('date', ''), False
         release_date, release_month, release_year = None, None, None
 
-        for pattern in common_patterns:
+        for layout, granularity in common_patterns:
             try:
-                result = datetime.datetime.strptime(value, pattern)
+                result = datetime.datetime.strptime(value, layout)
             except ValueError:
                 continue
             else:
-                if pattern == '%Y':
+                if granularity == 'y':
                     year_only = True
                 break
 
         if result is None:
             print('fallback for {}'.format(value), file=sys.stderr)
+            parser = dateparser.DateDataParser()
             try:
-                result = dateparser.parse(value)
+                # Results in a dict with keys: date_obj, period, locale.
+                parse_result = parser.get_date_data(value)
+
+                # A datetime object, later we need a date, only.
+                result = parse_result['date_obj']
+                if result is not None:
+                    if parse_result['period'] == 'year':
+                        return None, None, result.year
+                    elif parse_result['period'] == 'month':
+                        return None, result.month, result.year
+                    else:
+                        return result.date(), result.month, result.year
             except TypeError as err:
                 print("{} date parsing failed with: {}".format(value, err),
                       file=sys.stderr)
-                return result_date, release_month, result_year
 
         if result is None:
             # Unparsable date.
             return release_date, release_month, release_year
 
-        if not year_only:
+        if granularity != 'y':
             release_date = result.date()
         release_year = result.year
+        if granularity in ('m', 'd'):
+            release_month = result.month
 
         return release_date, release_month, release_year
 
+    today = datetime.date.today()
+
     for prio in date_type_prio:
         for item in dates:
             if not item.get('dateType') == prio:
@@ -829,8 +855,7 @@ def parse_datacite_dates(dates):
             if release_date is None and release_year is None:
                 continue
 
-            if release_year < 1000 or release_year > datetime.date.today(
-            ).year + 5:
+            if release_year < 1000 or release_year > today.year + 5:
                 # Skip possibly bogus dates.
                 release_year = None
                 continue
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 29c608ee..c2fcdec9 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -173,7 +173,7 @@ def test_parse_datacite_dates():
         Case('None is None', None, (None, None, None)),
         Case('empty list is None', [], (None, None, None)),
         Case('empty item is None', [{}], (None, None, None)),
-        Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+        Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
         Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
         Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
         Case('first with type', [
@@ -181,27 +181,36 @@ def test_parse_datacite_dates():
         ], (None, None, 2019)),
         Case('full date', [
             {'date': '2019-12-01', 'dateType': 'Valid'},
-        ], (datetime.date(2019, 12, 1), None, 2019)),
+        ], (datetime.date(2019, 12, 1), 12, 2019)),
         Case('date type prio', [
             {'date': '2000-12-01', 'dateType': 'Valid'},
             {'date': '2010-01-01', 'dateType': 'Updated'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('date type prio, Available > Updated', [
             {'date': '2010-01-01', 'dateType': 'Updated'},
             {'date': '2000-12-01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow fuzzy date formats, Available > Updated', [
             {'date': '2010', 'dateType': 'Updated'},
             {'date': '2000 Dec 01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
+        Case('fuzzy year only', [
+            {'date': 'Year 2010', 'dateType': 'Issued'},
+        ], (None, None, 2010)),
+        Case('fuzzy year and month', [
+            {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
+        ], (None, 2, 2010)),
+        Case('fuzzy year, month, day', [
+            {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
+        ], (datetime.date(2010, 2, 24), 2, 2010)),
         Case('ignore broken date', [
             {'date': 'Febrrr 45', 'dateType': 'Updated'},
         ], (None, None, None)),
-- 
cgit v1.2.3


From 328d7901df30ba94685d34d6a428e798b4604839 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 22:53:23 +0100
Subject: datacite: use normal.clean_doi

---
 python/fatcat_tools/importers/datacite.py | 12 +-----------
 python/tests/import_datacite.py           |  4 ----
 2 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 5891f8de..d0c75b6e 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -20,6 +20,7 @@ import langdetect
 import sqlite3
 import sys
 from fatcat_tools.transforms import entity_to_dict
+from fatcat_tools.normal import clean_doi
 
 
 # Cutoff length for abstracts.
@@ -872,17 +873,6 @@ def parse_datacite_dates(dates):
 
     return release_date, release_month, release_year
 
-def clean_doi(doi):
-    """
-    10.25513/1812-3996.2017.1.34–42 // 8211, Hex 2013, Octal 20023
-    See also: https://github.com/miku/throwaway-check-doi
-
-    Replace unicode HYPHEN..HORIZONTAL BAR with HYPHEN-MINUS.
-    """
-    for c in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015'):
-        doi = doi.replace(c, "-")
-    return doi
-
 def index_form_to_display_name(s):
     """
     Try to convert an index form name, like 'Razis, Panos A' into display_name,
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index c2fcdec9..881452ed 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -281,10 +281,6 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.contribs[0].surname == None
         assert len(r.refs) == 0
 
-def test_clean_doi():
-    assert clean_doi("10.25513/1812-3996.2017.1.34\u201342") == "10.25513/1812-3996.2017.1.34-42"
-    assert "123" == clean_doi("123")
-
 def test_datacite_conversions(datacite_importer):
     """
     Datacite JSON to release entity JSON representation. The count is hardcoded
-- 
cgit v1.2.3


From e6feb6fd6d48f7b179389e79dfeb994d1b0f797b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 4 Jan 2020 00:19:56 +0100
Subject: datacite: always include "datacite" key in extra

> always include extra values for the respective DOI registrars
(datacite, crossref, jalc), even if they are empty ({}), to be used as a
flag so we know which DOI registrar supplied the metadata.
---
 python/fatcat_tools/importers/datacite.py           | 4 ++--
 python/tests/files/datacite/datacite_result_03.json | 4 ++--
 python/tests/files/datacite/datacite_result_04.json | 2 +-
 python/tests/files/datacite/datacite_result_11.json | 4 ++--
 python/tests/files/datacite/datacite_result_12.json | 4 ++--
 python/tests/files/datacite/datacite_result_13.json | 4 ++--
 python/tests/files/datacite/datacite_result_15.json | 4 ++--
 python/tests/files/datacite/datacite_result_17.json | 4 ++--
 python/tests/files/datacite/datacite_result_18.json | 4 ++--
 python/tests/files/datacite/datacite_result_19.json | 4 ++--
 python/tests/files/datacite/datacite_result_20.json | 4 ++--
 python/tests/files/datacite/datacite_result_21.json | 4 ++--
 python/tests/files/datacite/datacite_result_22.json | 4 ++--
 python/tests/files/datacite/datacite_result_23.json | 4 ++--
 python/tests/files/datacite/datacite_result_24.json | 2 +-
 15 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d0c75b6e..2fad1264 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -604,8 +604,8 @@ class DataciteImporter(EntityImporter):
         if not container_id and container_name:
             extra['container_name'] = container_name
 
-        if extra_datacite:
-            extra['datacite'] = extra_datacite
+        # Always include datacite key, even if value is empty (dict).
+        extra['datacite'] = extra_datacite
 
         extids = self.lookup_ext_ids(doi=doi)
 
diff --git a/python/tests/files/datacite/datacite_result_03.json b/python/tests/files/datacite/datacite_result_03.json
index 3e3c2bd5..e8367e8f 100644
--- a/python/tests/files/datacite/datacite_result_03.json
+++ b/python/tests/files/datacite/datacite_result_03.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "midterm ah30903",
     "release_type": "article",
     "release_year": 2016,
@@ -16,4 +16,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 94fa1f94..5b956836 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "On chain maps inducing isomorphisms in homology",
     "release_type": "article-journal",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json
index 037c5ac2..3045701f 100644
--- a/python/tests/files/datacite/datacite_result_11.json
+++ b/python/tests/files/datacite/datacite_result_11.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "N1 bei Safenwil",
     "release_type": "graphic",
     "release_stage": "published",
@@ -18,4 +18,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 192062e3..5dbcd8d0 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
     "release_type": "article-journal",
     "release_stage": "published",
@@ -41,4 +41,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index c8971667..2509f27e 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]",
     "release_type": "article-journal",
     "release_stage": "published",
@@ -25,4 +25,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json
index bdeb8426..1b430a7d 100644
--- a/python/tests/files/datacite/datacite_result_15.json
+++ b/python/tests/files/datacite/datacite_result_15.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997",
     "release_type": "dataset",
     "release_stage": "published",
@@ -19,4 +19,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_17.json b/python/tests/files/datacite/datacite_result_17.json
index 0852a09e..73b082d9 100644
--- a/python/tests/files/datacite/datacite_result_17.json
+++ b/python/tests/files/datacite/datacite_result_17.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "gel_BSA-FITC_Markov_segmntation0343.tif",
     "release_type": "dataset",
     "release_stage": "published",
@@ -17,4 +17,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index 274858c3..d0b53222 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
     "release_type": "article",
     "release_stage": "published",
@@ -12,4 +12,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 8d797268..55b43684 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
     "release_type": "article",
     "release_stage": "published",
@@ -12,4 +12,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index 97d7ae75..48063d9d 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "<h1>Eastern questionnaire</h1>",
     "release_type": "article",
     "release_stage": "published",
@@ -11,4 +11,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 0a05a7cd..99dcad1b 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
@@ -12,4 +12,4 @@
     "contribs": [],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index 9e4225b5..30d75a3d 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
@@ -19,4 +19,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index 46f60492..f79053df 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
@@ -19,4 +19,4 @@
     ],
     "refs": [],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index 42859275..a7fc59ba 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,5 +1,5 @@
 {
-    "extra": {},
+    "extra": {"datacite": {}},
     "title": "ABC",
     "subtitle": "DEF",
     "release_type": "article",
-- 
cgit v1.2.3


From b7675e276520c7ce595b9b5dbb10c02a42f5a9b1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Fri, 3 Jan 2020 14:38:29 -0800
Subject: pytest: explicitly indicate all in-scope test files

The purpose of this change is to test errors when pytest tries to
recursively update assertion statements in all dependent packages. The
reason pytest does this is to add pretty printing, which is nice, but
probably shouldn't be done in all dependency libraries.

This fixes test problems with both CSL (citeproc_styles) and dateparser
(when actually imported in code, which currently on master does not
happen).
---
 python/pytest.ini | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/pytest.ini b/python/pytest.ini
index 444333ea..069102b1 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -2,10 +2,8 @@
 
 ignore = setup.py
 
-python_paths = .
-
 # search for 'test_*' functions in all python files, not just under tests
-python_files = *.py
+python_files = test_*.py tests/*.py fatcat_tools/*.py fatcat_tools/*/*.py fatcat_web/*.py
 
 addopts = --pylint --pylint-rcfile=.pylintrc --pylint-error-types=EF --pylint-jobs=4
 
-- 
cgit v1.2.3


From 6fb2a2bda8b2e1e704075f18ef27a459cf6789c3 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 19:17:23 +0100
Subject: datacite: filter out 'Cites' relation as well

---
 python/fatcat_tools/importers/datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2fad1264..9a1e9935 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -560,7 +560,7 @@ class DataciteImporter(EntityImporter):
 
         relIds = attributes.get('relatedIdentifiers', []) or []
         for rel in relIds:
-            if not rel.get('relationType', '') == 'References':
+            if not rel.get('relationType', '') in ('References', 'Cites'):
                 continue
             ref_extra = dict()
             if rel.get('relatedIdentifierType', '') == 'DOI':
-- 
cgit v1.2.3


From 582e18d3b9b4599604cddacd526f9b81c1d117d4 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 19:37:45 +0100
Subject: datacite: clean abstract as well

---
 python/fatcat_tools/importers/datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 9a1e9935..c3d6138e 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -543,7 +543,7 @@ class DataciteImporter(EntityImporter):
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
                     mimetype="text/plain",
-                    content=text,
+                    content=clean(text),
                     lang=lang,
                 ))
 
-- 
cgit v1.2.3


From 3590cf0e06b6c4f1b1c9621a94c9567e398bca04 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 21:47:13 +0100
Subject: datacite: clean abstracts, use unknown value tokens

Datacite defines placeholders for unknown values:

* https://support.datacite.org/docs/schema-values-unknown-information-v43

Clean abstracts.
---
 python/fatcat_tools/importers/datacite.py          | 30 +++++++++++++++++++---
 .../tests/files/datacite/datacite_result_05.json   |  2 +-
 .../tests/files/datacite/datacite_result_08.json   |  2 +-
 .../tests/files/datacite/datacite_result_14.json   |  2 +-
 4 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index c3d6138e..f9d1b49a 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -125,6 +125,29 @@ DATACITE_TYPE_MAP = {
     }
 }
 
+# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
+DATACITE_UNKNOWN_MARKERS = (
+    '(:unac)',  # temporarily inaccessible
+    '(:unal)',  # unallowed, suppressed intentionally
+    '(:unap)',  # not applicable, makes no sense
+    '(:unas)',  # value unassigned (e.g., Untitled)
+    '(:unav)',  # value unavailable, possibly unknown
+    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue)
+    '(:none)',  # never had a value, never will
+    '(:null)',  # explicitly and meaningfully empty
+    '(:tba)',  # to be assigned or announced later
+    '(:etal)',  # too numerous to list (et alia)
+)
+
+# UNKNOWN_MARKERS joins the official datacite markers with generic tokens
+# marking unknown values.
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
+    'NA',
+    'NN',
+    'n.a.',
+    '[s.n.]',
+)))
+
 # TODO(martin): merge this with other maps, maybe.
 LICENSE_SLUG_MAP = {
     "//creativecommons.org/licenses/by/2.0/": "CC-BY",
@@ -326,7 +349,7 @@ class DataciteImporter(EntityImporter):
                 if raw_affiliation == '':
                     continue
 
-                if name in ('(:Unav)', 'NA', 'NN', '(:Null)'):
+                if name.lower() in UNKNOWN_MARKERS:
                     continue
 
                 # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
@@ -345,7 +368,7 @@ class DataciteImporter(EntityImporter):
                     ))
             elif nameType == 'Organizational':
                 name = c.get('name', '') or ''
-                if name == 'NN':
+                if name in UNKNOWN_MARKERS:
                     continue
                 if len(name) < 3:
                     continue
@@ -394,8 +417,7 @@ class DataciteImporter(EntityImporter):
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
-                         '(:none)', 'Unpublished'):
+        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
             publisher = None
             release_stage = None
         if publisher is not None and len(publisher) > 80:
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index ff998c0f..1840884e 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -523,7 +523,7 @@
     "refs": [],
     "abstracts": [
         {
-            "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+            "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
             "mimetype": "text/plain",
             "lang": "en"
         }
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index cc0e968b..46ef5b44 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -46,7 +46,7 @@
     "refs": [],
     "abstracts": [
         {
-            "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+            "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
             "mimetype": "text/plain",
             "lang": "en"
         }
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index 4521f891..c3719aeb 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -103,7 +103,7 @@
     "refs": [],
     "abstracts": [
         {
-            "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+            "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
             "mimetype": "text/plain",
             "lang": "en"
         }
-- 
cgit v1.2.3


From 171c4ae9f48984438e59bf521b3ec9dd78ce6d3d Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 22:25:26 +0100
Subject: datacite: indicate mismatched file in test

---
 python/tests/import_datacite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 881452ed..9ee479e8 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -297,7 +297,7 @@ def test_datacite_conversions(datacite_importer):
         with open(dst, 'r') as f:
            expected = json.loads(f.read())
 
-        assert result == expected
+        assert result == expected, 'output mismatch in {}'.format(dst)
 
 def test_index_form_to_display_name():
     Case = collections.namedtuple('Case', 'input output')
-- 
cgit v1.2.3


From ff37b97e4bbf642efbd830111fe3dbd45ae56dad Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 22:25:53 +0100
Subject: datacite: include month in extra

> include release_month as a top-level extra field [...] to
auto-populate the schema field from that
---
 python/fatcat_tools/importers/datacite.py           | 2 ++
 python/tests/files/datacite/datacite_result_00.json | 3 ++-
 python/tests/files/datacite/datacite_result_05.json | 3 ++-
 python/tests/files/datacite/datacite_result_12.json | 2 +-
 python/tests/files/datacite/datacite_result_13.json | 2 +-
 python/tests/files/datacite/datacite_result_18.json | 2 +-
 python/tests/files/datacite/datacite_result_19.json | 2 +-
 python/tests/files/datacite/datacite_result_20.json | 2 +-
 python/tests/files/datacite/datacite_result_21.json | 2 +-
 python/tests/files/datacite/datacite_result_22.json | 2 +-
 python/tests/files/datacite/datacite_result_23.json | 2 +-
 python/tests/files/datacite/datacite_result_24.json | 2 +-
 12 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f9d1b49a..a673f00b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -603,6 +603,8 @@ class DataciteImporter(EntityImporter):
             extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
+        if release_month:
+            extra_datacite['month'] = release_month
 
         # Include certain relations from relatedIdentifiers. Keeping the
         # original structure of data here, which is a list of dicts, with
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index a4b28076..ad917b92 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -2,6 +2,7 @@
     "extra": {
         "container_name": "Journal of Chemical Crystallography",
         "datacite": {
+            "month": 5,
             "license": [
                 {
                     "rightsUri": "http://www.springer.com/tdm"
@@ -84,4 +85,4 @@
         }
     ],
     "abstracts": []
-}
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 1840884e..cea2a25c 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -6,7 +6,8 @@
                     "rights": "Attribution-NonCommercial (CC BY-NC)",
                     "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
                 }
-            ]
+            ],
+            "month": 10
         }
     },
     "title": "SH409843.07FU",
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 5dbcd8d0..646299cf 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 6}},
     "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
     "release_type": "article-journal",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index 2509f27e..fea722c7 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 10}},
     "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]",
     "release_type": "article-journal",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index d0b53222..6599fe08 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 55b43684..5598ccee 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index 48063d9d..ec2dfc38 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "<h1>Eastern questionnaire</h1>",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 99dcad1b..b5e2207a 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index 30d75a3d..bd1290c2 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index f79053df..599d1b37 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index a7fc59ba..a3649867 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {}},
+    "extra": {"datacite": {"month": 8}},
     "title": "ABC",
     "subtitle": "DEF",
     "release_type": "article",
-- 
cgit v1.2.3


From d38dda53dd29024c8c855c64dfbb1529d0aaac83 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 22:30:20 +0100
Subject: datacite: month field should be top-level

---
 python/fatcat_tools/importers/datacite.py           | 4 ++--
 python/tests/files/datacite/datacite_result_00.json | 4 ++--
 python/tests/files/datacite/datacite_result_05.json | 6 +++---
 python/tests/files/datacite/datacite_result_12.json | 2 +-
 python/tests/files/datacite/datacite_result_13.json | 2 +-
 python/tests/files/datacite/datacite_result_18.json | 2 +-
 python/tests/files/datacite/datacite_result_19.json | 2 +-
 python/tests/files/datacite/datacite_result_20.json | 2 +-
 python/tests/files/datacite/datacite_result_21.json | 2 +-
 python/tests/files/datacite/datacite_result_22.json | 2 +-
 python/tests/files/datacite/datacite_result_23.json | 2 +-
 python/tests/files/datacite/datacite_result_24.json | 2 +-
 12 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a673f00b..1cee6db3 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -603,8 +603,6 @@ class DataciteImporter(EntityImporter):
             extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
-        if release_month:
-            extra_datacite['month'] = release_month
 
         # Include certain relations from relatedIdentifiers. Keeping the
         # original structure of data here, which is a list of dicts, with
@@ -630,6 +628,8 @@ class DataciteImporter(EntityImporter):
 
         # Always include datacite key, even if value is empty (dict).
         extra['datacite'] = extra_datacite
+        if release_month:
+            extra['month'] = release_month
 
         extids = self.lookup_ext_ids(doi=doi)
 
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index ad917b92..e76aa391 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -2,7 +2,6 @@
     "extra": {
         "container_name": "Journal of Chemical Crystallography",
         "datacite": {
-            "month": 5,
             "license": [
                 {
                     "rightsUri": "http://www.springer.com/tdm"
@@ -16,7 +15,8 @@
                     "relatedIdentifierType": "ISSN"
                 }
             ]
-        }
+        },
+        "month": 5
     },
     "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N\u2032-(4-nitrophenyl)thiourea",
     "release_type": "article-journal",
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index cea2a25c..1352fe29 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -6,9 +6,9 @@
                     "rights": "Attribution-NonCommercial (CC BY-NC)",
                     "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
                 }
-            ],
-            "month": 10
-        }
+            ]
+        },
+        "month": 10
     },
     "title": "SH409843.07FU",
     "subtitle": "Gomphales",
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 646299cf..c3a9071c 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 6}},
+    "extra": {"datacite": {}, "month": 6},
     "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
     "release_type": "article-journal",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index fea722c7..d6ed2985 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 10}},
+    "extra": {"datacite": {}, "month": 10},
     "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]",
     "release_type": "article-journal",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index 6599fe08..fb109de2 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 5598ccee..85bada92 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index ec2dfc38..891cb41e 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "<h1>Eastern questionnaire</h1>",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index b5e2207a..73df8216 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index bd1290c2..97f35da5 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index 599d1b37..93385c70 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "ABC",
     "release_type": "article",
     "release_stage": "published",
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index a3649867..cb08e67b 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,5 +1,5 @@
 {
-    "extra": {"datacite": {"month": 8}},
+    "extra": {"datacite": {}, "month": 8},
     "title": "ABC",
     "subtitle": "DEF",
     "release_type": "article",
-- 
cgit v1.2.3


From 5b796406a975b50afb88863691cdfaeca55fddcc Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Jan 2020 22:47:23 +0100
Subject: datacite: set release_stage to published by default

Set to `None` only if there is no publisher yet.

Docs: https://support.datacite.org/docs/doi-states
---
 python/fatcat_tools/importers/datacite.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 1cee6db3..936b6f1b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -409,10 +409,11 @@ class DataciteImporter(EntityImporter):
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
-        release_stage = None
-        if attributes.get(
-                'state') == 'findable' or attributes.get('isActive') is True:
-            release_stage = 'published'
+        release_stage = 'published'
+
+        # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
+        # we might want something else than 'published'. See also:
+        # https://support.datacite.org/docs/doi-states.
 
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
-- 
cgit v1.2.3


From d3a1382795d14ac77165fa6eb39e893b03b97215 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 7 Jan 2020 01:57:47 +0100
Subject: datacite: fix typos

---
 python/fatcat_import.py                   | 2 +-
 python/fatcat_tools/importers/datacite.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 6b04d547..ea7e12f2 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -175,7 +175,7 @@ def run_datacite(args):
         extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file)
     if args.kafka_mode:
-        KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",
+        KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite",
             "fatcat-import", consume_batch_size=args.batch_size).run()
     else:
         JsonLinePusher(dci, args.json_file).run()
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 936b6f1b..53f46bb4 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -699,7 +699,7 @@ class DataciteImporter(EntityImporter):
         if self.insert_log_file:
             with open(self.insert_log_file, 'a') as f:
                 for doc in batch:
-                    json.dump(entity_to_dict(re, api_client=None), f)
+                    json.dump(entity_to_dict(doc, api_client=None), f)
                     f.write('\n')
         self.api.create_release_auto_batch(
             fatcat_openapi_client.ReleaseAutoBatch(
-- 
cgit v1.2.3


From 3b531d2f83e9fde67e3c45d751fb80b2d9c815be Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 7 Jan 2020 01:58:39 +0100
Subject: datacite: apply pylint suggestions

---
 python/fatcat_tools/importers/datacite.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 53f46bb4..d7fbd269 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -8,20 +8,22 @@ problems in content and structure. A few fields habe their own parsing
 functions (parse_datacite_...), which can be tested more easily.
 """
 
-from .common import EntityImporter, clean
 import collections
-import dateparser
 import datetime
-import fatcat_openapi_client
 import hashlib
 import json
-import pycountry
-import langdetect
 import sqlite3
 import sys
-from fatcat_tools.transforms import entity_to_dict
+
+import dateparser
+import fatcat_openapi_client
+import langdetect
+import pycountry
+
 from fatcat_tools.normal import clean_doi
+from fatcat_tools.transforms import entity_to_dict
 
+from .common import EntityImporter, clean
 
 # Cutoff length for abstracts.
 MAX_ABSTRACT_LENGTH = 2048
@@ -309,7 +311,7 @@ class DataciteImporter(EntityImporter):
 
         for i, c in enumerate(attributes['creators']):
             nameType = c.get('nameType', '') or ''
-            if nameType == 'Personal' or nameType == '':
+            if nameType in ('', 'Personal'):
                 creator_id = None
                 for nid in c.get('nameIdentifiers', []):
                     name_scheme = nid.get('nameIdentifierScheme', '') or ''
@@ -493,7 +495,7 @@ class DataciteImporter(EntityImporter):
 
         if first_page and last_page:
             try:
-                int(first_page) < int(last_page)
+                _ = int(first_page) < int(last_page)
                 pages = '{}-{}'.format(first_page, last_page)
             except ValueError as err:
                 # TODO(martin): This is more debug than info.
-- 
cgit v1.2.3


From f9c711f77bba992e6e9e1d75929d35e8da828f61 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 7 Jan 2020 15:20:25 +0100
Subject: datacite: adding datacite-specific extra metadata

* attributes.metadataVersion
* attributes.schemaVersion
* attributes.version (source dependent values, follows suggestions in
https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=26,
but values vary)

Furthermore:

* attributes.types.resourceTypeGeneral
* attributes.types.resourceType
---
 python/fatcat_tools/importers/datacite.py          |   28 +
 python/tests/files/datacite/datacite_doc_20.json   |   77 +-
 python/tests/files/datacite/datacite_doc_21.json   |   77 +-
 python/tests/files/datacite/datacite_doc_22.json   |   81 +-
 python/tests/files/datacite/datacite_doc_23.json   |   81 +-
 python/tests/files/datacite/datacite_doc_24.json   |   89 +-
 .../tests/files/datacite/datacite_result_00.json   |  168 ++--
 .../tests/files/datacite/datacite_result_01.json   |   62 +-
 .../tests/files/datacite/datacite_result_02.json   |   70 +-
 .../tests/files/datacite/datacite_result_03.json   |   38 +-
 .../tests/files/datacite/datacite_result_04.json   |   61 +-
 .../tests/files/datacite/datacite_result_05.json   | 1060 ++++++++++----------
 .../tests/files/datacite/datacite_result_06.json   |   49 +-
 .../tests/files/datacite/datacite_result_07.json   |  128 +--
 .../tests/files/datacite/datacite_result_08.json   |   97 +-
 .../tests/files/datacite/datacite_result_09.json   |   69 +-
 .../tests/files/datacite/datacite_result_10.json   |   61 +-
 .../tests/files/datacite/datacite_result_11.json   |   44 +-
 .../tests/files/datacite/datacite_result_12.json   |   87 +-
 .../tests/files/datacite/datacite_result_13.json   |   58 +-
 .../tests/files/datacite/datacite_result_14.json   |  189 ++--
 .../tests/files/datacite/datacite_result_15.json   |   47 +-
 .../tests/files/datacite/datacite_result_16.json   |   59 +-
 .../tests/files/datacite/datacite_result_17.json   |   41 +-
 .../tests/files/datacite/datacite_result_18.json   |   30 +-
 .../tests/files/datacite/datacite_result_19.json   |   30 +-
 .../tests/files/datacite/datacite_result_20.json   |   27 +-
 .../tests/files/datacite/datacite_result_21.json   |   29 +-
 .../tests/files/datacite/datacite_result_22.json   |   43 +-
 .../tests/files/datacite/datacite_result_23.json   |   43 +-
 .../tests/files/datacite/datacite_result_24.json   |   43 +-
 31 files changed, 1598 insertions(+), 1468 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d7fbd269..c2725aeb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -607,6 +607,25 @@ class DataciteImporter(EntityImporter):
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
 
+        # Include version information.
+        metadata_version = attributes.get('metadataVersion') or ''
+        schema_version = attributes.get('schemaVersion') or ''
+
+        if metadata_version:
+            extra_datacite['metadataVersion'] = metadata_version
+        if schema_version:
+            extra_datacite['schemaVersion'] = schema_version
+
+        # Include resource types.
+        types = attributes.get('types', {}) or {}
+        resource_type = types.get('resourceType', '') or ''
+        resource_type_general = types.get('resourceTypeGeneral', '') or ''
+
+        if resource_type:
+            extra_datacite['resourceType'] = resource_type
+        if resource_type_general:
+            extra_datacite['resourceTypeGeneral'] = resource_type_general
+
         # Include certain relations from relatedIdentifiers. Keeping the
         # original structure of data here, which is a list of dicts, with
         # relation type, identifer and identifier type (mostly).
@@ -625,6 +644,14 @@ class DataciteImporter(EntityImporter):
 
         extra = dict()
 
+        # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0",
+        # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555",
+        # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st
+        # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
+        # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
+        # "10161", "10010691", "10780", # "Presentación"
+        version = attributes.get('version')
+
         # top-level extra keys
         if not container_id and container_name:
             extra['container_name'] = container_name
@@ -666,6 +693,7 @@ class DataciteImporter(EntityImporter):
             refs=refs,
             extra=extra,
             license_slug=license_slug,
+            version=version,
         )
         return re
 
diff --git a/python/tests/files/datacite/datacite_doc_20.json b/python/tests/files/datacite/datacite_doc_20.json
index 964e2cbb..cc6cc1fb 100644
--- a/python/tests/files/datacite/datacite_doc_20.json
+++ b/python/tests/files/datacite/datacite_doc_20.json
@@ -1,42 +1,41 @@
 {
-    "attributes": {
-      "doi": "10.7916/d86x0cg1",
-      "creators": [
-        {
-          "name": "(:Unav)",
-          "affiliation": [],
-          "nameIdentifiers": []
-        }
-      ],
-      "titles": [
-        {
-          "title": "<h1>Eastern questionnaire</h1>"
-        }
-      ],
-      "publicationYear": 2017,
-      "dates": [
-        {
-          "date": "2017-08-24",
-          "dateType": "Created"
-        },
-        {
-          "date": "2019-08-04",
-          "dateType": "Updated"
-        },
-        {
-          "date": "2017",
-          "dateType": "Issued"
-        }
-      ],
-      "language": null,
-      "types": {
-        "ris": "GEN",
-        "bibtex": "misc",
-        "citeproc": "article",
-        "schemaOrg": "CreativeWork"
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "(:Unav)",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "<h1>Eastern questionnaire</h1>"
+      }
+    ],
+    "publicationYear": 2017,
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
       },
-      "isActive": true,
-      "state": "findable"
-    }
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "language": null,
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "isActive": true,
+    "state": "findable"
   }
-  
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_doc_21.json b/python/tests/files/datacite/datacite_doc_21.json
index cae7f40f..04b196a6 100644
--- a/python/tests/files/datacite/datacite_doc_21.json
+++ b/python/tests/files/datacite/datacite_doc_21.json
@@ -1,42 +1,41 @@
 {
-    "attributes": {
-      "doi": "10.7916/d86x0cg1",
-      "creators": [
-        {
-          "name": "(:Unav)",
-          "affiliation": [],
-          "nameIdentifiers": []
-        }
-      ],
-      "titles": [
-        {
-          "title": "ABC"
-        }
-      ],
-      "publicationYear": 2017,
-      "language": "GERMAN",
-      "types": {
-        "ris": "GEN",
-        "bibtex": "misc",
-        "citeproc": "article",
-        "schemaOrg": "CreativeWork"
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "(:Unav)",
+        "affiliation": [],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "publicationYear": 2017,
+    "language": "GERMAN",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
       },
-      "dates": [
-        {
-          "date": "2017-08-24",
-          "dateType": "Created"
-        },
-        {
-          "date": "2019-08-04",
-          "dateType": "Updated"
-        },
-        {
-          "date": "2017",
-          "dateType": "Issued"
-        }
-      ],
-      "isActive": true,
-      "state": "findable"
-    }
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
   }
-  
\ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_doc_22.json b/python/tests/files/datacite/datacite_doc_22.json
index 42448ddf..365b1361 100644
--- a/python/tests/files/datacite/datacite_doc_22.json
+++ b/python/tests/files/datacite/datacite_doc_22.json
@@ -1,44 +1,43 @@
 {
-    "attributes": {
-      "doi": "10.7916/d86x0cg1",
-      "creators": [
-        {
-          "name": "Anton Welch",
-          "affiliation": [
-            "Department of pataphysics"
-          ],
-          "nameIdentifiers": []
-        }
-      ],
-      "titles": [
-        {
-          "title": "ABC"
-        }
-      ],
-      "publicationYear": 2017,
-      "language": "GERMAN",
-      "types": {
-        "ris": "GEN",
-        "bibtex": "misc",
-        "citeproc": "article",
-        "schemaOrg": "CreativeWork"
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "publicationYear": 2017,
+    "language": "GERMAN",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
       },
-      "dates": [
-        {
-          "date": "2017-08-24",
-          "dateType": "Created"
-        },
-        {
-          "date": "2019-08-04",
-          "dateType": "Updated"
-        },
-        {
-          "date": "2017",
-          "dateType": "Issued"
-        }
-      ],
-      "isActive": true,
-      "state": "findable"
-    }
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
   }
-
+}
diff --git a/python/tests/files/datacite/datacite_doc_23.json b/python/tests/files/datacite/datacite_doc_23.json
index 1e5bcc3f..1dcdfc27 100644
--- a/python/tests/files/datacite/datacite_doc_23.json
+++ b/python/tests/files/datacite/datacite_doc_23.json
@@ -1,44 +1,43 @@
 {
-    "attributes": {
-      "doi": "10.7916/d86x0cg1\u2013xxx",
-      "creators": [
-        {
-          "name": "Anton Welch",
-          "affiliation": [
-            "Department of pataphysics"
-          ],
-          "nameIdentifiers": []
-        }
-      ],
-      "titles": [
-        {
-          "title": "ABC"
-        }
-      ],
-      "publicationYear": 2017,
-      "language": "GERMAN",
-      "types": {
-        "ris": "GEN",
-        "bibtex": "misc",
-        "citeproc": "article",
-        "schemaOrg": "CreativeWork"
+  "attributes": {
+    "doi": "10.7916/d86x0cg1–xxx",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "publicationYear": 2017,
+    "language": "GERMAN",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
       },
-      "dates": [
-        {
-          "date": "2017-08-24",
-          "dateType": "Created"
-        },
-        {
-          "date": "2019-08-04",
-          "dateType": "Updated"
-        },
-        {
-          "date": "2017",
-          "dateType": "Issued"
-        }
-      ],
-      "isActive": true,
-      "state": "findable"
-    }
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
   }
-
+}
diff --git a/python/tests/files/datacite/datacite_doc_24.json b/python/tests/files/datacite/datacite_doc_24.json
index 6123350b..4ea6945f 100644
--- a/python/tests/files/datacite/datacite_doc_24.json
+++ b/python/tests/files/datacite/datacite_doc_24.json
@@ -1,48 +1,47 @@
 {
-    "attributes": {
-      "doi": "10.7916/d86x0cg1",
-      "creators": [
-        {
-          "name": "Anton Welch",
-          "affiliation": [
-            "Department of pataphysics"
-          ],
-          "nameIdentifiers": []
-        }
-      ],
-      "titles": [
-        {
-          "title": "ABC"
-        },
-        {
-          "title": "DEF",
-          "titleType": "Subtitle"
-        }
-      ],
-      "publicationYear": 2016,
-      "language": "DE-CH",
-      "types": {
-        "ris": "GEN",
-        "bibtex": "misc",
-        "citeproc": "article",
-        "schemaOrg": "CreativeWork"
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "ABC"
       },
-      "dates": [
-        {
-          "date": "2017-08-24",
-          "dateType": "Created"
-        },
-        {
-          "date": "2019-08-04",
-          "dateType": "Updated"
-        },
-        {
-          "date": "2017",
-          "dateType": "Issued"
-        }
-      ],
-      "isActive": true,
-      "state": "findable"
-    }
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "publicationYear": 2016,
+    "language": "DE-CH",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
   }
-
+}
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index e76aa391..28da5397 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -1,88 +1,92 @@
 {
-    "extra": {
-        "container_name": "Journal of Chemical Crystallography",
-        "datacite": {
-            "license": [
-                {
-                    "rightsUri": "http://www.springer.com/tdm"
-                }
-            ],
-            "relations": [
-                {
-                    "relationType": "IsPartOf",
-                    "relatedIdentifier": "1074-1542",
-                    "resourceTypeGeneral": "Collection",
-                    "relatedIdentifierType": "ISSN"
-                }
-            ]
-        },
-        "month": 5
-    },
-    "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N\u2032-(4-nitrophenyl)thiourea",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_date": "2019-05-31",
-    "release_year": 2019,
-    "ext_ids": {
-        "doi": "10.1007/s10870-008-9413-z"
-    },
-    "volume": "38",
-    "issue": "12",
-    "pages": "927-930",
-    "publisher": "Springer Science and Business Media LLC",
-    "contribs": [
+  "extra": {
+    "container_name": "Journal of Chemical Crystallography",
+    "datacite": {
+      "license": [
         {
-            "index": 0,
-            "raw_name": "Qian-Jin Li",
-            "given_name": "Qian-Jin",
-            "surname": "Li",
-            "role": "author"
-        },
-        {
-            "index": 1,
-            "raw_name": "Chun-Long Yang",
-            "given_name": "Chun-Long",
-            "surname": "Yang",
-            "role": "author"
+          "rightsUri": "http://www.springer.com/tdm"
         }
-    ],
-    "refs": [
-        {
-            "index": 0,
-            "extra": {
-                "doi": "10.1016/j.bmcl.2005.09.033"
-            }
-        },
-        {
-            "index": 1,
-            "extra": {
-                "doi": "10.1016/s0022-1139(02)00330-5"
-            }
-        },
-        {
-            "index": 2,
-            "extra": {
-                "doi": "10.1016/s0010-8545(01)00337-x"
-            }
-        },
-        {
-            "index": 3,
-            "extra": {
-                "doi": "10.1016/j.tetlet.2005.06.135"
-            }
-        },
-        {
-            "index": 4,
-            "extra": {
-                "doi": "10.1039/p298700000s1"
-            }
-        },
+      ],
+      "relations": [
         {
-            "index": 5,
-            "extra": {
-                "doi": "10.1002/anie.199515551"
-            }
+          "relationType": "IsPartOf",
+          "relatedIdentifier": "1074-1542",
+          "resourceTypeGeneral": "Collection",
+          "relatedIdentifierType": "ISSN"
         }
-    ],
-    "abstracts": []
+      ],
+      "resourceType": "JournalArticle",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-4",
+      "metadataVersion": 1
+    },
+    "month": 5
+  },
+  "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_date": "2019-05-31",
+  "release_year": 2019,
+  "ext_ids": {
+    "doi": "10.1007/s10870-008-9413-z"
+  },
+  "volume": "38",
+  "issue": "12",
+  "pages": "927-930",
+  "publisher": "Springer Science and Business Media LLC",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Qian-Jin Li",
+      "given_name": "Qian-Jin",
+      "surname": "Li",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "Chun-Long Yang",
+      "given_name": "Chun-Long",
+      "surname": "Yang",
+      "role": "author"
+    }
+  ],
+  "refs": [
+    {
+      "index": 0,
+      "extra": {
+        "doi": "10.1016/j.bmcl.2005.09.033"
+      }
+    },
+    {
+      "index": 1,
+      "extra": {
+        "doi": "10.1016/s0022-1139(02)00330-5"
+      }
+    },
+    {
+      "index": 2,
+      "extra": {
+        "doi": "10.1016/s0010-8545(01)00337-x"
+      }
+    },
+    {
+      "index": 3,
+      "extra": {
+        "doi": "10.1016/j.tetlet.2005.06.135"
+      }
+    },
+    {
+      "index": 4,
+      "extra": {
+        "doi": "10.1039/p298700000s1"
+      }
+    },
+    {
+      "index": 5,
+      "extra": {
+        "doi": "10.1002/anie.199515551"
+      }
+    }
+  ],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json
index 46be2515..956357b8 100644
--- a/python/tests/files/datacite/datacite_result_01.json
+++ b/python/tests/files/datacite/datacite_result_01.json
@@ -1,32 +1,36 @@
 {
-    "extra": {
-        "datacite": {
-            "license": [
-                {
-                    "lang": "de",
-                    "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html"
-                }
-            ]
-        }
-    },
-    "title": "Ferdinand Gaillard, [1]: n\u00e9 \u00e0 Paris le 16 janvier 1834, mort \u00e0 Paris le 19 janvier 1887",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_year": 1887,
-    "ext_ids": {
-        "doi": "10.11588/diglit.25558.39"
-    },
-    "publisher": "University Library Heidelberg",
-    "language": "fr",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "license": [
         {
-            "index": 0,
-            "raw_name": "G. Dargenty",
-            "given_name": "G.",
-            "surname": "Dargenty",
-            "role": "author"
+          "lang": "de",
+          "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "metadataVersion": 4,
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_year": 1887,
+  "ext_ids": {
+    "doi": "10.11588/diglit.25558.39"
+  },
+  "publisher": "University Library Heidelberg",
+  "language": "fr",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "G. Dargenty",
+      "given_name": "G.",
+      "surname": "Dargenty",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json
index bdcb4951..322baf59 100644
--- a/python/tests/files/datacite/datacite_result_02.json
+++ b/python/tests/files/datacite/datacite_result_02.json
@@ -1,36 +1,40 @@
 {
-    "extra": {
-        "datacite": {
-            "license": [
-                {
-                    "lang": "de",
-                    "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/"
-                },
-                {
-                    "lang": "en",
-                    "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/"
-                }
-            ]
-        }
-    },
-    "title": "Solinger Schwertschmiede-Familien, [4]",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_year": 1897,
-    "ext_ids": {
-        "doi": "10.11588/diglit.37715.57"
-    },
-    "publisher": "University Library Heidelberg",
-    "language": "de",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "license": [
+        {
+          "lang": "de",
+          "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/de/"
+        },
         {
-            "index": 0,
-            "raw_name": "Albert Weyersberg",
-            "given_name": "Albert",
-            "surname": "Weyersberg",
-            "role": "author"
+          "lang": "en",
+          "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "metadataVersion": 2,
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "title": "Solinger Schwertschmiede-Familien, [4]",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_year": 1897,
+  "ext_ids": {
+    "doi": "10.11588/diglit.37715.57"
+  },
+  "publisher": "University Library Heidelberg",
+  "language": "de",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Albert Weyersberg",
+      "given_name": "Albert",
+      "surname": "Weyersberg",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_03.json b/python/tests/files/datacite/datacite_result_03.json
index e8367e8f..41d8d4cd 100644
--- a/python/tests/files/datacite/datacite_result_03.json
+++ b/python/tests/files/datacite/datacite_result_03.json
@@ -1,19 +1,23 @@
 {
-    "extra": {"datacite": {}},
-    "title": "midterm ah30903",
-    "release_type": "article",
-    "release_year": 2016,
-    "ext_ids": {
-        "doi": "10.13140/rg.2.2.30434.53446"
-    },
-    "language": "ms",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Mastura Yahya",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "midterm ah30903",
+  "release_type": "article",
+  "release_year": 2016,
+  "ext_ids": {
+    "doi": "10.13140/rg.2.2.30434.53446"
+  },
+  "language": "ms",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Mastura Yahya",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 5b956836..0976e40e 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -1,29 +1,36 @@
 {
-    "extra": {"datacite": {}},
-    "title": "On chain maps inducing isomorphisms in homology",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_year": 1973,
-    "ext_ids": {
-        "doi": "10.14288/1.0080520"
-    },
-    "publisher": "University of British Columbia",
-    "language": "en",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Marc Andre Nicollerat",
-            "given_name": "Marc Andre",
-            "surname": "Nicollerat",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": [
-        {
-            "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) \u2192 K(I) and a natural transformation [formula omitted]  (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
-            "mimetype": "text/plain",
-            "lang": "en"
-        }
-    ]
+  "extra": {
+    "datacite": {
+      "metadataVersion": 5,
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "On chain maps inducing isomorphisms in homology",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_year": 1973,
+  "ext_ids": {
+    "doi": "10.14288/1.0080520"
+  },
+  "publisher": "University of British Columbia",
+  "language": "en",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Marc Andre Nicollerat",
+      "given_name": "Marc Andre",
+      "surname": "Nicollerat",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [
+    {
+      "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) → K(I) and a natural transformation [formula omitted]  (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
+      "mimetype": "text/plain",
+      "lang": "en"
+    }
+  ]
 }
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 1352fe29..961ad72a 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -1,532 +1,536 @@
 {
-    "extra": {
-        "datacite": {
-            "license": [
-                {
-                    "rights": "Attribution-NonCommercial (CC BY-NC)",
-                    "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
-                }
-            ]
-        },
-        "month": 10
-    },
-    "title": "SH409843.07FU",
-    "subtitle": "Gomphales",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_date": "2014-10-05",
-    "release_year": 2014,
-    "ext_ids": {
-        "doi": "10.15156/bio/sh409843.07fu"
-    },
-    "publisher": "UNITE Community",
-    "language": "en",
-    "license_slug": "CC-BY-NC",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "license": [
         {
-            "index": 0,
-            "raw_name": "Urmas K\u00f5ljalg",
-            "given_name": "Urmas",
-            "surname": "K\u00f5ljalg",
-            "role": "author"
-        },
-        {
-            "index": 1,
-            "raw_name": "Kessy Abarenkov",
-            "given_name": "Kessy",
-            "surname": "Abarenkov",
-            "role": "author"
-        },
-        {
-            "index": 2,
-            "raw_name": "R. Henrik Nilsson",
-            "given_name": "R. Henrik",
-            "surname": "Nilsson",
-            "role": "author"
-        },
-        {
-            "index": 3,
-            "raw_name": "Karl-Henrik Larsson",
-            "given_name": "Karl-Henrik",
-            "surname": "Larsson",
-            "role": "author"
-        },
-        {
-            "index": 4,
-            "raw_name": "Anders Bj\u00f8rnsgard Aas",
-            "given_name": "Anders Bj\u00f8rnsgard",
-            "surname": "Aas",
-            "role": "author"
-        },
-        {
-            "index": 5,
-            "raw_name": "Rachel Adams",
-            "given_name": "Rachel",
-            "surname": "Adams",
-            "role": "author"
-        },
-        {
-            "index": 6,
-            "raw_name": "Artur Alves",
-            "given_name": "Artur",
-            "surname": "Alves",
-            "role": "author"
-        },
-        {
-            "index": 7,
-            "raw_name": "Joseph F. Ammirati",
-            "given_name": "Joseph F.",
-            "surname": "Ammirati",
-            "role": "author"
-        },
-        {
-            "index": 8,
-            "raw_name": "A. Elizabeth Arnold",
-            "given_name": "A. Elizabeth",
-            "surname": "Arnold",
-            "role": "author"
-        },
-        {
-            "index": 9,
-            "raw_name": "Mohammad Bahram",
-            "given_name": "Mohammad",
-            "surname": "Bahram",
-            "role": "author"
-        },
-        {
-            "index": 10,
-            "raw_name": "Johan Bengtsson-Palme",
-            "given_name": "Johan",
-            "surname": "Bengtsson-Palme",
-            "role": "author"
-        },
-        {
-            "index": 11,
-            "raw_name": "Anna Berlin",
-            "given_name": "Anna",
-            "surname": "Berlin",
-            "role": "author"
-        },
-        {
-            "index": 12,
-            "raw_name": "Synn\u00f8ve Botnen",
-            "given_name": "Synn\u00f8ve",
-            "surname": "Botnen",
-            "role": "author"
-        },
-        {
-            "index": 13,
-            "raw_name": "Sarah Bourlat",
-            "given_name": "Sarah",
-            "surname": "Bourlat",
-            "role": "author"
-        },
-        {
-            "index": 14,
-            "raw_name": "Tanya Cheeke",
-            "given_name": "Tanya",
-            "surname": "Cheeke",
-            "role": "author"
-        },
-        {
-            "index": 15,
-            "raw_name": "B\u00e1lint Dima",
-            "given_name": "B\u00e1lint",
-            "surname": "Dima",
-            "role": "author"
-        },
-        {
-            "index": 16,
-            "raw_name": "Rein Drenkhan",
-            "given_name": "Rein",
-            "surname": "Drenkhan",
-            "role": "author"
-        },
-        {
-            "index": 17,
-            "raw_name": "Camila Duarte",
-            "given_name": "Camila",
-            "surname": "Duarte",
-            "role": "author"
-        },
-        {
-            "index": 18,
-            "raw_name": "Margarita Due\u00f1as",
-            "given_name": "Margarita",
-            "surname": "Due\u00f1as",
-            "role": "author"
-        },
-        {
-            "index": 19,
-            "raw_name": "Ursula Eberhardt",
-            "given_name": "Ursula",
-            "surname": "Eberhardt",
-            "role": "author"
-        },
-        {
-            "index": 20,
-            "raw_name": "Hanna Friberg",
-            "given_name": "Hanna",
-            "surname": "Friberg",
-            "role": "author"
-        },
-        {
-            "index": 21,
-            "raw_name": "Tobias G. Fr\u00f8slev",
-            "given_name": "Tobias G.",
-            "surname": "Fr\u00f8slev",
-            "role": "author"
-        },
-        {
-            "index": 22,
-            "raw_name": "Sigisfredo Garnica",
-            "given_name": "Sigisfredo",
-            "surname": "Garnica",
-            "role": "author"
-        },
-        {
-            "index": 23,
-            "raw_name": "J\u00f3zsef Geml",
-            "given_name": "J\u00f3zsef",
-            "surname": "Geml",
-            "role": "author"
-        },
-        {
-            "index": 24,
-            "raw_name": "Masoomeh Ghobad-Nejhad",
-            "given_name": "Masoomeh",
-            "surname": "Ghobad-Nejhad",
-            "role": "author"
-        },
-        {
-            "index": 25,
-            "raw_name": "Tine Grebenc",
-            "given_name": "Tine",
-            "surname": "Grebenc",
-            "role": "author"
-        },
-        {
-            "index": 26,
-            "raw_name": "Gareth W. Griffith",
-            "given_name": "Gareth W.",
-            "surname": "Griffith",
-            "role": "author"
-        },
-        {
-            "index": 27,
-            "raw_name": "Felix Hampe",
-            "given_name": "Felix",
-            "surname": "Hampe",
-            "role": "author"
-        },
-        {
-            "index": 28,
-            "raw_name": "Peter Kennedy",
-            "given_name": "Peter",
-            "surname": "Kennedy",
-            "role": "author"
-        },
-        {
-            "index": 29,
-            "raw_name": "Maryia Khomich",
-            "given_name": "Maryia",
-            "surname": "Khomich",
-            "role": "author"
-        },
-        {
-            "index": 30,
-            "raw_name": "Petr Kohout",
-            "given_name": "Petr",
-            "surname": "Kohout",
-            "role": "author"
-        },
-        {
-            "index": 31,
-            "raw_name": "Anu Kollom",
-            "given_name": "Anu",
-            "surname": "Kollom",
-            "role": "author"
-        },
-        {
-            "index": 32,
-            "raw_name": "Ellen Larsson",
-            "given_name": "Ellen",
-            "surname": "Larsson",
-            "role": "author"
-        },
-        {
-            "index": 33,
-            "raw_name": "Irinyi Laszlo",
-            "given_name": "Irinyi",
-            "surname": "Laszlo",
-            "role": "author"
-        },
-        {
-            "index": 34,
-            "raw_name": "Steven Leavitt",
-            "given_name": "Steven",
-            "surname": "Leavitt",
-            "role": "author"
-        },
-        {
-            "index": 35,
-            "raw_name": "Kare Liimatainen",
-            "given_name": "Kare",
-            "surname": "Liimatainen",
-            "role": "author"
-        },
-        {
-            "index": 36,
-            "raw_name": "Bj\u00f6rn Lindahl",
-            "given_name": "Bj\u00f6rn",
-            "surname": "Lindahl",
-            "role": "author"
-        },
-        {
-            "index": 37,
-            "raw_name": "Deborah J. Lodge",
-            "given_name": "Deborah J.",
-            "surname": "Lodge",
-            "role": "author"
-        },
-        {
-            "index": 38,
-            "raw_name": "Helge Thorsten Lumbsch",
-            "given_name": "Helge Thorsten",
-            "surname": "Lumbsch",
-            "role": "author"
-        },
-        {
-            "index": 39,
-            "raw_name": "Mar\u00eda Paz Mart\u00edn Esteban",
-            "given_name": "Mar\u00eda Paz",
-            "surname": "Mart\u00edn Esteban",
-            "role": "author"
-        },
-        {
-            "index": 40,
-            "raw_name": "Wieland Meyer",
-            "given_name": "Wieland",
-            "surname": "Meyer",
-            "role": "author"
-        },
-        {
-            "index": 41,
-            "raw_name": "Otto Miettinen",
-            "given_name": "Otto",
-            "surname": "Miettinen",
-            "role": "author"
-        },
-        {
-            "index": 42,
-            "raw_name": "Nhu Nguyen",
-            "given_name": "Nhu",
-            "surname": "Nguyen",
-            "role": "author"
-        },
-        {
-            "index": 43,
-            "raw_name": "Tuula Niskanen",
-            "given_name": "Tuula",
-            "surname": "Niskanen",
-            "role": "author"
-        },
-        {
-            "index": 44,
-            "raw_name": "Ryoko Oono",
-            "given_name": "Ryoko",
-            "surname": "Oono",
-            "role": "author"
-        },
-        {
-            "index": 45,
-            "raw_name": "Maarja \u00d6pik",
-            "given_name": "Maarja",
-            "surname": "\u00d6pik",
-            "role": "author"
-        },
-        {
-            "index": 46,
-            "raw_name": "Alexander Ordynets",
-            "given_name": "Alexander",
-            "surname": "Ordynets",
-            "role": "author"
-        },
-        {
-            "index": 47,
-            "raw_name": "Julia Paw\u0142owska",
-            "given_name": "Julia",
-            "surname": "Paw\u0142owska",
-            "role": "author"
-        },
-        {
-            "index": 48,
-            "raw_name": "Ursula Peintner",
-            "given_name": "Ursula",
-            "surname": "Peintner",
-            "role": "author"
-        },
-        {
-            "index": 49,
-            "raw_name": "Olinto Liparini Pereira",
-            "given_name": "Olinto Liparini",
-            "surname": "Pereira",
-            "role": "author"
-        },
-        {
-            "index": 50,
-            "raw_name": "Danilo Batista Pinho",
-            "given_name": "Danilo Batista",
-            "surname": "Pinho",
-            "role": "author"
-        },
-        {
-            "index": 51,
-            "raw_name": "Kadri P\u00f5ldmaa",
-            "given_name": "Kadri",
-            "surname": "P\u00f5ldmaa",
-            "role": "author"
-        },
-        {
-            "index": 52,
-            "raw_name": "Kadri Runnel",
-            "given_name": "Kadri",
-            "surname": "Runnel",
-            "role": "author"
-        },
-        {
-            "index": 53,
-            "raw_name": "Martin Ryberg",
-            "given_name": "Martin",
-            "surname": "Ryberg",
-            "role": "author"
-        },
-        {
-            "index": 54,
-            "raw_name": "Irja Saar",
-            "given_name": "Irja",
-            "surname": "Saar",
-            "role": "author"
-        },
-        {
-            "index": 55,
-            "raw_name": "Kemal Sanli",
-            "given_name": "Kemal",
-            "surname": "Sanli",
-            "role": "author"
-        },
-        {
-            "index": 56,
-            "raw_name": "James Scott",
-            "given_name": "James",
-            "surname": "Scott",
-            "role": "author"
-        },
-        {
-            "index": 57,
-            "raw_name": "Viacheslav Spirin",
-            "given_name": "Viacheslav",
-            "surname": "Spirin",
-            "role": "author"
-        },
-        {
-            "index": 58,
-            "raw_name": "Ave Suija",
-            "given_name": "Ave",
-            "surname": "Suija",
-            "role": "author"
-        },
-        {
-            "index": 59,
-            "raw_name": "Sten Svantesson",
-            "given_name": "Sten",
-            "surname": "Svantesson",
-            "role": "author"
-        },
-        {
-            "index": 60,
-            "raw_name": "Mariusz Tadych",
-            "given_name": "Mariusz",
-            "surname": "Tadych",
-            "role": "author"
-        },
-        {
-            "index": 61,
-            "raw_name": "Susumu Takamatsu",
-            "given_name": "Susumu",
-            "surname": "Takamatsu",
-            "role": "author"
-        },
-        {
-            "index": 62,
-            "raw_name": "Heidi Tamm",
-            "given_name": "Heidi",
-            "surname": "Tamm",
-            "role": "author"
-        },
-        {
-            "index": 63,
-            "raw_name": "AFS. Taylor",
-            "given_name": "AFS.",
-            "surname": "Taylor",
-            "role": "author"
-        },
-        {
-            "index": 64,
-            "raw_name": "Leho Tedersoo",
-            "given_name": "Leho",
-            "surname": "Tedersoo",
-            "role": "author"
-        },
-        {
-            "index": 65,
-            "raw_name": "M.T. Telleria",
-            "given_name": "M.T.",
-            "surname": "Telleria",
-            "role": "author"
-        },
-        {
-            "index": 66,
-            "raw_name": "Dhanushka Udayanga",
-            "given_name": "Dhanushka",
-            "surname": "Udayanga",
-            "role": "author"
-        },
-        {
-            "index": 67,
-            "raw_name": "Martin Unterseher",
-            "given_name": "Martin",
-            "surname": "Unterseher",
-            "role": "author"
-        },
-        {
-            "index": 68,
-            "raw_name": "Sergey Volobuev",
-            "given_name": "Sergey",
-            "surname": "Volobuev",
-            "role": "author"
-        },
-        {
-            "index": 69,
-            "raw_name": "Michael Weiss",
-            "given_name": "Michael",
-            "surname": "Weiss",
-            "role": "author"
-        },
-        {
-            "index": 70,
-            "raw_name": "Christian Wurzbacher",
-            "given_name": "Christian",
-            "surname": "Wurzbacher",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": [
-        {
-            "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
-            "mimetype": "text/plain",
-            "lang": "en"
+          "rights": "Attribution-NonCommercial (CC BY-NC)",
+          "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
         }
-    ]
+      ],
+      "metadataVersion": 1,
+      "resourceType": "Dataset/UNITE Species Hypothesis",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    },
+    "month": 10
+  },
+  "title": "SH409843.07FU",
+  "subtitle": "Gomphales",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_date": "2014-10-05",
+  "release_year": 2014,
+  "ext_ids": {
+    "doi": "10.15156/bio/sh409843.07fu"
+  },
+  "publisher": "UNITE Community",
+  "language": "en",
+  "license_slug": "CC-BY-NC",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Urmas Kõljalg",
+      "given_name": "Urmas",
+      "surname": "Kõljalg",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "Kessy Abarenkov",
+      "given_name": "Kessy",
+      "surname": "Abarenkov",
+      "role": "author"
+    },
+    {
+      "index": 2,
+      "raw_name": "R. Henrik Nilsson",
+      "given_name": "R. Henrik",
+      "surname": "Nilsson",
+      "role": "author"
+    },
+    {
+      "index": 3,
+      "raw_name": "Karl-Henrik Larsson",
+      "given_name": "Karl-Henrik",
+      "surname": "Larsson",
+      "role": "author"
+    },
+    {
+      "index": 4,
+      "raw_name": "Anders Bjørnsgard Aas",
+      "given_name": "Anders Bjørnsgard",
+      "surname": "Aas",
+      "role": "author"
+    },
+    {
+      "index": 5,
+      "raw_name": "Rachel Adams",
+      "given_name": "Rachel",
+      "surname": "Adams",
+      "role": "author"
+    },
+    {
+      "index": 6,
+      "raw_name": "Artur Alves",
+      "given_name": "Artur",
+      "surname": "Alves",
+      "role": "author"
+    },
+    {
+      "index": 7,
+      "raw_name": "Joseph F. Ammirati",
+      "given_name": "Joseph F.",
+      "surname": "Ammirati",
+      "role": "author"
+    },
+    {
+      "index": 8,
+      "raw_name": "A. Elizabeth Arnold",
+      "given_name": "A. Elizabeth",
+      "surname": "Arnold",
+      "role": "author"
+    },
+    {
+      "index": 9,
+      "raw_name": "Mohammad Bahram",
+      "given_name": "Mohammad",
+      "surname": "Bahram",
+      "role": "author"
+    },
+    {
+      "index": 10,
+      "raw_name": "Johan Bengtsson-Palme",
+      "given_name": "Johan",
+      "surname": "Bengtsson-Palme",
+      "role": "author"
+    },
+    {
+      "index": 11,
+      "raw_name": "Anna Berlin",
+      "given_name": "Anna",
+      "surname": "Berlin",
+      "role": "author"
+    },
+    {
+      "index": 12,
+      "raw_name": "Synnøve Botnen",
+      "given_name": "Synnøve",
+      "surname": "Botnen",
+      "role": "author"
+    },
+    {
+      "index": 13,
+      "raw_name": "Sarah Bourlat",
+      "given_name": "Sarah",
+      "surname": "Bourlat",
+      "role": "author"
+    },
+    {
+      "index": 14,
+      "raw_name": "Tanya Cheeke",
+      "given_name": "Tanya",
+      "surname": "Cheeke",
+      "role": "author"
+    },
+    {
+      "index": 15,
+      "raw_name": "Bálint Dima",
+      "given_name": "Bálint",
+      "surname": "Dima",
+      "role": "author"
+    },
+    {
+      "index": 16,
+      "raw_name": "Rein Drenkhan",
+      "given_name": "Rein",
+      "surname": "Drenkhan",
+      "role": "author"
+    },
+    {
+      "index": 17,
+      "raw_name": "Camila Duarte",
+      "given_name": "Camila",
+      "surname": "Duarte",
+      "role": "author"
+    },
+    {
+      "index": 18,
+      "raw_name": "Margarita Dueñas",
+      "given_name": "Margarita",
+      "surname": "Dueñas",
+      "role": "author"
+    },
+    {
+      "index": 19,
+      "raw_name": "Ursula Eberhardt",
+      "given_name": "Ursula",
+      "surname": "Eberhardt",
+      "role": "author"
+    },
+    {
+      "index": 20,
+      "raw_name": "Hanna Friberg",
+      "given_name": "Hanna",
+      "surname": "Friberg",
+      "role": "author"
+    },
+    {
+      "index": 21,
+      "raw_name": "Tobias G. Frøslev",
+      "given_name": "Tobias G.",
+      "surname": "Frøslev",
+      "role": "author"
+    },
+    {
+      "index": 22,
+      "raw_name": "Sigisfredo Garnica",
+      "given_name": "Sigisfredo",
+      "surname": "Garnica",
+      "role": "author"
+    },
+    {
+      "index": 23,
+      "raw_name": "József Geml",
+      "given_name": "József",
+      "surname": "Geml",
+      "role": "author"
+    },
+    {
+      "index": 24,
+      "raw_name": "Masoomeh Ghobad-Nejhad",
+      "given_name": "Masoomeh",
+      "surname": "Ghobad-Nejhad",
+      "role": "author"
+    },
+    {
+      "index": 25,
+      "raw_name": "Tine Grebenc",
+      "given_name": "Tine",
+      "surname": "Grebenc",
+      "role": "author"
+    },
+    {
+      "index": 26,
+      "raw_name": "Gareth W. Griffith",
+      "given_name": "Gareth W.",
+      "surname": "Griffith",
+      "role": "author"
+    },
+    {
+      "index": 27,
+      "raw_name": "Felix Hampe",
+      "given_name": "Felix",
+      "surname": "Hampe",
+      "role": "author"
+    },
+    {
+      "index": 28,
+      "raw_name": "Peter Kennedy",
+      "given_name": "Peter",
+      "surname": "Kennedy",
+      "role": "author"
+    },
+    {
+      "index": 29,
+      "raw_name": "Maryia Khomich",
+      "given_name": "Maryia",
+      "surname": "Khomich",
+      "role": "author"
+    },
+    {
+      "index": 30,
+      "raw_name": "Petr Kohout",
+      "given_name": "Petr",
+      "surname": "Kohout",
+      "role": "author"
+    },
+    {
+      "index": 31,
+      "raw_name": "Anu Kollom",
+      "given_name": "Anu",
+      "surname": "Kollom",
+      "role": "author"
+    },
+    {
+      "index": 32,
+      "raw_name": "Ellen Larsson",
+      "given_name": "Ellen",
+      "surname": "Larsson",
+      "role": "author"
+    },
+    {
+      "index": 33,
+      "raw_name": "Irinyi Laszlo",
+      "given_name": "Irinyi",
+      "surname": "Laszlo",
+      "role": "author"
+    },
+    {
+      "index": 34,
+      "raw_name": "Steven Leavitt",
+      "given_name": "Steven",
+      "surname": "Leavitt",
+      "role": "author"
+    },
+    {
+      "index": 35,
+      "raw_name": "Kare Liimatainen",
+      "given_name": "Kare",
+      "surname": "Liimatainen",
+      "role": "author"
+    },
+    {
+      "index": 36,
+      "raw_name": "Björn Lindahl",
+      "given_name": "Björn",
+      "surname": "Lindahl",
+      "role": "author"
+    },
+    {
+      "index": 37,
+      "raw_name": "Deborah J. Lodge",
+      "given_name": "Deborah J.",
+      "surname": "Lodge",
+      "role": "author"
+    },
+    {
+      "index": 38,
+      "raw_name": "Helge Thorsten Lumbsch",
+      "given_name": "Helge Thorsten",
+      "surname": "Lumbsch",
+      "role": "author"
+    },
+    {
+      "index": 39,
+      "raw_name": "María Paz Martín Esteban",
+      "given_name": "María Paz",
+      "surname": "Martín Esteban",
+      "role": "author"
+    },
+    {
+      "index": 40,
+      "raw_name": "Wieland Meyer",
+      "given_name": "Wieland",
+      "surname": "Meyer",
+      "role": "author"
+    },
+    {
+      "index": 41,
+      "raw_name": "Otto Miettinen",
+      "given_name": "Otto",
+      "surname": "Miettinen",
+      "role": "author"
+    },
+    {
+      "index": 42,
+      "raw_name": "Nhu Nguyen",
+      "given_name": "Nhu",
+      "surname": "Nguyen",
+      "role": "author"
+    },
+    {
+      "index": 43,
+      "raw_name": "Tuula Niskanen",
+      "given_name": "Tuula",
+      "surname": "Niskanen",
+      "role": "author"
+    },
+    {
+      "index": 44,
+      "raw_name": "Ryoko Oono",
+      "given_name": "Ryoko",
+      "surname": "Oono",
+      "role": "author"
+    },
+    {
+      "index": 45,
+      "raw_name": "Maarja Öpik",
+      "given_name": "Maarja",
+      "surname": "Öpik",
+      "role": "author"
+    },
+    {
+      "index": 46,
+      "raw_name": "Alexander Ordynets",
+      "given_name": "Alexander",
+      "surname": "Ordynets",
+      "role": "author"
+    },
+    {
+      "index": 47,
+      "raw_name": "Julia Pawłowska",
+      "given_name": "Julia",
+      "surname": "Pawłowska",
+      "role": "author"
+    },
+    {
+      "index": 48,
+      "raw_name": "Ursula Peintner",
+      "given_name": "Ursula",
+      "surname": "Peintner",
+      "role": "author"
+    },
+    {
+      "index": 49,
+      "raw_name": "Olinto Liparini Pereira",
+      "given_name": "Olinto Liparini",
+      "surname": "Pereira",
+      "role": "author"
+    },
+    {
+      "index": 50,
+      "raw_name": "Danilo Batista Pinho",
+      "given_name": "Danilo Batista",
+      "surname": "Pinho",
+      "role": "author"
+    },
+    {
+      "index": 51,
+      "raw_name": "Kadri Põldmaa",
+      "given_name": "Kadri",
+      "surname": "Põldmaa",
+      "role": "author"
+    },
+    {
+      "index": 52,
+      "raw_name": "Kadri Runnel",
+      "given_name": "Kadri",
+      "surname": "Runnel",
+      "role": "author"
+    },
+    {
+      "index": 53,
+      "raw_name": "Martin Ryberg",
+      "given_name": "Martin",
+      "surname": "Ryberg",
+      "role": "author"
+    },
+    {
+      "index": 54,
+      "raw_name": "Irja Saar",
+      "given_name": "Irja",
+      "surname": "Saar",
+      "role": "author"
+    },
+    {
+      "index": 55,
+      "raw_name": "Kemal Sanli",
+      "given_name": "Kemal",
+      "surname": "Sanli",
+      "role": "author"
+    },
+    {
+      "index": 56,
+      "raw_name": "James Scott",
+      "given_name": "James",
+      "surname": "Scott",
+      "role": "author"
+    },
+    {
+      "index": 57,
+      "raw_name": "Viacheslav Spirin",
+      "given_name": "Viacheslav",
+      "surname": "Spirin",
+      "role": "author"
+    },
+    {
+      "index": 58,
+      "raw_name": "Ave Suija",
+      "given_name": "Ave",
+      "surname": "Suija",
+      "role": "author"
+    },
+    {
+      "index": 59,
+      "raw_name": "Sten Svantesson",
+      "given_name": "Sten",
+      "surname": "Svantesson",
+      "role": "author"
+    },
+    {
+      "index": 60,
+      "raw_name": "Mariusz Tadych",
+      "given_name": "Mariusz",
+      "surname": "Tadych",
+      "role": "author"
+    },
+    {
+      "index": 61,
+      "raw_name": "Susumu Takamatsu",
+      "given_name": "Susumu",
+      "surname": "Takamatsu",
+      "role": "author"
+    },
+    {
+      "index": 62,
+      "raw_name": "Heidi Tamm",
+      "given_name": "Heidi",
+      "surname": "Tamm",
+      "role": "author"
+    },
+    {
+      "index": 63,
+      "raw_name": "AFS. Taylor",
+      "given_name": "AFS.",
+      "surname": "Taylor",
+      "role": "author"
+    },
+    {
+      "index": 64,
+      "raw_name": "Leho Tedersoo",
+      "given_name": "Leho",
+      "surname": "Tedersoo",
+      "role": "author"
+    },
+    {
+      "index": 65,
+      "raw_name": "M.T. Telleria",
+      "given_name": "M.T.",
+      "surname": "Telleria",
+      "role": "author"
+    },
+    {
+      "index": 66,
+      "raw_name": "Dhanushka Udayanga",
+      "given_name": "Dhanushka",
+      "surname": "Udayanga",
+      "role": "author"
+    },
+    {
+      "index": 67,
+      "raw_name": "Martin Unterseher",
+      "given_name": "Martin",
+      "surname": "Unterseher",
+      "role": "author"
+    },
+    {
+      "index": 68,
+      "raw_name": "Sergey Volobuev",
+      "given_name": "Sergey",
+      "surname": "Volobuev",
+      "role": "author"
+    },
+    {
+      "index": 69,
+      "raw_name": "Michael Weiss",
+      "given_name": "Michael",
+      "surname": "Weiss",
+      "role": "author"
+    },
+    {
+      "index": 70,
+      "raw_name": "Christian Wurzbacher",
+      "given_name": "Christian",
+      "surname": "Wurzbacher",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [
+    {
+      "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+      "mimetype": "text/plain",
+      "lang": "en"
+    }
+  ]
 }
diff --git a/python/tests/files/datacite/datacite_result_06.json b/python/tests/files/datacite/datacite_result_06.json
index 61f2549d..18880100 100644
--- a/python/tests/files/datacite/datacite_result_06.json
+++ b/python/tests/files/datacite/datacite_result_06.json
@@ -1,26 +1,29 @@
 {
-    "extra": {
-        "datacite": {
-            "license": [
-                {
-                    "rights": "ETH-Bibliothek Z\u00fcrich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0"
-                }
-            ]
-        }
-    },
-    "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"",
-    "release_type": "article",
-    "release_year": 1590,
-    "ext_ids": {
-        "doi": "10.16903/ethz-grs-d_006220"
-    },
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "license": [
         {
-            "index": 0,
-            "raw_name": "Crispijn De Passe (Der \u00c4ltere) (1564-1637)",
-            "role": "author"
+          "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "metadataVersion": 1,
+      "resourceTypeGeneral": "InteractiveResource",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"",
+  "release_type": "article",
+  "release_year": 1590,
+  "ext_ids": {
+    "doi": "10.16903/ethz-grs-d_006220"
+  },
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Crispijn De Passe (Der Ältere) (1564-1637)",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
index f694ddef..23b63d50 100644
--- a/python/tests/files/datacite/datacite_result_07.json
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -1,74 +1,76 @@
 {
-    "extra": {
-        "datacite": {
-            "subjects": [
-                {
-                    "subject": "HEAT PUMP"
-                },
-                {
-                    "subject": "HOT WATER"
-                },
-                {
-                    "subject": "HEAT TRANSFER"
-                },
-                {
-                    "subject": "PERFORMANCE"
-                },
-                {
-                    "subject": "THERMAL STORAGE"
-                },
-                {
-                    "subject": "TANK"
-                },
-                {
-                    "subject": "MODEL"
-                }
-            ]
-        }
-    },
-    "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation.",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2015,
-    "ext_ids": {
-        "doi": "10.18462/iir.icr.2015.0926"
-    },
-    "publisher": "International Institute of Refrigeration (IIR)",
-    "language": "en",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "subjects": [
         {
-            "index": 0,
-            "raw_name": "E. ROTHUIZEN",
-            "given_name": "E.",
-            "surname": "ROTHUIZEN",
-            "role": "author"
+          "subject": "HEAT PUMP"
         },
         {
-            "index": 1,
-            "raw_name": "B. ELMEGAARD",
-            "given_name": "B.",
-            "surname": "ELMEGAARD",
-            "role": "author"
+          "subject": "HOT WATER"
         },
         {
-            "index": 2,
-            "raw_name": "B. MARKUSSEN W.",
-            "given_name": "B.",
-            "surname": "MARKUSSEN W.",
-            "role": "author"
+          "subject": "HEAT TRANSFER"
         },
         {
-            "index": 3,
-            "raw_name": "Et Al.",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": [
+          "subject": "PERFORMANCE"
+        },
+        {
+          "subject": "THERMAL STORAGE"
+        },
+        {
+          "subject": "TANK"
+        },
         {
-            "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
-            "mimetype": "text/plain",
-            "lang": "en"
+          "subject": "MODEL"
         }
-    ]
+      ],
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset"
+    }
+  },
+  "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation.",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2015,
+  "ext_ids": {
+    "doi": "10.18462/iir.icr.2015.0926"
+  },
+  "publisher": "International Institute of Refrigeration (IIR)",
+  "language": "en",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "E. ROTHUIZEN",
+      "given_name": "E.",
+      "surname": "ROTHUIZEN",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "B. ELMEGAARD",
+      "given_name": "B.",
+      "surname": "ELMEGAARD",
+      "role": "author"
+    },
+    {
+      "index": 2,
+      "raw_name": "B. MARKUSSEN W.",
+      "given_name": "B.",
+      "surname": "MARKUSSEN W.",
+      "role": "author"
+    },
+    {
+      "index": 3,
+      "raw_name": "Et Al.",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [
+    {
+      "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
+      "mimetype": "text/plain",
+      "lang": "en"
+    }
+  ]
 }
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 46ef5b44..ff942d0a 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -1,54 +1,57 @@
 {
-    "extra": {
-        "datacite": {
-            "subjects": [
-                {
-                    "subject": "Land Economics/Use"
-                },
-                {
-                    "subject": "irrigation",
-                    "subjectScheme": "keyword"
-                },
-                {
-                    "subject": "industrialization",
-                    "subjectScheme": "keyword"
-                },
-                {
-                    "subject": "collective action",
-                    "subjectScheme": "keyword"
-                }
-            ]
-        }
-    },
-    "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India",
-    "release_type": "article-journal",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.22004/ag.econ.284864"
-    },
-    "language": "en",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "subjects": [
         {
-            "index": 0,
-            "raw_name": "Kei Kajisa",
-            "given_name": "Kei",
-            "surname": "Kajisa",
-            "role": "author"
+          "subject": "Land Economics/Use"
         },
         {
-            "index": 1,
-            "raw_name": "Kei Kajisa",
-            "given_name": "Kei",
-            "surname": "Kajisa",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": [
+          "subject": "irrigation",
+          "subjectScheme": "keyword"
+        },
+        {
+          "subject": "industrialization",
+          "subjectScheme": "keyword"
+        },
         {
-            "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
-            "mimetype": "text/plain",
-            "lang": "en"
+          "subject": "collective action",
+          "subjectScheme": "keyword"
         }
-    ]
+      ],
+      "metadataVersion": 1,
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text"
+    }
+  },
+  "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India",
+  "release_type": "article-journal",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.22004/ag.econ.284864"
+  },
+  "language": "en",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Kei Kajisa",
+      "given_name": "Kei",
+      "surname": "Kajisa",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "Kei Kajisa",
+      "given_name": "Kei",
+      "surname": "Kajisa",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [
+    {
+      "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+      "mimetype": "text/plain",
+      "lang": "en"
+    }
+  ]
 }
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index db103d2b..fd873309 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -1,35 +1,40 @@
 {
-    "extra": {
-        "datacite": {
-            "subjects": [
-                {
-                    "subject": "Direktdiodenlasersysteme"
-                },
-                {
-                    "subject": "Physics",
-                    "subjectScheme": "linsearch"
-                }
-            ]
-        }
-    },
-    "title": "BrightLas : TP3.3. Module f\u00fcr Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im F\u00f6rderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht",
-    "release_type": "report",
-    "release_stage": "published",
-    "release_year": 2016,
-    "ext_ids": {
-        "doi": "10.2314/gbv:880813733"
-    },
-    "publisher": "[Lumics GmbH]",
-    "language": "de",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "subjects": [
+        {
+          "subject": "Direktdiodenlasersysteme"
+        },
         {
-            "index": 0,
-            "raw_name": "Nils Kirstaedter",
-            "given_name": "Nils",
-            "surname": "Kirstaedter",
-            "role": "author"
+          "subject": "Physics",
+          "subjectScheme": "linsearch"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "metadataVersion": 9,
+      "resourceType": "Report",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht",
+  "release_type": "report",
+  "release_stage": "published",
+  "release_year": 2016,
+  "ext_ids": {
+    "doi": "10.2314/gbv:880813733"
+  },
+  "publisher": "[Lumics GmbH]",
+  "language": "de",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Nils Kirstaedter",
+      "given_name": "Nils",
+      "surname": "Kirstaedter",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [],
+  "version": "1.0"
+}
diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json
index 325facf7..8dea8957 100644
--- a/python/tests/files/datacite/datacite_result_10.json
+++ b/python/tests/files/datacite/datacite_result_10.json
@@ -1,32 +1,35 @@
 {
-    "extra": {
-        "datacite": {
-            "subjects": [
-                {
-                    "subject": "housing areas"
-                },
-                {
-                    "subject": "Dwellings"
-                }
-            ]
-        }
-    },
-    "title": "WPA household census for 210 E VERNON, Los Angeles",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2012,
-    "ext_ids": {
-        "doi": "10.25549/wpacards-m6171"
-    },
-    "publisher": "University of Southern California Digital Library (USC.DL)",
-    "language": "en",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "subjects": [
+        {
+          "subject": "housing areas"
+        },
         {
-            "index": 0,
-            "raw_name": "Unknown",
-            "role": "author"
+          "subject": "Dwellings"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "title": "WPA household census for 210 E VERNON, Los Angeles",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2012,
+  "ext_ids": {
+    "doi": "10.25549/wpacards-m6171"
+  },
+  "publisher": "University of Southern California Digital Library (USC.DL)",
+  "language": "en",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Unknown",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json
index 3045701f..944ca718 100644
--- a/python/tests/files/datacite/datacite_result_11.json
+++ b/python/tests/files/datacite/datacite_result_11.json
@@ -1,21 +1,27 @@
 {
-    "extra": {"datacite": {}},
-    "title": "N1 bei Safenwil",
-    "release_type": "graphic",
-    "release_stage": "published",
-    "release_year": 1965,
-    "ext_ids": {
-        "doi": "10.3932/ethz-a-000055869"
-    },
-    "publisher": "ETH-Bibliothek Z\u00fcrich, Bildarchiv",
-    "language": "de",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Comet Photo AG (Z\u00fcrich)",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {
+      "metadataVersion": 6,
+      "resourceTypeGeneral": "Image",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "N1 bei Safenwil",
+  "release_type": "graphic",
+  "release_stage": "published",
+  "release_year": 1965,
+  "ext_ids": {
+    "doi": "10.3932/ethz-a-000055869"
+  },
+  "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
+  "language": "de",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Comet Photo AG (Zürich)",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index c3a9071c..5e2a6281 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -1,44 +1,49 @@
 {
-    "extra": {"datacite": {}, "month": 6},
-    "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_date": "2019-06-14",
-    "release_year": 2019,
-    "ext_ids": {
-        "doi": "10.5167/uzh-171449"
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "Text"
     },
-    "publisher": "MDPI Publishing",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Charalampos Spanias",
-            "given_name": "Charalampos",
-            "surname": "Spanias",
-            "role": "author"
-        },
-        {
-            "index": 1,
-            "raw_name": "Pantelis T Nikolaidis",
-            "given_name": "Pantelis T",
-            "surname": "Nikolaidis",
-            "role": "author"
-        },
-        {
-            "index": 2,
-            "raw_name": "Thomas Rosemann",
-            "given_name": "Thomas",
-            "surname": "Rosemann",
-            "role": "author"
-        },
-        {
-            "index": 3,
-            "raw_name": "Beat Knechtle",
-            "given_name": "Beat",
-            "surname": "Knechtle",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+    "month": 6
+  },
+  "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_date": "2019-06-14",
+  "release_year": 2019,
+  "ext_ids": {
+    "doi": "10.5167/uzh-171449"
+  },
+  "publisher": "MDPI Publishing",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Charalampos Spanias",
+      "given_name": "Charalampos",
+      "surname": "Spanias",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "Pantelis T Nikolaidis",
+      "given_name": "Pantelis T",
+      "surname": "Nikolaidis",
+      "role": "author"
+    },
+    {
+      "index": 2,
+      "raw_name": "Thomas Rosemann",
+      "given_name": "Thomas",
+      "surname": "Rosemann",
+      "role": "author"
+    },
+    {
+      "index": 3,
+      "raw_name": "Beat Knechtle",
+      "given_name": "Beat",
+      "surname": "Knechtle",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index d6ed2985..3dc7cafb 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -1,28 +1,36 @@
 {
-    "extra": {"datacite": {}, "month": 10},
-    "title": "[M\u00fcssen wir des Gl\u00fccks uns sch\u00e4men?]",
-    "release_type": "article-journal",
-    "release_stage": "published",
-    "release_date": "1940-10-05",
-    "release_year": 1940,
-    "ext_ids": {
-        "doi": "10.5169/seals-314104"
+  "extra": {
+    "datacite": {
+      "metadataVersion": 17,
+      "resourceType": "Journal Article",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "publisher": "Buchdruckerei B\u00fcchler & Co.",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "O.M.",
-            "role": "author"
-        },
-        {
-            "index": 1,
-            "raw_name": "Hermann Hiltbrunner",
-            "given_name": "Hermann",
-            "surname": "Hiltbrunner",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+    "month": 10
+  },
+  "title": "[Müssen wir des Glücks uns schämen?]",
+  "release_type": "article-journal",
+  "release_stage": "published",
+  "release_date": "1940-10-05",
+  "release_year": 1940,
+  "ext_ids": {
+    "doi": "10.5169/seals-314104"
+  },
+  "publisher": "Buchdruckerei Büchler & Co.",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "O.M.",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "Hermann Hiltbrunner",
+      "given_name": "Hermann",
+      "surname": "Hiltbrunner",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index c3719aeb..e28ee5c3 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -1,111 +1,114 @@
 {
-    "extra": {
-        "datacite": {
-            "subjects": [
-                {
-                    "subject": "Crystal Structure"
-                },
-                {
-                    "subject": "Experimental 3D Coordinates"
-                },
-                {
-                    "subject": "Crystal System"
-                },
-                {
-                    "subject": "Space Group"
-                },
-                {
-                    "subject": "Cell Parameters"
-                },
-                {
-                    "subject": "Crystallography"
-                },
-                {
-                    "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
-                }
-            ],
-            "relations": [
-                {
-                    "relationType": "IsSupplementTo",
-                    "relatedIdentifier": "10.1021/ic034699w",
-                    "relatedIdentifierType": "DOI"
-                }
-            ]
-        }
-    },
-    "title": "CCDC 222635: Experimental Crystal Structure Determination",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2004,
-    "ext_ids": {
-        "doi": "10.5517/cc7gns3"
-    },
-    "publisher": "Cambridge Crystallographic Data Centre",
-    "language": "en",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "subjects": [
         {
-            "index": 0,
-            "raw_name": "E. Stulz",
-            "given_name": "E.",
-            "surname": "Stulz",
-            "role": "author"
+          "subject": "Crystal Structure"
         },
         {
-            "index": 1,
-            "raw_name": "S.M. Scott",
-            "given_name": "S.M.",
-            "surname": "Scott",
-            "role": "author"
+          "subject": "Experimental 3D Coordinates"
         },
         {
-            "index": 2,
-            "raw_name": "Yiu-Fai Ng",
-            "given_name": "Yiu-Fai",
-            "surname": "Ng",
-            "role": "author"
+          "subject": "Crystal System"
         },
         {
-            "index": 3,
-            "raw_name": "A.D. Bond",
-            "given_name": "A.D.",
-            "surname": "Bond",
-            "role": "author"
+          "subject": "Space Group"
         },
         {
-            "index": 4,
-            "raw_name": "S.J. Teat",
-            "given_name": "S.J.",
-            "surname": "Teat",
-            "role": "author"
+          "subject": "Cell Parameters"
         },
         {
-            "index": 5,
-            "raw_name": "S.L. Darling",
-            "given_name": "S.L.",
-            "surname": "Darling",
-            "role": "author"
+          "subject": "Crystallography"
         },
         {
-            "index": 6,
-            "raw_name": "N. Feeder",
-            "given_name": "N.",
-            "surname": "Feeder",
-            "role": "author"
-        },
-        {
-            "index": 7,
-            "raw_name": "J.K.M. Sanders",
-            "given_name": "J.K.M.",
-            "surname": "Sanders",
-            "role": "author"
+          "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
         }
-    ],
-    "refs": [],
-    "abstracts": [
+      ],
+      "relations": [
         {
-            "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
-            "mimetype": "text/plain",
-            "lang": "en"
+          "relationType": "IsSupplementTo",
+          "relatedIdentifier": "10.1021/ic034699w",
+          "relatedIdentifierType": "DOI"
         }
-    ]
+      ],
+      "metadataVersion": 2,
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "CCDC 222635: Experimental Crystal Structure Determination",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2004,
+  "ext_ids": {
+    "doi": "10.5517/cc7gns3"
+  },
+  "publisher": "Cambridge Crystallographic Data Centre",
+  "language": "en",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "E. Stulz",
+      "given_name": "E.",
+      "surname": "Stulz",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "raw_name": "S.M. Scott",
+      "given_name": "S.M.",
+      "surname": "Scott",
+      "role": "author"
+    },
+    {
+      "index": 2,
+      "raw_name": "Yiu-Fai Ng",
+      "given_name": "Yiu-Fai",
+      "surname": "Ng",
+      "role": "author"
+    },
+    {
+      "index": 3,
+      "raw_name": "A.D. Bond",
+      "given_name": "A.D.",
+      "surname": "Bond",
+      "role": "author"
+    },
+    {
+      "index": 4,
+      "raw_name": "S.J. Teat",
+      "given_name": "S.J.",
+      "surname": "Teat",
+      "role": "author"
+    },
+    {
+      "index": 5,
+      "raw_name": "S.L. Darling",
+      "given_name": "S.L.",
+      "surname": "Darling",
+      "role": "author"
+    },
+    {
+      "index": 6,
+      "raw_name": "N. Feeder",
+      "given_name": "N.",
+      "surname": "Feeder",
+      "role": "author"
+    },
+    {
+      "index": 7,
+      "raw_name": "J.K.M. Sanders",
+      "given_name": "J.K.M.",
+      "surname": "Sanders",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": [
+    {
+      "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+      "mimetype": "text/plain",
+      "lang": "en"
+    }
+  ]
 }
diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json
index 1b430a7d..3a03dfb6 100644
--- a/python/tests/files/datacite/datacite_result_15.json
+++ b/python/tests/files/datacite/datacite_result_15.json
@@ -1,22 +1,29 @@
 {
-    "extra": {"datacite": {}},
-    "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28"
-    },
-    "publisher": "Environmental Data Initiative",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "David Richardson",
-            "given_name": "David",
-            "surname": "Richardson",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {
+      "metadataVersion": 1,
+      "resourceType": "dataPackage",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-2.2"
+    }
+  },
+  "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28"
+  },
+  "publisher": "Environmental Data Initiative",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "David Richardson",
+      "given_name": "David",
+      "surname": "Richardson",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
index ea8c2e59..8cf762b6 100644
--- a/python/tests/files/datacite/datacite_result_16.json
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -1,31 +1,34 @@
 {
-    "extra": {
-        "datacite": {
-            "license": [
-                {
-                    "rights": "CC-BY",
-                    "rightsUri": "http://creativecommons.org/licenses/by/3.0/us"
-                }
-            ]
-        }
-    },
-    "title": "Testing the Connectivity of Networks",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2014,
-    "ext_ids": {
-        "doi": "10.6084/m9.figshare.1282478"
-    },
-    "publisher": "Figshare",
-    "contribs": [
+  "extra": {
+    "datacite": {
+      "license": [
         {
-            "index": 0,
-            "raw_name": "Taha Sochi",
-            "given_name": "Taha",
-            "surname": "Sochi",
-            "role": "author"
+          "rights": "CC-BY",
+          "rightsUri": "http://creativecommons.org/licenses/by/3.0/us"
         }
-    ],
-    "refs": [],
-    "abstracts": []
-}
\ No newline at end of file
+      ],
+      "resourceType": "Paper",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "title": "Testing the Connectivity of Networks",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2014,
+  "ext_ids": {
+    "doi": "10.6084/m9.figshare.1282478"
+  },
+  "publisher": "Figshare",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Taha Sochi",
+      "given_name": "Taha",
+      "surname": "Sochi",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/files/datacite/datacite_result_17.json b/python/tests/files/datacite/datacite_result_17.json
index 73b082d9..6e8c4e34 100644
--- a/python/tests/files/datacite/datacite_result_17.json
+++ b/python/tests/files/datacite/datacite_result_17.json
@@ -1,20 +1,25 @@
 {
-    "extra": {"datacite": {}},
-    "title": "gel_BSA-FITC_Markov_segmntation0343.tif",
-    "release_type": "dataset",
-    "release_stage": "published",
-    "release_year": 2018,
-    "ext_ids": {
-        "doi": "10.7910/dvn/tsqfwc/yytj22"
-    },
-    "publisher": "Harvard Dataverse",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Di Giovanna, Antonino Paolo (University Of Florence)",
-            "role": "author"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "title": "gel_BSA-FITC_Markov_segmntation0343.tif",
+  "release_type": "dataset",
+  "release_stage": "published",
+  "release_year": 2018,
+  "ext_ids": {
+    "doi": "10.7910/dvn/tsqfwc/yytj22"
+  },
+  "publisher": "Harvard Dataverse",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Di Giovanna, Antonino Paolo (University Of Florence)",
+      "role": "author"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index fb109de2..43b46923 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -1,15 +1,21 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-21",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d81z522m"
+  "extra": {
+    "datacite": {
+      "metadataVersion": 2,
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "publisher": "Columbia University",
-    "contribs": [],
-    "refs": [],
-    "abstracts": []
+    "month": 8
+  },
+  "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-21",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d81z522m"
+  },
+  "publisher": "Columbia University",
+  "contribs": [],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 85bada92..8b91efe5 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -1,15 +1,21 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1"
+  "extra": {
+    "datacite": {
+      "metadataVersion": 3,
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "publisher": "Columbia University",
-    "contribs": [],
-    "refs": [],
-    "abstracts": []
+    "month": 8
+  },
+  "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "publisher": "Columbia University",
+  "contribs": [],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index 891cb41e..ed1f8885 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,14 +1,17 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "<h1>Eastern questionnaire</h1>",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1"
-    },
-    "contribs": [],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "<h1>Eastern questionnaire</h1>",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "contribs": [],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 73df8216..1230abfa 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,15 +1,18 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "ABC",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1"
-    },
-    "language": "de",
-    "contribs": [],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "ABC",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "language": "de",
+  "contribs": [],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index 97f35da5..cba01531 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,22 +1,25 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "ABC",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1"
-    },
-    "language": "de",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Anton Welch",
-            "role": "author",
-            "raw_affiliation": "Department of pataphysics"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "ABC",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "language": "de",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index 93385c70..db622e1c 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,22 +1,25 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "ABC",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1-xxx"
-    },
-    "language": "de",
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Anton Welch",
-            "role": "author",
-            "raw_affiliation": "Department of pataphysics"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "ABC",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1-xxx"
+  },
+  "language": "de",
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index cb08e67b..8338cf29 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,22 +1,25 @@
 {
-    "extra": {"datacite": {}, "month": 8},
-    "title": "ABC",
-    "subtitle": "DEF",
-    "release_type": "article",
-    "release_stage": "published",
-    "release_date": "2017-08-24",
-    "release_year": 2017,
-    "ext_ids": {
-        "doi": "10.7916/d86x0cg1"
-    },
-    "contribs": [
-        {
-            "index": 0,
-            "raw_name": "Anton Welch",
-            "role": "author",
-            "raw_affiliation": "Department of pataphysics"
-        }
-    ],
-    "refs": [],
-    "abstracts": []
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "ABC",
+  "subtitle": "DEF",
+  "release_type": "article",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
 }
-- 
cgit v1.2.3


From 06da78e2360f803b60fd9a0e28932d825c0a0019 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 02:31:46 +0100
Subject: datacite: fill a few more release_type gaps

* citeproc: http://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types
* resourceTypeGeneral: https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
* resourceType: uncontrolled, over 170000 distinct values, frequent:
null, Dataset, JournalArticle, PGRFA Material, Journal Article,
Dataset/UNITE Species Hypothesis, ...

General frequency:

* "attributes.types": 18210075,
* "attributes.types.ris": 18058890,
* "attributes.types.bibtex": 18058888,
* "attributes.types.citeproc": 18058890,
* "attributes.types.schemaOrg": 18058929,
* "attributes.types.resourceType": 12737988,
* "attributes.types.resourceTypeGeneral": 16576139,
---
 python/fatcat_tools/importers/datacite.py | 35 ++++++++++++++++---------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index c2725aeb..4996fbed 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -41,28 +41,28 @@ CONTAINER_TYPE_MAP = {
 DATACITE_TYPE_MAP = {
     'ris': {
         'THES': 'thesis',
-        'SOUND': None,
+        'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report)
         'CHAP': 'chapter',
-        'FIGURE': None,
+        'FIGURE': 'figure',
         'RPRT': 'report',
         'JOUR': 'article-journal',
-        'MPCT': None,
-        'GEN': None,
+        'MPCT': 'motion_picture',
+        'GEN': 'article-journal', # GEN consists of 99% article and report, post-weblog, misc - and one dataset
         'BOOK': 'book',
         'DATA': 'dataset',
-        'COMP': None,
+        'COMP': 'software',
     },
     'schemaOrg': {
         'Dataset': 'dataset',
         'Book': 'book',
-        'ScholarlyArticle': 'article',
+        'ScholarlyArticle': 'article-journal',
         'ImageObject': 'graphic',
         'Collection': None,
         'MediaObject': None,
         'Event': None,
-        'SoftwareSourceCode': None,
+        'SoftwareSourceCode': 'software',
         'Chapter': 'chapter',
-        'CreativeWork': None,
+        'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
         'PublicationIssue': 'article',
         'AudioObject': None,
         'Thesis': 'thesis',
@@ -112,19 +112,19 @@ DATACITE_TYPE_MAP = {
         'book': 'book',
     },
     'resourceTypeGeneral': {
-        'Image': None,
+        'Image': 'graphic',
         'Dataset': 'dataset',
         'PhysicalObject': None,
         'Collection': None,
-        'Text': None,
+        'Text': None, # "Grey literature, lab notes, accompanying materials"
         'Sound': None,
         'InteractiveResource': None,
         'Event': None,
-        'Software': None,
+        'Software': 'software',
         'Other': None,
         'Workflow': None,
         'Audiovisual': None,
-    }
+    } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
 }
 
 # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
@@ -516,11 +516,12 @@ class DataciteImporter(EntityImporter):
             license_extra.append(l)
 
         # Release type. Try to determine the release type from a variety of
-        # types supplied in datacite. The "attributes.types.resourceType"
-        # contains too many (176 in sample) things for now; citeproc may be the
-        # closest, but not always supplied.
-        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
-                         'bibtex', 'ris'):
+        # types supplied in datacite. The "attributes.types.resourceType" is
+        # uncontrolled (170000+ unique values, from "null", "Dataset" to
+        # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
+        # flows in 2009"); citeproc may be the closest, but not always supplied.
+        # Order lookup roughly by completeness of mapping.
+        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
             value = attributes.get('types', {}).get(typeType)
             release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
-- 
cgit v1.2.3


From 6499e2911386f3f5e82a589c71da4003043bfc72 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:01:27 +0100
Subject: datacite: over 3% records have the same title: stub

The GBIF (https://www.gbif.org/) deposits most records under the titles:

* 599243 GBIF Occurrence Download
* 41176 Occurrence Download

Mark them as "stub" for the moment
(https://guide.fatcat.wiki/entity_release.html#release_type-vocabulary).
---
 python/fatcat_tools/importers/datacite.py | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4996fbed..52fede06 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -530,6 +530,13 @@ class DataciteImporter(EntityImporter):
         if release_type is None:
             print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
 
+        # release_type exception: Global Biodiversity Information Facility
+        # publishes highly interesting datasets, but titles are mostly the same
+        # ("GBIF Occurrence Download" or "Occurrence Download"); set
+        # release_type to "stub" (CSL/FC).
+        if publisher == 'The Global Biodiversity Information Facility':
+            release_type = 'stub'
+
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
         # like langcodes solves a pretty boring problem. At one level, that's
-- 
cgit v1.2.3


From 21e5cb620f7c8cb14f0c9d72b0504eeb7ff31977 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:06:29 +0100
Subject: datacite: ignore certain names

---
 python/fatcat_tools/importers/datacite.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 52fede06..fe98d62a 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -309,6 +309,9 @@ class DataciteImporter(EntityImporter):
         # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
         contribs = []
 
+        # Names that should be ignored right away.
+        name_blacklist = set(('Occdownload Gbif.Org',))
+
         for i, c in enumerate(attributes['creators']):
             nameType = c.get('nameType', '') or ''
             if nameType in ('', 'Personal'):
@@ -339,6 +342,9 @@ class DataciteImporter(EntityImporter):
                 if name:
                     name = clean(name)
 
+                if name in name_blacklist:
+                    continue
+
                 if given_name:
                     given_name = clean(given_name)
 
-- 
cgit v1.2.3


From 7d3ccb1c079f619ce664a984eef4f295294dd741 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:27:05 +0100
Subject: datacite: use more specific release_type, if possible

---
 python/fatcat_tools/importers/datacite.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fe98d62a..58dfc556 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -613,6 +613,12 @@ class DataciteImporter(EntityImporter):
                 ))
             ref_index += 1
 
+        # More specific release_type via 'Reviews' relationship.
+        for rel in relIds:
+            if rel.get('relatedIdentifierType', '') != 'Reviews':
+                continue
+            release_type = 'review'
+
         # Extra information.
         extra_datacite = dict()
 
-- 
cgit v1.2.3


From a23f73e37cd88de5467c47aa5f84b96448c5713d Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:35:41 +0100
Subject: datacite: CCDC are entries, mostly

---
 python/fatcat_tools/importers/datacite.py           | 4 ++++
 python/tests/files/datacite/datacite_result_14.json | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 58dfc556..587a65aa 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -543,6 +543,10 @@ class DataciteImporter(EntityImporter):
         if publisher == 'The Global Biodiversity Information Facility':
             release_type = 'stub'
 
+        # release_type exception: lots of "Experimental Crystal Structure Determination"
+        if publisher == 'Cambridge Crystallographic Data Centre':
+            release_type = 'entry'
+
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
         # like langcodes solves a pretty boring problem. At one level, that's
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index e28ee5c3..20f6bfd4 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -37,7 +37,7 @@
     }
   },
   "title": "CCDC 222635: Experimental Crystal Structure Determination",
-  "release_type": "dataset",
+  "release_type": "entry",
   "release_stage": "published",
   "release_year": 2004,
   "ext_ids": {
-- 
cgit v1.2.3


From a7e5460d6355dd0e99b08e480d4e50755fda3b16 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:47:10 +0100
Subject: datacite: mark additional files as stub

---
 python/fatcat_tools/importers/datacite.py          |  4 ++
 python/tests/files/datacite/datacite_doc_25.json   | 47 ++++++++++++++++++++++
 .../tests/files/datacite/datacite_result_25.json   | 25 ++++++++++++
 python/tests/import_datacite.py                    |  2 +-
 4 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 python/tests/files/datacite/datacite_doc_25.json
 create mode 100644 python/tests/files/datacite/datacite_result_25.json

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 587a65aa..90bc3db7 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -547,6 +547,10 @@ class DataciteImporter(EntityImporter):
         if publisher == 'Cambridge Crystallographic Data Centre':
             release_type = 'entry'
 
+        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
+        if title.lower().startswith('additional file'):
+            release_type = 'stub'
+
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
         # like langcodes solves a pretty boring problem. At one level, that's
diff --git a/python/tests/files/datacite/datacite_doc_25.json b/python/tests/files/datacite/datacite_doc_25.json
new file mode 100644
index 00000000..60cd0ab7
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_25.json
@@ -0,0 +1,47 @@
+{
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "titles": [
+      {
+        "title": "Additional file 123: ABC"
+      },
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "publicationYear": 2016,
+    "language": "DE-CH",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_25.json b/python/tests/files/datacite/datacite_result_25.json
new file mode 100644
index 00000000..8a370bbb
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_25.json
@@ -0,0 +1,25 @@
+{
+  "extra": {
+    "datacite": {},
+    "month": 8
+  },
+  "title": "Additional file 123: ABC",
+  "subtitle": "DEF",
+  "release_type": "stub",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 9ee479e8..7293ecac 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(25):
+    for i in range(26):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
         print('testing mapping from {} => {}'.format(src, dst))
-- 
cgit v1.2.3


From 5d7a7651f0ae6f66ca60930daaf194350814e5a6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:53:19 +0100
Subject: datacite: name extra.month, extra.release_month

---
 python/fatcat_tools/importers/datacite.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 90bc3db7..aaf1af2c 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -686,8 +686,10 @@ class DataciteImporter(EntityImporter):
 
         # Always include datacite key, even if value is empty (dict).
         extra['datacite'] = extra_datacite
+
+        # Preparation for a schema update.
         if release_month:
-            extra['month'] = release_month
+            extra['release_month'] = release_month
 
         extids = self.lookup_ext_ids(doi=doi)
 
-- 
cgit v1.2.3


From 791c21af58554203cbfa52a7ebc1d91db261daec Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 03:56:28 +0100
Subject: datacite: adjust tests for release_month

---
 python/tests/files/datacite/datacite_result_00.json | 2 +-
 python/tests/files/datacite/datacite_result_05.json | 2 +-
 python/tests/files/datacite/datacite_result_12.json | 2 +-
 python/tests/files/datacite/datacite_result_13.json | 2 +-
 python/tests/files/datacite/datacite_result_18.json | 2 +-
 python/tests/files/datacite/datacite_result_19.json | 2 +-
 python/tests/files/datacite/datacite_result_20.json | 2 +-
 python/tests/files/datacite/datacite_result_21.json | 2 +-
 python/tests/files/datacite/datacite_result_22.json | 2 +-
 python/tests/files/datacite/datacite_result_23.json | 2 +-
 python/tests/files/datacite/datacite_result_24.json | 2 +-
 python/tests/files/datacite/datacite_result_25.json | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'python')

diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index 28da5397..0a84e7bd 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -20,7 +20,7 @@
       "schemaVersion": "http://datacite.org/schema/kernel-4",
       "metadataVersion": 1
     },
-    "month": 5
+    "release_month": 5
   },
   "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea",
   "release_type": "article-journal",
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 961ad72a..22542a10 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -12,7 +12,7 @@
       "resourceTypeGeneral": "Dataset",
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "month": 10
+    "release_month": 10
   },
   "title": "SH409843.07FU",
   "subtitle": "Gomphales",
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 5e2a6281..6977ecea 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -3,7 +3,7 @@
     "datacite": {
       "resourceTypeGeneral": "Text"
     },
-    "month": 6
+    "release_month": 6
   },
   "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
   "release_type": "article-journal",
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index 3dc7cafb..91126c5a 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -6,7 +6,7 @@
       "resourceTypeGeneral": "Text",
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "month": 10
+    "release_month": 10
   },
   "title": "[Müssen wir des Glücks uns schämen?]",
   "release_type": "article-journal",
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index 43b46923..6e69bad2 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -4,7 +4,7 @@
       "metadataVersion": 2,
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "month": 8
+    "release_month": 8
   },
   "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 8b91efe5..2f2f217e 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -4,7 +4,7 @@
       "metadataVersion": 3,
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     },
-    "month": 8
+    "release_month": 8
   },
   "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index ed1f8885..0f99e2a2 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "<h1>Eastern questionnaire</h1>",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 1230abfa..3dfcf1bf 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "ABC",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index cba01531..bd88c358 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "ABC",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index db622e1c..e82925af 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "ABC",
   "release_type": "article",
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index 8338cf29..2d95d300 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "ABC",
   "subtitle": "DEF",
diff --git a/python/tests/files/datacite/datacite_result_25.json b/python/tests/files/datacite/datacite_result_25.json
index 8a370bbb..aad6d17e 100644
--- a/python/tests/files/datacite/datacite_result_25.json
+++ b/python/tests/files/datacite/datacite_result_25.json
@@ -1,7 +1,7 @@
 {
   "extra": {
     "datacite": {},
-    "month": 8
+    "release_month": 8
   },
   "title": "Additional file 123: ABC",
   "subtitle": "DEF",
-- 
cgit v1.2.3


From b7a325360ca8ae3107411e9e1966d93b999bbb52 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 21:56:29 +0100
Subject: datacite: catch type mismatch in language detection

---
 python/fatcat_tools/importers/datacite.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index aaf1af2c..fc986994 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -584,9 +584,8 @@ class DataciteImporter(EntityImporter):
             lang = None
             try:
                 lang = langdetect.detect(text)
-            except langdetect.lang_detect_exception.LangDetectException as err:
-                print('[{}] language detection failed: {}'.format(doi, err),
-                      file=sys.stderr)
+            except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
+                print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
                     mimetype="text/plain",
-- 
cgit v1.2.3


From 62d6a7e48d6bea1bc7f451c6043f38aee2051f9b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 22:33:58 +0100
Subject: datacite: factor out contributor handling

Use values from:

* attributes.creators[]
* attributes.contributors[]
---
 python/fatcat_tools/importers/datacite.py          | 183 ++++++++++++---------
 python/tests/files/datacite/datacite_doc_26.json   |  57 +++++++
 .../tests/files/datacite/datacite_result_05.json   |   6 +
 .../tests/files/datacite/datacite_result_09.json   |  11 ++
 .../tests/files/datacite/datacite_result_26.json   |  31 ++++
 python/tests/import_datacite.py                    |   4 +-
 6 files changed, 210 insertions(+), 82 deletions(-)
 create mode 100644 python/tests/files/datacite/datacite_doc_26.json
 create mode 100644 python/tests/files/datacite/datacite_result_26.json

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fc986994..9ca72758 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter):
             print('[{}] skipping non-ascii doi for now'.format(doi))
             return None
 
-        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
-        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
-        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
-        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
-        contribs = []
-
-        # Names, that should be ignored right away.
-        name_blacklist = set(('Occdownload Gbif.Org',))
-
-        for i, c in enumerate(attributes['creators']):
-            nameType = c.get('nameType', '') or ''
-            if nameType in ('', 'Personal'):
-                creator_id = None
-                for nid in c.get('nameIdentifiers', []):
-                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
-                    if not name_scheme.lower() == "orcid":
-                        continue
-                    orcid = nid.get('nameIdentifier',
-                                    '').replace('https://orcid.org/', '')
-                    if not orcid:
-                        continue
-                    creator_id = self.lookup_orcid(orcid)
-                    # TODO(martin): If creator_id is None, should we create creators?
-
-                # If there are multiple affiliation strings, use the first one.
-                affiliations = c.get('affiliation', []) or []
-                raw_affiliation = None
-                if len(affiliations) == 0:
-                    raw_affiliation = None
-                else:
-                    raw_affiliation = clean(affiliations[0])
-
-                name = c.get('name')
-                given_name = c.get('givenName')
-                surname = c.get('familyName')
-
-                if name:
-                    name = clean(name)
-
-                if name in name_blacklist:
-                    continue
-
-                if given_name:
-                    given_name = clean(given_name)
-
-                if surname:
-                    surname = clean(surname)
-
-                if not name:
-                    continue
-
-                if raw_affiliation == '':
-                    continue
 
-                if name.lower() in UNKNOWN_MARKERS:
-                    continue
+        creators = attributes.get('creators', []) or []
+        contributors = attributes.get('contributors', []) or []  # Much fewer than creators.
 
-                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
-                if name:
-                    name = index_form_to_display_name(name)
-
-                contribs.append(
-                    fatcat_openapi_client.ReleaseContrib(
-                        creator_id=creator_id,
-                        index=i,
-                        raw_name=name,
-                        given_name=given_name,
-                        surname=surname,
-                        role='author',
-                        raw_affiliation=raw_affiliation,
-                    ))
-            elif nameType == 'Organizational':
-                name = c.get('name', '') or ''
-                if name in UNKNOWN_MARKERS:
-                    continue
-                if len(name) < 3:
-                    continue
-                extra = {'organization': name}
-                contribs.append(fatcat_openapi_client.ReleaseContrib(
-                    index=i, extra=extra))
-            else:
-                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+        contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter):
                     extra=self.editgroup_extra),
                 entity_list=batch))
 
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+        """
+        Parses a list of creators into a list of ReleaseContrib objects. Set
+        set_index to False, if the index contrib field should be left blank.
+
+        The optional doi is only used to prefix log messages.
+        """
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
+        contribs = []
+
+        # Names, that should be ignored right away.
+        name_blacklist = set(('Occdownload Gbif.Org',))
+
+        for i, c in enumerate(creators):
+            if not set_index:
+                i = None
+            nameType = c.get('nameType', '') or ''
+            if nameType in ('', 'Personal'):
+                creator_id = None
+                # Keys may be present but null, hence the "or" fallbacks.
+                for nid in c.get('nameIdentifiers', []) or []:
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if not name_scheme.lower() == "orcid":
+                        continue
+                    orcid = (nid.get('nameIdentifier', '') or '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = clean(affiliations[0]) if affiliations else None
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
+                name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                # A contrib whose cleaned affiliation is an empty string is skipped.
+                if raw_affiliation == '':
+                    continue
+
+                extra = None
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+
+                if contributorType:
+                    extra = {'type': contributorType}
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name in UNKNOWN_MARKERS:
+                    continue
+                if len(name) < 3:
+                    continue
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                # doi comes in as a keyword argument; it is not otherwise in scope here.
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
 
 def lookup_license_slug(raw):
     """
@@ -971,6 +992,8 @@ def index_form_to_display_name(s):
     if s.count(',') > 1:
         # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
         return s
+
+    # Not names, but sprinkled in fields where authors live.
     stopwords = [s.lower() for s in (
         'Archive',
         'Collection',
diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json
new file mode 100644
index 00000000..c2abb1b2
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_26.json
@@ -0,0 +1,57 @@
+{
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "contributors": [
+      {
+        "name": "Wemmer, David",
+        "nameType": "Personal",
+        "givenName": "David",
+        "familyName": "Wemmer",
+        "affiliation": [],
+        "contributorType": "Editor"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Additional file 123: ABC"
+      },
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "publicationYear": 2016,
+    "language": "DE-CH",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 22542a10..c4e5418d 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -523,6 +523,12 @@
       "given_name": "Christian",
       "surname": "Wurzbacher",
       "role": "author"
+    },
+    {
+      "raw_name": "Kessy Abarenkov"
+    },
+    {
+      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
     }
   ],
   "refs": [],
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index fd873309..c93dc769 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -32,6 +32,17 @@
       "given_name": "Nils",
       "surname": "Kirstaedter",
       "role": "author"
+    },
+    {
+      "extra": {
+        "organization": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover"
+      }
+    },
+    {
+      "raw_name": "Technische Informationsbibliothek (TIB)",
+      "extra": {
+        "type": "DataManager"
+      }
     }
   ],
   "refs": [],
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
new file mode 100644
index 00000000..8d26197c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -0,0 +1,31 @@
+{
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
+  "title": "Additional file 123: ABC",
+  "subtitle": "DEF",
+  "release_type": "stub",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    },
+      {
+        "extra": {"type": "Editor"},
+        "raw_name": "David Wemmer",
+        "given_name": "David",
+        "surname": "Wemmer"
+      }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 7293ecac..5ad7ef2c 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -275,7 +275,7 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
         assert len(r.abstracts) == 1
         assert len(r.abstracts[0].content) == 421
-        assert len(r.contribs) == 1
+        assert len(r.contribs) == 2
         assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
         assert r.contribs[0].given_name == None
         assert r.contribs[0].surname == None
@@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(26):
+    for i in range(27):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
         print('testing mapping from {} => {}'.format(src, dst))
-- 
cgit v1.2.3


From d3deb36c26ae86c1763c33a8c356ecd5491caa40 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 22:41:17 +0100
Subject: datacite: reformat test cases and use jq . --sort-keys

---
 python/tests/files/datacite/datacite_doc_00.json   | 158 ++---
 python/tests/files/datacite/datacite_doc_01.json   |  96 +--
 python/tests/files/datacite/datacite_doc_02.json   |  96 +--
 python/tests/files/datacite/datacite_doc_03.json   |  78 +--
 python/tests/files/datacite/datacite_doc_04.json   |  94 +--
 python/tests/files/datacite/datacite_doc_05.json   | 684 ++++++++++-----------
 python/tests/files/datacite/datacite_doc_06.json   |  90 +--
 python/tests/files/datacite/datacite_doc_07.json   | 120 ++--
 python/tests/files/datacite/datacite_doc_08.json   | 112 ++--
 python/tests/files/datacite/datacite_doc_09.json   | 140 ++---
 python/tests/files/datacite/datacite_doc_10.json   |  90 +--
 python/tests/files/datacite/datacite_doc_11.json   |  92 +--
 python/tests/files/datacite/datacite_doc_12.json   | 124 ++--
 python/tests/files/datacite/datacite_doc_13.json   |  98 +--
 python/tests/files/datacite/datacite_doc_14.json   | 188 +++---
 python/tests/files/datacite/datacite_doc_15.json   |  92 +--
 python/tests/files/datacite/datacite_doc_16.json   |  94 +--
 python/tests/files/datacite/datacite_doc_17.json   |  84 +--
 python/tests/files/datacite/datacite_doc_18.json   |  82 +--
 python/tests/files/datacite/datacite_doc_19.json   |  82 +--
 python/tests/files/datacite/datacite_doc_20.json   |  24 +-
 python/tests/files/datacite/datacite_doc_21.json   |  32 +-
 python/tests/files/datacite/datacite_doc_22.json   |  32 +-
 python/tests/files/datacite/datacite_doc_23.json   |  32 +-
 python/tests/files/datacite/datacite_doc_24.json   |  40 +-
 python/tests/files/datacite/datacite_doc_25.json   |  40 +-
 python/tests/files/datacite/datacite_doc_26.json   |  58 +-
 .../tests/files/datacite/datacite_result_00.json   |  86 +--
 .../tests/files/datacite/datacite_result_01.json   |  36 +-
 .../tests/files/datacite/datacite_result_02.json   |  36 +-
 .../tests/files/datacite/datacite_result_03.json   |  26 +-
 .../tests/files/datacite/datacite_result_04.json   |  48 +-
 .../tests/files/datacite/datacite_result_05.json   | 494 +++++++--------
 .../tests/files/datacite/datacite_result_06.json   |  26 +-
 .../tests/files/datacite/datacite_result_07.json   |  92 +--
 .../tests/files/datacite/datacite_result_08.json   |  66 +-
 .../tests/files/datacite/datacite_result_09.json   |  64 +-
 .../tests/files/datacite/datacite_result_10.json   |  40 +-
 .../tests/files/datacite/datacite_result_11.json   |  32 +-
 .../tests/files/datacite/datacite_result_12.json   |  56 +-
 .../tests/files/datacite/datacite_result_13.json   |  44 +-
 .../tests/files/datacite/datacite_result_14.json   | 152 ++---
 .../tests/files/datacite/datacite_result_15.json   |  34 +-
 .../tests/files/datacite/datacite_result_16.json   |  34 +-
 .../tests/files/datacite/datacite_result_17.json   |  30 +-
 .../tests/files/datacite/datacite_result_18.json   |  20 +-
 .../tests/files/datacite/datacite_result_19.json   |  20 +-
 .../tests/files/datacite/datacite_result_20.json   |  18 +-
 .../tests/files/datacite/datacite_result_21.json   |  20 +-
 .../tests/files/datacite/datacite_result_22.json   |  32 +-
 .../tests/files/datacite/datacite_result_23.json   |  32 +-
 .../tests/files/datacite/datacite_result_24.json   |  32 +-
 .../tests/files/datacite/datacite_result_25.json   |  32 +-
 .../tests/files/datacite/datacite_result_26.json   |  46 +-
 54 files changed, 2301 insertions(+), 2299 deletions(-)

(limited to 'python')

diff --git a/python/tests/files/datacite/datacite_doc_00.json b/python/tests/files/datacite/datacite_doc_00.json
index 248f525f..f60b106f 100644
--- a/python/tests/files/datacite/datacite_doc_00.json
+++ b/python/tests/files/datacite/datacite_doc_00.json
@@ -1,53 +1,34 @@
 {
-  "id": "10.1007/s10870-008-9413-z",
-  "type": "dois",
   "attributes": {
-    "doi": "10.1007/s10870-008-9413-z",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.1007/s10870-008-9413-z",
-        "identifierType": "DOI"
-      },
-      {
-        "identifier": "s10870-008-9413-z",
-        "identifierType": "Publisher ID"
-      }
-    ],
+    "container": {
+      "firstPage": "927",
+      "identifier": "1074-1542",
+      "identifierType": "ISSN",
+      "issue": "12",
+      "lastPage": "930",
+      "title": "Journal of Chemical Crystallography",
+      "type": "Journal",
+      "volume": "38"
+    },
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2019-06-18T14:52:19.000Z",
     "creators": [
       {
-        "name": "Li, Qian-Jin",
-        "nameType": "Personal",
-        "givenName": "Qian-Jin",
+        "affiliation": [],
         "familyName": "Li",
-        "affiliation": []
+        "givenName": "Qian-Jin",
+        "name": "Li, Qian-Jin",
+        "nameType": "Personal"
       },
       {
-        "name": "Yang, Chun-Long",
-        "nameType": "Personal",
-        "givenName": "Chun-Long",
+        "affiliation": [],
         "familyName": "Yang",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea"
+        "givenName": "Chun-Long",
+        "name": "Yang, Chun-Long",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "Springer Science and Business Media LLC",
-    "container": {
-      "type": "Journal",
-      "issue": "12",
-      "title": "Journal of Chemical Crystallography",
-      "volume": "38",
-      "lastPage": "930",
-      "firstPage": "927",
-      "identifier": "1074-1542",
-      "identifierType": "ISSN"
-    },
-    "publicationYear": 2008,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "2008-05-30",
@@ -58,77 +39,95 @@
         "dateType": "Updated"
       }
     ],
+    "descriptions": [],
+    "doi": "10.1007/s10870-008-9413-z",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.1007/s10870-008-9413-z",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "s10870-008-9413-z",
+        "identifierType": "Publisher ID"
+      }
+    ],
+    "isActive": true,
     "language": null,
-    "types": {
-      "ris": "JOUR",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceType": "JournalArticle",
-      "resourceTypeGeneral": "Text"
-    },
+    "metadataVersion": 1,
+    "publicationYear": 2008,
+    "published": "2008",
+    "publisher": "Springer Science and Business Media LLC",
+    "reason": null,
+    "registered": null,
     "relatedIdentifiers": [
       {
-        "relationType": "IsPartOf",
         "relatedIdentifier": "1074-1542",
-        "resourceTypeGeneral": "Collection",
-        "relatedIdentifierType": "ISSN"
+        "relatedIdentifierType": "ISSN",
+        "relationType": "IsPartOf",
+        "resourceTypeGeneral": "Collection"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1016/j.bmcl.2005.09.033",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1016/s0022-1139(02)00330-5",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1016/s0010-8545(01)00337-x",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1016/j.tetlet.2005.06.135",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1039/p298700000s1",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       },
       {
-        "relationType": "References",
         "relatedIdentifier": "10.1002/anie.199515551",
-        "relatedIdentifierType": "DOI"
+        "relatedIdentifierType": "DOI",
+        "relationType": "References"
       }
     ],
-    "sizes": [],
-    "formats": [],
-    "version": null,
     "rightsList": [
       {
         "rightsUri": "http://www.springer.com/tdm"
       }
     ],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
-    "url": "http://link.springer.com/10.1007/s10870-008-9413-z",
-    "contentUrl": null,
-    "metadataVersion": 1,
     "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [],
     "source": "levriero",
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2019-06-18T14:52:19.000Z",
-    "registered": null,
-    "published": "2008",
-    "updated": "2019-08-03T00:03:40.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceType": "JournalArticle",
+      "resourceTypeGeneral": "Text",
+      "ris": "JOUR",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-08-03T00:03:40.000Z",
+    "url": "http://link.springer.com/10.1007/s10870-008-9413-z",
+    "version": null
   },
+  "id": "10.1007/s10870-008-9413-z",
   "relationships": {
     "client": {
       "data": {
@@ -136,5 +135,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_01.json b/python/tests/files/datacite/datacite_doc_01.json
index c4ef6e45..16a446b3 100644
--- a/python/tests/files/datacite/datacite_doc_01.json
+++ b/python/tests/files/datacite/datacite_doc_01.json
@@ -1,75 +1,74 @@
 {
-  "id": "10.11588/diglit.25558.39",
-  "type": "dois",
   "attributes": {
-    "doi": "10.11588/diglit.25558.39",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.11588/diglit.25558.39",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2016-12-08T07:43:15.000Z",
     "creators": [
       {
-        "name": "Dargenty, G.",
-        "nameType": "Personal",
-        "givenName": "G.",
+        "affiliation": [],
         "familyName": "Dargenty",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "lang": "de",
-        "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887"
+        "givenName": "G.",
+        "name": "Dargenty, G.",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "University Library Heidelberg",
-    "container": {},
-    "publicationYear": 1887,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1887",
         "dateType": "Issued"
       }
     ],
+    "descriptions": [],
+    "doi": "10.11588/diglit.25558.39",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.11588/diglit.25558.39",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
     "language": "fre",
-    "types": {
-      "ris": "RPRT",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceType": "DigitalisatDigital copy",
-      "resourceTypeGeneral": "Text"
-    },
+    "metadataVersion": 4,
+    "publicationYear": 1887,
+    "published": "1887",
+    "publisher": "University Library Heidelberg",
+    "reason": null,
+    "registered": "2016-12-08T07:43:15.000Z",
     "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
     "rightsList": [
       {
         "lang": "de",
         "rights": "Standard (Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen) - http://www.ub.uni-heidelberg.de/helios/digi/nutzung/Welcome.html"
       }
     ],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
-    "url": "http://digi.ub.uni-heidelberg.de/diglit/art1887_1/0172",
-    "contentUrl": null,
-    "metadataVersion": 4,
     "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [],
     "source": null,
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2016-12-08T07:43:15.000Z",
-    "registered": "2016-12-08T07:43:15.000Z",
-    "published": "1887",
-    "updated": "2019-08-02T14:27:33.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "lang": "de",
+        "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-08-02T14:27:33.000Z",
+    "url": "http://digi.ub.uni-heidelberg.de/diglit/art1887_1/0172",
+    "version": null
   },
+  "id": "10.11588/diglit.25558.39",
   "relationships": {
     "client": {
       "data": {
@@ -77,5 +76,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_02.json b/python/tests/files/datacite/datacite_doc_02.json
index 8b9a594e..139e2cb0 100644
--- a/python/tests/files/datacite/datacite_doc_02.json
+++ b/python/tests/files/datacite/datacite_doc_02.json
@@ -1,53 +1,44 @@
 {
-  "id": "10.11588/diglit.37715.57",
-  "type": "dois",
   "attributes": {
-    "doi": "10.11588/diglit.37715.57",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.11588/diglit.37715.57",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2018-11-29T12:04:12.000Z",
     "creators": [
       {
-        "name": "Weyersberg, Albert",
-        "nameType": "Personal",
-        "givenName": "Albert",
+        "affiliation": [],
         "familyName": "Weyersberg",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "lang": "de",
-        "title": "Solinger Schwertschmiede-Familien, [4]"
+        "givenName": "Albert",
+        "name": "Weyersberg, Albert",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "University Library Heidelberg",
-    "container": {},
-    "publicationYear": 1897,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1897",
         "dateType": "Issued"
       }
     ],
+    "descriptions": [],
+    "doi": "10.11588/diglit.37715.57",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.11588/diglit.37715.57",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
     "language": "ger",
-    "types": {
-      "ris": "RPRT",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceType": "DigitalisatDigital copy",
-      "resourceTypeGeneral": "Text"
-    },
+    "metadataVersion": 2,
+    "publicationYear": 1897,
+    "published": "1897",
+    "publisher": "University Library Heidelberg",
+    "reason": null,
+    "registered": "2018-11-29T12:04:13.000Z",
     "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
     "rightsList": [
       {
         "lang": "de",
@@ -58,22 +49,30 @@
         "rights": "Creative Commons - Namensnennung - Weitergabe unter gleichen Bedingungen - https://creativecommons.org/licenses/by-sa/3.0/"
       }
     ],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
-    "url": "https://digi.ub.uni-heidelberg.de/diglit/zhwk1897_1899/0131",
-    "contentUrl": null,
-    "metadataVersion": 2,
     "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [],
     "source": "mds",
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2018-11-29T12:04:12.000Z",
-    "registered": "2018-11-29T12:04:13.000Z",
-    "published": "1897",
-    "updated": "2019-08-02T21:31:04.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "lang": "de",
+        "title": "Solinger Schwertschmiede-Familien, [4]"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceType": "DigitalisatDigital copy",
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-08-02T21:31:04.000Z",
+    "url": "https://digi.ub.uni-heidelberg.de/diglit/zhwk1897_1899/0131",
+    "version": null
   },
+  "id": "10.11588/diglit.37715.57",
   "relationships": {
     "client": {
       "data": {
@@ -81,5 +80,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_03.json b/python/tests/files/datacite/datacite_doc_03.json
index e77a359c..80bacabc 100644
--- a/python/tests/files/datacite/datacite_doc_03.json
+++ b/python/tests/files/datacite/datacite_doc_03.json
@@ -1,64 +1,63 @@
 {
-  "id": "10.13140/rg.2.2.30434.53446",
-  "type": "dois",
   "attributes": {
-    "doi": "10.13140/rg.2.2.30434.53446",
-    "identifiers": [
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2016-11-03T09:07:08.000Z",
+    "creators": [
       {
-        "identifier": "https://doi.org/10.13140/rg.2.2.30434.53446",
-        "identifierType": "DOI"
+        "affiliation": [],
+        "name": "Mastura Yahya"
       }
     ],
-    "creators": [
+    "dates": [
       {
-        "name": "Mastura Yahya",
-        "affiliation": []
+        "date": "2016",
+        "dateType": "Issued"
       }
     ],
-    "titles": [
+    "descriptions": [],
+    "doi": "10.13140/rg.2.2.30434.53446",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
       {
-        "title": "midterm ah30903"
+        "identifier": "https://doi.org/10.13140/rg.2.2.30434.53446",
+        "identifierType": "DOI"
       }
     ],
-    "publisher": "Unpublished",
-    "container": {},
+    "isActive": true,
+    "language": "ms",
+    "metadataVersion": 0,
     "publicationYear": 2016,
+    "published": "2016",
+    "publisher": "Unpublished",
+    "reason": null,
+    "registered": "2016-11-03T09:07:09.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
+    "source": null,
+    "state": "findable",
     "subjects": [],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2016",
-        "dateType": "Issued"
+        "title": "midterm ah30903"
       }
     ],
-    "language": "ms",
     "types": {
-      "ris": "GEN",
       "bibtex": "misc",
       "citeproc": "article",
+      "ris": "GEN",
       "schemaOrg": "CreativeWork"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-02T12:51:15.000Z",
     "url": "http://rgdoi.net/10.13140/RG.2.2.30434.53446",
-    "contentUrl": null,
-    "metadataVersion": 0,
-    "schemaVersion": "http://datacite.org/schema/kernel-3",
-    "source": null,
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2016-11-03T09:07:08.000Z",
-    "registered": "2016-11-03T09:07:09.000Z",
-    "published": "2016",
-    "updated": "2019-08-02T12:51:15.000Z"
+    "version": null
   },
+  "id": "10.13140/rg.2.2.30434.53446",
   "relationships": {
     "client": {
       "data": {
@@ -66,5 +65,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_04.json b/python/tests/files/datacite/datacite_doc_04.json
index 8655a26a..f7d06a75 100644
--- a/python/tests/files/datacite/datacite_doc_04.json
+++ b/python/tests/files/datacite/datacite_doc_04.json
@@ -1,74 +1,73 @@
 {
-  "id": "10.14288/1.0080520",
-  "type": "dois",
   "attributes": {
-    "doi": "10.14288/1.0080520",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.14288/1.0080520",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2015-11-11T11:12:34.000Z",
     "creators": [
       {
-        "name": "Nicollerat, Marc Andre",
-        "nameType": "Personal",
-        "givenName": "Marc Andre",
+        "affiliation": [],
         "familyName": "Nicollerat",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "On chain maps inducing isomorphisms in homology"
+        "givenName": "Marc Andre",
+        "name": "Nicollerat, Marc Andre",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "University of British Columbia",
-    "container": {},
-    "publicationYear": 1973,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1973",
         "dateType": "Issued"
       }
     ],
-    "language": "en",
-    "types": {
-      "ris": "RPRT",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceType": "Text",
-      "resourceTypeGeneral": "Text"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
     "descriptions": [
       {
         "description": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) → K(I) and a natural transformation [formula omitted]  (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
         "descriptionType": "Abstract"
       }
     ],
-    "geoLocations": [],
+    "doi": "10.14288/1.0080520",
+    "formats": [],
     "fundingReferences": [],
-    "url": "https://doi.library.ubc.ca/10.14288/1.0080520",
-    "contentUrl": null,
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.14288/1.0080520",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "en",
     "metadataVersion": 5,
+    "publicationYear": 1973,
+    "published": "1973",
+    "publisher": "University of British Columbia",
+    "reason": null,
+    "registered": "2015-11-11T11:12:35.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
     "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
     "source": null,
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2015-11-11T11:12:34.000Z",
-    "registered": "2015-11-11T11:12:35.000Z",
-    "published": "1973",
-    "updated": "2019-08-02T09:43:14.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "On chain maps inducing isomorphisms in homology"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-08-02T09:43:14.000Z",
+    "url": "https://doi.library.ubc.ca/10.14288/1.0080520",
+    "version": null
   },
+  "id": "10.14288/1.0080520",
   "relationships": {
     "client": {
       "data": {
@@ -76,5 +75,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_05.json b/python/tests/files/datacite/datacite_doc_05.json
index 75e68e9d..76fb73a8 100644
--- a/python/tests/files/datacite/datacite_doc_05.json
+++ b/python/tests/files/datacite/datacite_doc_05.json
@@ -1,534 +1,515 @@
 {
-  "id": "10.15156/bio/sh409843.07fu",
-  "type": "dois",
   "attributes": {
-    "doi": "10.15156/bio/sh409843.07fu",
-    "identifiers": [
+    "container": {},
+    "contentUrl": null,
+    "contributors": [
       {
-        "identifier": "https://doi.org/10.15156/bio/sh409843.07fu",
-        "identifierType": "DOI"
+        "affiliation": [],
+        "name": "Kessy Abarenkov"
+      },
+      {
+        "affiliation": [],
+        "name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
       }
     ],
+    "created": "2015-06-05T10:23:18.000Z",
     "creators": [
       {
-        "name": "Kõljalg, Urmas",
-        "nameType": "Personal",
-        "givenName": "Urmas",
+        "affiliation": [],
         "familyName": "Kõljalg",
-        "affiliation": []
+        "givenName": "Urmas",
+        "name": "Kõljalg, Urmas",
+        "nameType": "Personal"
       },
       {
-        "name": "Abarenkov, Kessy",
-        "nameType": "Personal",
-        "givenName": "Kessy",
+        "affiliation": [],
         "familyName": "Abarenkov",
-        "affiliation": []
+        "givenName": "Kessy",
+        "name": "Abarenkov, Kessy",
+        "nameType": "Personal"
       },
       {
-        "name": "Nilsson, R. Henrik",
-        "nameType": "Personal",
-        "givenName": "R. Henrik",
+        "affiliation": [],
         "familyName": "Nilsson",
-        "affiliation": []
+        "givenName": "R. Henrik",
+        "name": "Nilsson, R. Henrik",
+        "nameType": "Personal"
       },
       {
-        "name": "Larsson, Karl-Henrik",
-        "nameType": "Personal",
-        "givenName": "Karl-Henrik",
+        "affiliation": [],
         "familyName": "Larsson",
-        "affiliation": []
+        "givenName": "Karl-Henrik",
+        "name": "Larsson, Karl-Henrik",
+        "nameType": "Personal"
       },
       {
-        "name": "Aas, Anders Bjørnsgard",
-        "nameType": "Personal",
-        "givenName": "Anders Bjørnsgard",
+        "affiliation": [],
         "familyName": "Aas",
-        "affiliation": []
+        "givenName": "Anders Bjørnsgard",
+        "name": "Aas, Anders Bjørnsgard",
+        "nameType": "Personal"
       },
       {
-        "name": "Adams, Rachel",
-        "nameType": "Personal",
-        "givenName": "Rachel",
+        "affiliation": [],
         "familyName": "Adams",
-        "affiliation": []
+        "givenName": "Rachel",
+        "name": "Adams, Rachel",
+        "nameType": "Personal"
       },
       {
-        "name": "Alves, Artur",
-        "nameType": "Personal",
-        "givenName": "Artur",
+        "affiliation": [],
         "familyName": "Alves",
-        "affiliation": []
+        "givenName": "Artur",
+        "name": "Alves, Artur",
+        "nameType": "Personal"
       },
       {
-        "name": "Ammirati, Joseph F.",
-        "nameType": "Personal",
-        "givenName": "Joseph F.",
+        "affiliation": [],
         "familyName": "Ammirati",
-        "affiliation": []
+        "givenName": "Joseph F.",
+        "name": "Ammirati, Joseph F.",
+        "nameType": "Personal"
       },
       {
-        "name": "Arnold, A. Elizabeth",
-        "nameType": "Personal",
-        "givenName": "A. Elizabeth",
+        "affiliation": [],
         "familyName": "Arnold",
-        "affiliation": []
+        "givenName": "A. Elizabeth",
+        "name": "Arnold, A. Elizabeth",
+        "nameType": "Personal"
       },
       {
-        "name": "Bahram, Mohammad",
-        "nameType": "Personal",
-        "givenName": "Mohammad",
+        "affiliation": [],
         "familyName": "Bahram",
-        "affiliation": []
+        "givenName": "Mohammad",
+        "name": "Bahram, Mohammad",
+        "nameType": "Personal"
       },
       {
-        "name": "Bengtsson-Palme, Johan",
-        "nameType": "Personal",
-        "givenName": "Johan",
+        "affiliation": [],
         "familyName": "Bengtsson-Palme",
-        "affiliation": []
+        "givenName": "Johan",
+        "name": "Bengtsson-Palme, Johan",
+        "nameType": "Personal"
       },
       {
-        "name": "Berlin, Anna",
-        "nameType": "Personal",
-        "givenName": "Anna",
+        "affiliation": [],
         "familyName": "Berlin",
-        "affiliation": []
+        "givenName": "Anna",
+        "name": "Berlin, Anna",
+        "nameType": "Personal"
       },
       {
-        "name": "Botnen, Synnøve",
-        "nameType": "Personal",
-        "givenName": "Synnøve",
+        "affiliation": [],
         "familyName": "Botnen",
-        "affiliation": []
+        "givenName": "Synnøve",
+        "name": "Botnen, Synnøve",
+        "nameType": "Personal"
       },
       {
-        "name": "Bourlat, Sarah",
-        "nameType": "Personal",
-        "givenName": "Sarah",
+        "affiliation": [],
         "familyName": "Bourlat",
-        "affiliation": []
+        "givenName": "Sarah",
+        "name": "Bourlat, Sarah",
+        "nameType": "Personal"
       },
       {
-        "name": "Cheeke, Tanya",
-        "nameType": "Personal",
-        "givenName": "Tanya",
+        "affiliation": [],
         "familyName": "Cheeke",
-        "affiliation": []
+        "givenName": "Tanya",
+        "name": "Cheeke, Tanya",
+        "nameType": "Personal"
       },
       {
-        "name": "Dima, Bálint",
-        "nameType": "Personal",
-        "givenName": "Bálint",
+        "affiliation": [],
         "familyName": "Dima",
-        "affiliation": []
+        "givenName": "Bálint",
+        "name": "Dima, Bálint",
+        "nameType": "Personal"
       },
       {
-        "name": "Drenkhan, Rein",
-        "nameType": "Personal",
-        "givenName": "Rein",
+        "affiliation": [],
         "familyName": "Drenkhan",
-        "affiliation": []
+        "givenName": "Rein",
+        "name": "Drenkhan, Rein",
+        "nameType": "Personal"
       },
       {
-        "name": "Duarte, Camila",
-        "nameType": "Personal",
-        "givenName": "Camila",
+        "affiliation": [],
         "familyName": "Duarte",
-        "affiliation": []
+        "givenName": "Camila",
+        "name": "Duarte, Camila",
+        "nameType": "Personal"
       },
       {
-        "name": "Dueñas, Margarita",
-        "nameType": "Personal",
-        "givenName": "Margarita",
+        "affiliation": [],
         "familyName": "Dueñas",
-        "affiliation": []
+        "givenName": "Margarita",
+        "name": "Dueñas, Margarita",
+        "nameType": "Personal"
       },
       {
-        "name": "Eberhardt, Ursula",
-        "nameType": "Personal",
-        "givenName": "Ursula",
+        "affiliation": [],
         "familyName": "Eberhardt",
-        "affiliation": []
+        "givenName": "Ursula",
+        "name": "Eberhardt, Ursula",
+        "nameType": "Personal"
       },
       {
-        "name": "Friberg, Hanna",
-        "nameType": "Personal",
-        "givenName": "Hanna",
+        "affiliation": [],
         "familyName": "Friberg",
-        "affiliation": []
+        "givenName": "Hanna",
+        "name": "Friberg, Hanna",
+        "nameType": "Personal"
       },
       {
-        "name": "Frøslev, Tobias G.",
-        "nameType": "Personal",
-        "givenName": "Tobias G.",
+        "affiliation": [],
         "familyName": "Frøslev",
-        "affiliation": []
+        "givenName": "Tobias G.",
+        "name": "Frøslev, Tobias G.",
+        "nameType": "Personal"
       },
       {
-        "name": "Garnica, Sigisfredo",
-        "nameType": "Personal",
-        "givenName": "Sigisfredo",
+        "affiliation": [],
         "familyName": "Garnica",
-        "affiliation": []
+        "givenName": "Sigisfredo",
+        "name": "Garnica, Sigisfredo",
+        "nameType": "Personal"
       },
       {
-        "name": "Geml, József",
-        "nameType": "Personal",
-        "givenName": "József",
+        "affiliation": [],
         "familyName": "Geml",
-        "affiliation": []
+        "givenName": "József",
+        "name": "Geml, József",
+        "nameType": "Personal"
       },
       {
-        "name": "Ghobad-Nejhad, Masoomeh",
-        "nameType": "Personal",
-        "givenName": "Masoomeh",
+        "affiliation": [],
         "familyName": "Ghobad-Nejhad",
-        "affiliation": []
+        "givenName": "Masoomeh",
+        "name": "Ghobad-Nejhad, Masoomeh",
+        "nameType": "Personal"
       },
       {
-        "name": "Grebenc, Tine",
-        "nameType": "Personal",
-        "givenName": "Tine",
+        "affiliation": [],
         "familyName": "Grebenc",
-        "affiliation": []
+        "givenName": "Tine",
+        "name": "Grebenc, Tine",
+        "nameType": "Personal"
       },
       {
-        "name": "Griffith, Gareth W.",
-        "nameType": "Personal",
-        "givenName": "Gareth W.",
+        "affiliation": [],
         "familyName": "Griffith",
-        "affiliation": []
+        "givenName": "Gareth W.",
+        "name": "Griffith, Gareth W.",
+        "nameType": "Personal"
       },
       {
-        "name": "Hampe, Felix",
-        "nameType": "Personal",
-        "givenName": "Felix",
+        "affiliation": [],
         "familyName": "Hampe",
-        "affiliation": []
+        "givenName": "Felix",
+        "name": "Hampe, Felix",
+        "nameType": "Personal"
       },
       {
-        "name": "Kennedy, Peter",
-        "nameType": "Personal",
-        "givenName": "Peter",
+        "affiliation": [],
         "familyName": "Kennedy",
-        "affiliation": []
+        "givenName": "Peter",
+        "name": "Kennedy, Peter",
+        "nameType": "Personal"
       },
       {
-        "name": "Khomich, Maryia",
-        "nameType": "Personal",
-        "givenName": "Maryia",
+        "affiliation": [],
         "familyName": "Khomich",
-        "affiliation": []
+        "givenName": "Maryia",
+        "name": "Khomich, Maryia",
+        "nameType": "Personal"
       },
       {
-        "name": "Kohout, Petr",
-        "nameType": "Personal",
-        "givenName": "Petr",
+        "affiliation": [],
         "familyName": "Kohout",
-        "affiliation": []
+        "givenName": "Petr",
+        "name": "Kohout, Petr",
+        "nameType": "Personal"
       },
       {
-        "name": "Kollom, Anu",
-        "nameType": "Personal",
-        "givenName": "Anu",
+        "affiliation": [],
         "familyName": "Kollom",
-        "affiliation": []
+        "givenName": "Anu",
+        "name": "Kollom, Anu",
+        "nameType": "Personal"
       },
       {
-        "name": "Larsson, Ellen",
-        "nameType": "Personal",
-        "givenName": "Ellen",
+        "affiliation": [],
         "familyName": "Larsson",
-        "affiliation": []
+        "givenName": "Ellen",
+        "name": "Larsson, Ellen",
+        "nameType": "Personal"
       },
       {
-        "name": "Laszlo, Irinyi",
-        "nameType": "Personal",
-        "givenName": "Irinyi",
+        "affiliation": [],
         "familyName": "Laszlo",
-        "affiliation": []
+        "givenName": "Irinyi",
+        "name": "Laszlo, Irinyi",
+        "nameType": "Personal"
       },
       {
-        "name": "Leavitt, Steven",
-        "nameType": "Personal",
-        "givenName": "Steven",
+        "affiliation": [],
         "familyName": "Leavitt",
-        "affiliation": []
+        "givenName": "Steven",
+        "name": "Leavitt, Steven",
+        "nameType": "Personal"
       },
       {
-        "name": "Liimatainen, Kare",
-        "nameType": "Personal",
-        "givenName": "Kare",
+        "affiliation": [],
         "familyName": "Liimatainen",
-        "affiliation": []
+        "givenName": "Kare",
+        "name": "Liimatainen, Kare",
+        "nameType": "Personal"
       },
       {
-        "name": "Lindahl, Björn",
-        "nameType": "Personal",
-        "givenName": "Björn",
+        "affiliation": [],
         "familyName": "Lindahl",
-        "affiliation": []
+        "givenName": "Björn",
+        "name": "Lindahl, Björn",
+        "nameType": "Personal"
       },
       {
-        "name": "Lodge, Deborah J.",
-        "nameType": "Personal",
-        "givenName": "Deborah J.",
+        "affiliation": [],
         "familyName": "Lodge",
-        "affiliation": []
+        "givenName": "Deborah J.",
+        "name": "Lodge, Deborah J.",
+        "nameType": "Personal"
       },
       {
-        "name": "Lumbsch, Helge Thorsten",
-        "nameType": "Personal",
-        "givenName": "Helge Thorsten",
+        "affiliation": [],
         "familyName": "Lumbsch",
-        "affiliation": []
+        "givenName": "Helge Thorsten",
+        "name": "Lumbsch, Helge Thorsten",
+        "nameType": "Personal"
       },
       {
-        "name": "Martín Esteban, María Paz",
-        "nameType": "Personal",
-        "givenName": "María Paz",
+        "affiliation": [],
         "familyName": "Martín Esteban",
-        "affiliation": []
+        "givenName": "María Paz",
+        "name": "Martín Esteban, María Paz",
+        "nameType": "Personal"
       },
       {
-        "name": "Meyer, Wieland",
-        "nameType": "Personal",
-        "givenName": "Wieland",
+        "affiliation": [],
         "familyName": "Meyer",
-        "affiliation": []
+        "givenName": "Wieland",
+        "name": "Meyer, Wieland",
+        "nameType": "Personal"
       },
       {
-        "name": "Miettinen, Otto",
-        "nameType": "Personal",
-        "givenName": "Otto",
+        "affiliation": [],
         "familyName": "Miettinen",
-        "affiliation": []
+        "givenName": "Otto",
+        "name": "Miettinen, Otto",
+        "nameType": "Personal"
       },
       {
-        "name": "Nguyen, Nhu",
-        "nameType": "Personal",
-        "givenName": "Nhu",
+        "affiliation": [],
         "familyName": "Nguyen",
-        "affiliation": []
+        "givenName": "Nhu",
+        "name": "Nguyen, Nhu",
+        "nameType": "Personal"
       },
       {
-        "name": "Niskanen, Tuula",
-        "nameType": "Personal",
-        "givenName": "Tuula",
+        "affiliation": [],
         "familyName": "Niskanen",
-        "affiliation": []
+        "givenName": "Tuula",
+        "name": "Niskanen, Tuula",
+        "nameType": "Personal"
       },
       {
-        "name": "Oono, Ryoko",
-        "nameType": "Personal",
-        "givenName": "Ryoko",
+        "affiliation": [],
         "familyName": "Oono",
-        "affiliation": []
+        "givenName": "Ryoko",
+        "name": "Oono, Ryoko",
+        "nameType": "Personal"
       },
       {
-        "name": "Öpik, Maarja",
-        "nameType": "Personal",
-        "givenName": "Maarja",
+        "affiliation": [],
         "familyName": "Öpik",
-        "affiliation": []
+        "givenName": "Maarja",
+        "name": "Öpik, Maarja",
+        "nameType": "Personal"
       },
       {
-        "name": "Ordynets, Alexander",
-        "nameType": "Personal",
-        "givenName": "Alexander",
+        "affiliation": [],
         "familyName": "Ordynets",
-        "affiliation": []
+        "givenName": "Alexander",
+        "name": "Ordynets, Alexander",
+        "nameType": "Personal"
       },
       {
-        "name": "Pawłowska, Julia",
-        "nameType": "Personal",
-        "givenName": "Julia",
+        "affiliation": [],
         "familyName": "Pawłowska",
-        "affiliation": []
+        "givenName": "Julia",
+        "name": "Pawłowska, Julia",
+        "nameType": "Personal"
       },
       {
-        "name": "Peintner, Ursula",
-        "nameType": "Personal",
-        "givenName": "Ursula",
+        "affiliation": [],
         "familyName": "Peintner",
-        "affiliation": []
+        "givenName": "Ursula",
+        "name": "Peintner, Ursula",
+        "nameType": "Personal"
       },
       {
-        "name": "Pereira, Olinto Liparini",
-        "nameType": "Personal",
-        "givenName": "Olinto Liparini",
+        "affiliation": [],
         "familyName": "Pereira",
-        "affiliation": []
+        "givenName": "Olinto Liparini",
+        "name": "Pereira, Olinto Liparini",
+        "nameType": "Personal"
       },
       {
-        "name": "Pinho, Danilo Batista",
-        "nameType": "Personal",
-        "givenName": "Danilo Batista",
+        "affiliation": [],
         "familyName": "Pinho",
-        "affiliation": []
+        "givenName": "Danilo Batista",
+        "name": "Pinho, Danilo Batista",
+        "nameType": "Personal"
       },
       {
-        "name": "Põldmaa, Kadri",
-        "nameType": "Personal",
-        "givenName": "Kadri",
+        "affiliation": [],
         "familyName": "Põldmaa",
-        "affiliation": []
+        "givenName": "Kadri",
+        "name": "Põldmaa, Kadri",
+        "nameType": "Personal"
       },
       {
-        "name": "Runnel, Kadri",
-        "nameType": "Personal",
-        "givenName": "Kadri",
+        "affiliation": [],
         "familyName": "Runnel",
-        "affiliation": []
+        "givenName": "Kadri",
+        "name": "Runnel, Kadri",
+        "nameType": "Personal"
       },
       {
-        "name": "Ryberg, Martin",
-        "nameType": "Personal",
-        "givenName": "Martin",
+        "affiliation": [],
         "familyName": "Ryberg",
-        "affiliation": []
+        "givenName": "Martin",
+        "name": "Ryberg, Martin",
+        "nameType": "Personal"
       },
       {
-        "name": "Saar, Irja",
-        "nameType": "Personal",
-        "givenName": "Irja",
+        "affiliation": [],
         "familyName": "Saar",
-        "affiliation": []
+        "givenName": "Irja",
+        "name": "Saar, Irja",
+        "nameType": "Personal"
       },
       {
-        "name": "Sanli, Kemal",
-        "nameType": "Personal",
-        "givenName": "Kemal",
+        "affiliation": [],
         "familyName": "Sanli",
-        "affiliation": []
+        "givenName": "Kemal",
+        "name": "Sanli, Kemal",
+        "nameType": "Personal"
       },
       {
-        "name": "Scott, James",
-        "nameType": "Personal",
-        "givenName": "James",
+        "affiliation": [],
         "familyName": "Scott",
-        "affiliation": []
+        "givenName": "James",
+        "name": "Scott, James",
+        "nameType": "Personal"
       },
       {
-        "name": "Spirin, Viacheslav",
-        "nameType": "Personal",
-        "givenName": "Viacheslav",
+        "affiliation": [],
         "familyName": "Spirin",
-        "affiliation": []
+        "givenName": "Viacheslav",
+        "name": "Spirin, Viacheslav",
+        "nameType": "Personal"
       },
       {
-        "name": "Suija, Ave",
-        "nameType": "Personal",
-        "givenName": "Ave",
+        "affiliation": [],
         "familyName": "Suija",
-        "affiliation": []
+        "givenName": "Ave",
+        "name": "Suija, Ave",
+        "nameType": "Personal"
       },
       {
-        "name": "Svantesson, Sten",
-        "nameType": "Personal",
-        "givenName": "Sten",
+        "affiliation": [],
         "familyName": "Svantesson",
-        "affiliation": []
+        "givenName": "Sten",
+        "name": "Svantesson, Sten",
+        "nameType": "Personal"
       },
       {
-        "name": "Tadych, Mariusz",
-        "nameType": "Personal",
-        "givenName": "Mariusz",
+        "affiliation": [],
         "familyName": "Tadych",
-        "affiliation": []
+        "givenName": "Mariusz",
+        "name": "Tadych, Mariusz",
+        "nameType": "Personal"
       },
       {
-        "name": "Takamatsu, Susumu",
-        "nameType": "Personal",
-        "givenName": "Susumu",
+        "affiliation": [],
         "familyName": "Takamatsu",
-        "affiliation": []
+        "givenName": "Susumu",
+        "name": "Takamatsu, Susumu",
+        "nameType": "Personal"
       },
       {
-        "name": "Tamm, Heidi",
-        "nameType": "Personal",
-        "givenName": "Heidi",
+        "affiliation": [],
         "familyName": "Tamm",
-        "affiliation": []
+        "givenName": "Heidi",
+        "name": "Tamm, Heidi",
+        "nameType": "Personal"
       },
       {
-        "name": "Taylor, AFS.",
-        "nameType": "Personal",
-        "givenName": "AFS.",
+        "affiliation": [],
         "familyName": "Taylor",
-        "affiliation": []
+        "givenName": "AFS.",
+        "name": "Taylor, AFS.",
+        "nameType": "Personal"
       },
       {
-        "name": "Tedersoo, Leho",
-        "nameType": "Personal",
-        "givenName": "Leho",
+        "affiliation": [],
         "familyName": "Tedersoo",
-        "affiliation": []
+        "givenName": "Leho",
+        "name": "Tedersoo, Leho",
+        "nameType": "Personal"
       },
       {
-        "name": "Telleria, M.T.",
-        "nameType": "Personal",
-        "givenName": "M.T.",
+        "affiliation": [],
         "familyName": "Telleria",
-        "affiliation": []
+        "givenName": "M.T.",
+        "name": "Telleria, M.T.",
+        "nameType": "Personal"
       },
       {
-        "name": "Udayanga, Dhanushka",
-        "nameType": "Personal",
-        "givenName": "Dhanushka",
+        "affiliation": [],
         "familyName": "Udayanga",
-        "affiliation": []
+        "givenName": "Dhanushka",
+        "name": "Udayanga, Dhanushka",
+        "nameType": "Personal"
       },
       {
-        "name": "Unterseher, Martin",
-        "nameType": "Personal",
-        "givenName": "Martin",
+        "affiliation": [],
         "familyName": "Unterseher",
-        "affiliation": []
+        "givenName": "Martin",
+        "name": "Unterseher, Martin",
+        "nameType": "Personal"
       },
       {
-        "name": "Volobuev, Sergey",
-        "nameType": "Personal",
-        "givenName": "Sergey",
+        "affiliation": [],
         "familyName": "Volobuev",
-        "affiliation": []
+        "givenName": "Sergey",
+        "name": "Volobuev, Sergey",
+        "nameType": "Personal"
       },
       {
-        "name": "Weiss, Michael",
-        "nameType": "Personal",
-        "givenName": "Michael",
+        "affiliation": [],
         "familyName": "Weiss",
-        "affiliation": []
+        "givenName": "Michael",
+        "name": "Weiss, Michael",
+        "nameType": "Personal"
       },
       {
-        "name": "Wurzbacher, Christian",
-        "nameType": "Personal",
-        "givenName": "Christian",
+        "affiliation": [],
         "familyName": "Wurzbacher",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "SH409843.07FU"
-      },
-      {
-        "title": "Gomphales",
-        "titleType": "Subtitle"
-      }
-    ],
-    "publisher": "UNITE Community",
-    "container": {},
-    "publicationYear": 2015,
-    "subjects": [],
-    "contributors": [
-      {
-        "name": "Kessy Abarenkov",
-        "affiliation": []
-      },
-      {
-        "name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden",
-        "affiliation": []
+        "givenName": "Christian",
+        "name": "Wurzbacher, Christian",
+        "nameType": "Personal"
       }
     ],
     "dates": [
@@ -545,48 +526,66 @@
         "dateType": "Issued"
       }
     ],
-    "language": "eng",
-    "types": {
-      "ris": "DATA",
-      "bibtex": "misc",
-      "citeproc": "dataset",
-      "schemaOrg": "Dataset",
-      "resourceType": "Dataset/UNITE Species Hypothesis",
-      "resourceTypeGeneral": "Dataset"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
+    "descriptions": [
+      {
+        "description": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "doi": "10.15156/bio/sh409843.07fu",
     "formats": [
       "application/json"
     ],
-    "version": null,
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.15156/bio/sh409843.07fu",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "eng",
+    "metadataVersion": 1,
+    "publicationYear": 2015,
+    "published": "2015",
+    "publisher": "UNITE Community",
+    "reason": null,
+    "registered": "2015-06-05T10:23:19.000Z",
+    "relatedIdentifiers": [],
     "rightsList": [
       {
         "rights": "Attribution-NonCommercial (CC BY-NC)",
         "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
       }
     ],
-    "descriptions": [
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
+    "source": null,
+    "state": "findable",
+    "subjects": [],
+    "titles": [
       {
-        "description": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
-        "descriptionType": "Abstract"
+        "title": "SH409843.07FU"
+      },
+      {
+        "title": "Gomphales",
+        "titleType": "Subtitle"
       }
     ],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "resourceType": "Dataset/UNITE Species Hypothesis",
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
+    },
+    "updated": "2019-08-02T07:45:28.000Z",
     "url": "https://plutof.ut.ee/#/datacite/10.15156/BIO/SH409843.07FU",
-    "contentUrl": null,
-    "metadataVersion": 1,
-    "schemaVersion": "http://datacite.org/schema/kernel-3",
-    "source": null,
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2015-06-05T10:23:18.000Z",
-    "registered": "2015-06-05T10:23:19.000Z",
-    "published": "2015",
-    "updated": "2019-08-02T07:45:28.000Z"
+    "version": null
   },
+  "id": "10.15156/bio/sh409843.07fu",
   "relationships": {
     "client": {
       "data": {
@@ -594,5 +593,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_06.json b/python/tests/files/datacite/datacite_doc_06.json
index a7f3ee70..01cb2cb3 100644
--- a/python/tests/files/datacite/datacite_doc_06.json
+++ b/python/tests/files/datacite/datacite_doc_06.json
@@ -1,31 +1,16 @@
 {
-  "id": "10.16903/ethz-grs-d_006220",
-  "type": "dois",
   "attributes": {
-    "doi": "10.16903/ethz-grs-d_006220",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.16903/ethz-grs-d_006220",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2017-12-13T12:03:09.000Z",
     "creators": [
       {
+        "affiliation": [],
         "name": "Crispijn De Passe (Der Ältere) (1564-1637)",
-        "nameType": "Personal",
-        "affiliation": []
+        "nameType": "Personal"
       }
     ],
-    "titles": [
-      {
-        "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\""
-      }
-    ],
-    "publisher": "n.a.",
-    "container": {},
-    "publicationYear": 1590,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1590",
@@ -36,42 +21,56 @@
         "dateType": "Issued"
       }
     ],
-    "language": null,
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork",
-      "resourceTypeGeneral": "InteractiveResource"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
+    "descriptions": [],
+    "doi": "10.16903/ethz-grs-d_006220",
     "formats": [
       "Blattgrösse: 21.0 x 14.4 x 0.0 cm (beschnitten)",
       "Kupferstich"
     ],
-    "version": null,
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.16903/ethz-grs-d_006220",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": null,
+    "metadataVersion": 1,
+    "publicationYear": 1590,
+    "published": "1590",
+    "publisher": "n.a.",
+    "reason": null,
+    "registered": "2017-12-13T12:03:09.000Z",
+    "relatedIdentifiers": [],
     "rightsList": [
       {
         "rights": "ETH-Bibliothek Zürich, Graphische Sammlung / D 6220 / Public Domain Mark 1.0"
       }
     ],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
-    "url": "http://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ExternalInterface&module=collection&objectId=29469&viewType=detailView",
-    "contentUrl": null,
-    "metadataVersion": 1,
     "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
     "source": "mds",
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2017-12-13T12:03:09.000Z",
-    "registered": "2017-12-13T12:03:09.000Z",
-    "published": "1590",
-    "updated": "2019-08-02T17:20:02.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\""
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "resourceTypeGeneral": "InteractiveResource",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    },
+    "updated": "2019-08-02T17:20:02.000Z",
+    "url": "http://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ExternalInterface&module=collection&objectId=29469&viewType=detailView",
+    "version": null
   },
+  "id": "10.16903/ethz-grs-d_006220",
   "relationships": {
     "client": {
       "data": {
@@ -79,5 +78,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_07.json b/python/tests/files/datacite/datacite_doc_07.json
index c70695b6..8e292fea 100644
--- a/python/tests/files/datacite/datacite_doc_07.json
+++ b/python/tests/files/datacite/datacite_doc_07.json
@@ -1,49 +1,72 @@
 {
-  "id": "10.18462/iir.icr.2015.0926",
-  "type": "dois",
   "attributes": {
-    "doi": "10.18462/iir.icr.2015.0926",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.18462/iir.icr.2015.0926",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2016-11-21T13:08:14.000Z",
     "creators": [
       {
-        "name": "ROTHUIZEN, E.",
-        "nameType": "Personal",
-        "givenName": "E.",
+        "affiliation": [],
         "familyName": "ROTHUIZEN",
-        "affiliation": []
+        "givenName": "E.",
+        "name": "ROTHUIZEN, E.",
+        "nameType": "Personal"
       },
       {
-        "name": "ELMEGAARD, B.",
-        "nameType": "Personal",
-        "givenName": "B.",
+        "affiliation": [],
         "familyName": "ELMEGAARD",
-        "affiliation": []
+        "givenName": "B.",
+        "name": "ELMEGAARD, B.",
+        "nameType": "Personal"
       },
       {
-        "name": "MARKUSSEN W., B.",
-        "nameType": "Personal",
-        "givenName": "B.",
+        "affiliation": [],
         "familyName": "MARKUSSEN W.",
-        "affiliation": []
+        "givenName": "B.",
+        "name": "MARKUSSEN W., B.",
+        "nameType": "Personal"
       },
       {
-        "name": "Et Al.",
-        "affiliation": []
+        "affiliation": [],
+        "name": "Et Al."
       }
     ],
-    "titles": [
+    "dates": [
       {
-        "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation."
+        "date": "2015",
+        "dateType": "Issued"
       }
     ],
-    "publisher": "International Institute of Refrigeration (IIR)",
-    "container": {},
+    "descriptions": [
+      {
+        "description": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "doi": "10.18462/iir.icr.2015.0926",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.18462/iir.icr.2015.0926",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "eng",
+    "metadataVersion": 0,
     "publicationYear": 2015,
+    "published": "2015",
+    "publisher": "International Institute of Refrigeration (IIR)",
+    "reason": null,
+    "registered": "2016-11-21T13:08:14.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": null,
+    "sizes": [],
+    "source": null,
+    "state": "findable",
     "subjects": [
       {
         "subject": "HEAT PUMP"
@@ -67,48 +90,24 @@
         "subject": "MODEL"
       }
     ],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2015",
-        "dateType": "Issued"
+        "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation."
       }
     ],
-    "language": "eng",
     "types": {
-      "ris": "DATA",
       "bibtex": "misc",
       "citeproc": "dataset",
-      "schemaOrg": "Dataset",
       "resourceType": "Dataset",
-      "resourceTypeGeneral": "Dataset"
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [
-      {
-        "description": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
-        "descriptionType": "Abstract"
-      }
-    ],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-16T18:00:59.000Z",
     "url": "http://www.iifiir.org/clientBookline/service/reference.asp?INSTANCE=EXPLOITATION&OUTPUT=PORTAL&DOCID=IFD_REFDOC_0015008&DOCBASE=IFD_REFDOC_EN&SETLANGUAGE=EN",
-    "contentUrl": null,
-    "metadataVersion": 0,
-    "schemaVersion": null,
-    "source": null,
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2016-11-21T13:08:14.000Z",
-    "registered": "2016-11-21T13:08:14.000Z",
-    "published": "2015",
-    "updated": "2019-08-16T18:00:59.000Z"
+    "version": null
   },
+  "id": "10.18462/iir.icr.2015.0926",
   "relationships": {
     "client": {
       "data": {
@@ -116,5 +115,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_08.json b/python/tests/files/datacite/datacite_doc_08.json
index e9170788..84f756e8 100644
--- a/python/tests/files/datacite/datacite_doc_08.json
+++ b/python/tests/files/datacite/datacite_doc_08.json
@@ -1,40 +1,63 @@
 {
-  "id": "10.22004/ag.econ.284864",
-  "type": "dois",
   "attributes": {
-    "doi": "10.22004/ag.econ.284864",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.22004/ag.econ.284864",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2019-08-24T07:46:47.000Z",
     "creators": [
       {
-        "name": "Kajisa, Kei",
-        "nameType": "Personal",
-        "givenName": "Kei",
-        "familyName": "Kajisa",
         "affiliation": [],
-        "nameIdentifiers": []
+        "familyName": "Kajisa",
+        "givenName": "Kei",
+        "name": "Kajisa, Kei",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       },
       {
-        "name": "Kajisa, Kei",
-        "nameType": "Personal",
-        "givenName": "Kei",
-        "familyName": "Kajisa",
         "affiliation": [],
-        "nameIdentifiers": []
+        "familyName": "Kajisa",
+        "givenName": "Kei",
+        "name": "Kajisa, Kei",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       }
     ],
-    "titles": [
+    "dates": [
       {
-        "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India"
+        "date": "2017",
+        "dateType": "Issued"
       }
     ],
-    "publisher": "Unknown",
-    "container": {},
+    "descriptions": [
+      {
+        "description": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan’s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "doi": "10.22004/ag.econ.284864",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.22004/ag.econ.284864",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "eng",
+    "metadataVersion": 1,
     "publicationYear": 2017,
+    "published": "2017",
+    "publisher": "Unknown",
+    "reason": null,
+    "registered": "2019-08-24T07:46:47.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": null,
+    "sizes": [],
+    "source": "mds",
+    "state": "findable",
     "subjects": [
       {
         "subject": "Land Economics/Use"
@@ -52,48 +75,24 @@
         "subjectScheme": "keyword"
       }
     ],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2017",
-        "dateType": "Issued"
+        "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India"
       }
     ],
-    "language": "eng",
     "types": {
-      "ris": "RPRT",
       "bibtex": "article",
       "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
       "resourceType": "Text",
-      "resourceTypeGeneral": "Text"
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [
-      {
-        "description": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan’s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
-        "descriptionType": "Abstract"
-      }
-    ],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-25T09:38:33.000Z",
     "url": "https://ageconsearch.umn.edu/record/284864",
-    "contentUrl": null,
-    "metadataVersion": 1,
-    "schemaVersion": null,
-    "source": "mds",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2019-08-24T07:46:47.000Z",
-    "registered": "2019-08-24T07:46:47.000Z",
-    "published": "2017",
-    "updated": "2019-08-25T09:38:33.000Z"
+    "version": null
   },
+  "id": "10.22004/ag.econ.284864",
   "relationships": {
     "client": {
       "data": {
@@ -101,5 +100,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_09.json b/python/tests/files/datacite/datacite_doc_09.json
index d09af545..d6617d0d 100644
--- a/python/tests/files/datacite/datacite_doc_09.json
+++ b/python/tests/files/datacite/datacite_doc_09.json
@@ -1,8 +1,46 @@
 {
-  "id": "10.2314/gbv:880813733",
-  "type": "dois",
   "attributes": {
+    "container": {},
+    "contentUrl": null,
+    "contributors": [
+      {
+        "affiliation": [],
+        "contributorType": "HostingInstitution",
+        "name": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover",
+        "nameIdentifiers": [],
+        "nameType": "Organizational"
+      },
+      {
+        "affiliation": [],
+        "contributorType": "DataManager",
+        "name": "Technische Informationsbibliothek (TIB)",
+        "nameIdentifiers": []
+      }
+    ],
+    "created": "2017-02-25T00:00:18.000Z",
+    "creators": [
+      {
+        "affiliation": [],
+        "familyName": "Kirstaedter",
+        "givenName": "Nils",
+        "name": "Kirstaedter, Nils",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
+      }
+    ],
+    "dates": [
+      {
+        "date": "2016",
+        "dateType": "Issued"
+      }
+    ],
+    "descriptions": [],
     "doi": "10.2314/gbv:880813733",
+    "formats": [
+      "application/pdf"
+    ],
+    "fundingReferences": [],
+    "geoLocations": [],
     "identifiers": [
       {
         "identifier": "https://doi.org/10.2314/gbv:880813733",
@@ -29,32 +67,22 @@
         "identifierType": "ftx-id"
       }
     ],
-    "creators": [
-      {
-        "name": "Kirstaedter, Nils",
-        "nameType": "Personal",
-        "givenName": "Nils",
-        "familyName": "Kirstaedter",
-        "affiliation": [],
-        "nameIdentifiers": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht"
-      },
-      {
-        "title": "Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul)",
-        "titleType": "AlternativeTitle"
-      },
-      {
-        "title": "Direktdiodenlaseranlagen und -systeme (VP3)",
-        "titleType": "AlternativeTitle"
-      }
-    ],
-    "publisher": "[Lumics GmbH]",
-    "container": {},
+    "isActive": true,
+    "language": "de",
+    "metadataVersion": 9,
     "publicationYear": 2016,
+    "published": "2016",
+    "publisher": "[Lumics GmbH]",
+    "reason": null,
+    "registered": "2017-02-25T00:00:19.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [
+      "1 Online-Ressource (10 Seiten, 1,40 MB)"
+    ],
+    "source": "mds",
+    "state": "findable",
     "subjects": [
       {
         "subject": "Direktdiodenlasersysteme"
@@ -64,61 +92,32 @@
         "subjectScheme": "linsearch"
       }
     ],
-    "contributors": [
+    "titles": [
       {
-        "name": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover",
-        "nameType": "Organizational",
-        "affiliation": [],
-        "contributorType": "HostingInstitution",
-        "nameIdentifiers": []
+        "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht"
       },
       {
-        "name": "Technische Informationsbibliothek (TIB)",
-        "affiliation": [],
-        "contributorType": "DataManager",
-        "nameIdentifiers": []
-      }
-    ],
-    "dates": [
+        "title": "Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul)",
+        "titleType": "AlternativeTitle"
+      },
       {
-        "date": "2016",
-        "dateType": "Issued"
+        "title": "Direktdiodenlaseranlagen und -systeme (VP3)",
+        "titleType": "AlternativeTitle"
       }
     ],
-    "language": "de",
     "types": {
-      "ris": "RPRT",
       "bibtex": "article",
       "citeproc": "report",
-      "schemaOrg": "ScholarlyArticle",
       "resourceType": "Report",
-      "resourceTypeGeneral": "Text"
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
     },
-    "relatedIdentifiers": [],
-    "sizes": [
-      "1 Online-Ressource (10 Seiten, 1,40 MB)"
-    ],
-    "formats": [
-      "application/pdf"
-    ],
-    "version": "1.0",
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-03T05:53:51.000Z",
     "url": "https://www.tib.eu/suchen/id/TIBKAT:880813733/",
-    "contentUrl": null,
-    "metadataVersion": 9,
-    "schemaVersion": "http://datacite.org/schema/kernel-4",
-    "source": "mds",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2017-02-25T00:00:18.000Z",
-    "registered": "2017-02-25T00:00:19.000Z",
-    "published": "2016",
-    "updated": "2019-08-03T05:53:51.000Z"
+    "version": "1.0"
   },
+  "id": "10.2314/gbv:880813733",
   "relationships": {
     "client": {
       "data": {
@@ -126,5 +125,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_10.json b/python/tests/files/datacite/datacite_doc_10.json
index d40fc272..154242cb 100644
--- a/python/tests/files/datacite/datacite_doc_10.json
+++ b/python/tests/files/datacite/datacite_doc_10.json
@@ -1,28 +1,50 @@
 {
-  "id": "10.25549/wpacards-m6171",
-  "type": "dois",
   "attributes": {
-    "doi": "10.25549/wpacards-m6171",
-    "identifiers": [
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2018-09-09T08:32:09.000Z",
+    "creators": [
       {
-        "identifier": "https://doi.org/10.25549/wpacards-m6171",
-        "identifierType": "DOI"
+        "affiliation": [],
+        "name": "Unknown"
       }
     ],
-    "creators": [
+    "dates": [
       {
-        "name": "Unknown",
-        "affiliation": []
+        "date": "2012",
+        "dateType": "Issued"
       }
     ],
-    "titles": [
+    "descriptions": [
       {
-        "title": "WPA household census for 210 E VERNON, Los Angeles"
+        "descriptionType": "Abstract"
       }
     ],
-    "publisher": "University of Southern California Digital Library (USC.DL)",
-    "container": {},
+    "doi": "10.25549/wpacards-m6171",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.25549/wpacards-m6171",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "eng",
+    "metadataVersion": 0,
     "publicationYear": 2012,
+    "published": "2012",
+    "publisher": "University of Southern California Digital Library (USC.DL)",
+    "reason": null,
+    "registered": "2018-09-09T08:33:10.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [],
+    "source": "mds",
+    "state": "findable",
     "subjects": [
       {
         "subject": "housing areas"
@@ -31,47 +53,24 @@
         "subject": "Dwellings"
       }
     ],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2012",
-        "dateType": "Issued"
+        "title": "WPA household census for 210 E VERNON, Los Angeles"
       }
     ],
-    "language": "eng",
     "types": {
-      "ris": "DATA",
       "bibtex": "misc",
       "citeproc": "dataset",
-      "schemaOrg": "Dataset",
       "resourceType": "Dataset",
-      "resourceTypeGeneral": "Dataset"
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [
-      {
-        "descriptionType": "Abstract"
-      }
-    ],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-02T20:03:32.000Z",
     "url": "http://digitallibrary.usc.edu/cdm/ref/collection/p15799coll8/id/2608",
-    "contentUrl": null,
-    "metadataVersion": 0,
-    "schemaVersion": "http://datacite.org/schema/kernel-4",
-    "source": "mds",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2018-09-09T08:32:09.000Z",
-    "registered": "2018-09-09T08:33:10.000Z",
-    "published": "2012",
-    "updated": "2019-08-02T20:03:32.000Z"
+    "version": null
   },
+  "id": "10.25549/wpacards-m6171",
   "relationships": {
     "client": {
       "data": {
@@ -79,5 +78,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_11.json b/python/tests/files/datacite/datacite_doc_11.json
index 50fe8363..80194762 100644
--- a/python/tests/files/datacite/datacite_doc_11.json
+++ b/python/tests/files/datacite/datacite_doc_11.json
@@ -1,30 +1,15 @@
 {
-  "id": "10.3932/ethz-a-000055869",
-  "type": "dois",
   "attributes": {
-    "doi": "10.3932/ethz-a-000055869",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.3932/ethz-a-000055869",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2019-03-04T23:56:42.000Z",
     "creators": [
       {
-        "name": "Comet Photo AG (Zürich)",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "N1 bei Safenwil"
+        "affiliation": [],
+        "name": "Comet Photo AG (Zürich)"
       }
     ],
-    "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
-    "container": {},
-    "publicationYear": 1965,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1965",
@@ -35,21 +20,6 @@
         "dateType": "Issued"
       }
     ],
-    "language": "de",
-    "types": {
-      "ris": "FIGURE",
-      "bibtex": "misc",
-      "citeproc": "graphic",
-      "schemaOrg": "ImageObject",
-      "resourceTypeGeneral": "Image"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [
-      "TIFF-Bild"
-    ],
-    "version": null,
-    "rightsList": [],
     "descriptions": [
       {
         "description": "Download und Nutzung frei",
@@ -60,21 +30,50 @@
         "descriptionType": "Other"
       }
     ],
-    "geoLocations": [],
+    "doi": "10.3932/ethz-a-000055869",
+    "formats": [
+      "TIFF-Bild"
+    ],
     "fundingReferences": [],
-    "url": "http://ba.e-pics.ethz.ch/link.jsp?id=44861",
-    "contentUrl": null,
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.3932/ethz-a-000055869",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": "de",
     "metadataVersion": 6,
+    "publicationYear": 1965,
+    "published": "1965",
+    "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
+    "reason": null,
+    "registered": "2019-07-30T13:17:45.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
     "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
     "source": "mds",
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2019-03-04T23:56:42.000Z",
-    "registered": "2019-07-30T13:17:45.000Z",
-    "published": "1965",
-    "updated": "2019-08-02T22:08:26.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "N1 bei Safenwil"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "graphic",
+      "resourceTypeGeneral": "Image",
+      "ris": "FIGURE",
+      "schemaOrg": "ImageObject"
+    },
+    "updated": "2019-08-02T22:08:26.000Z",
+    "url": "http://ba.e-pics.ethz.ch/link.jsp?id=44861",
+    "version": null
   },
+  "id": "10.3932/ethz-a-000055869",
   "relationships": {
     "client": {
       "data": {
@@ -82,5 +81,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_12.json b/python/tests/files/datacite/datacite_doc_12.json
index 31c0f0ca..642011d5 100644
--- a/python/tests/files/datacite/datacite_doc_12.json
+++ b/python/tests/files/datacite/datacite_doc_12.json
@@ -1,58 +1,43 @@
 {
-  "id": "10.5167/uzh-171449",
-  "type": "dois",
   "attributes": {
-    "doi": "10.5167/uzh-171449",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.5167/uzh-171449",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2019-06-27T01:01:35.000Z",
     "creators": [
       {
-        "name": "Spanias, Charalampos",
-        "nameType": "Personal",
-        "givenName": "Charalampos",
-        "familyName": "Spanias",
         "affiliation": [],
-        "nameIdentifiers": []
+        "familyName": "Spanias",
+        "givenName": "Charalampos",
+        "name": "Spanias, Charalampos",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       },
       {
-        "name": "Nikolaidis, Pantelis T",
-        "nameType": "Personal",
-        "givenName": "Pantelis T",
-        "familyName": "Nikolaidis",
         "affiliation": [],
-        "nameIdentifiers": []
+        "familyName": "Nikolaidis",
+        "givenName": "Pantelis T",
+        "name": "Nikolaidis, Pantelis T",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       },
       {
-        "name": "Rosemann, Thomas",
-        "nameType": "Personal",
-        "givenName": "Thomas",
-        "familyName": "Rosemann",
         "affiliation": [],
-        "nameIdentifiers": []
+        "familyName": "Rosemann",
+        "givenName": "Thomas",
+        "name": "Rosemann, Thomas",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       },
       {
-        "name": "Knechtle, Beat",
-        "nameType": "Personal",
-        "givenName": "Beat",
-        "familyName": "Knechtle",
         "affiliation": [],
-        "nameIdentifiers": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review"
+        "familyName": "Knechtle",
+        "givenName": "Beat",
+        "name": "Knechtle, Beat",
+        "nameIdentifiers": [],
+        "nameType": "Personal"
       }
     ],
-    "publisher": "MDPI Publishing",
-    "container": {},
-    "publicationYear": 2019,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "2019-06-14",
@@ -63,35 +48,49 @@
         "dateType": "Issued"
       }
     ],
-    "language": null,
-    "types": {
-      "ris": "RPRT",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceTypeGeneral": "Text"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
     "descriptions": [],
-    "geoLocations": [],
+    "doi": "10.5167/uzh-171449",
+    "formats": [],
     "fundingReferences": [],
-    "url": "https://www.zora.uzh.ch/id/eprint/171449",
-    "contentUrl": null,
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5167/uzh-171449",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": null,
     "metadataVersion": 0,
+    "publicationYear": 2019,
+    "published": "2019",
+    "publisher": "MDPI Publishing",
+    "reason": null,
+    "registered": "2019-06-27T01:01:36.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
     "schemaVersion": null,
+    "sizes": [],
     "source": "mds",
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2019-06-27T01:01:35.000Z",
-    "registered": "2019-06-27T01:01:36.000Z",
-    "published": "2019",
-    "updated": "2019-09-26T16:44:24.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceTypeGeneral": "Text",
+      "ris": "RPRT",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-09-26T16:44:24.000Z",
+    "url": "https://www.zora.uzh.ch/id/eprint/171449",
+    "version": null
   },
+  "id": "10.5167/uzh-171449",
   "relationships": {
     "client": {
       "data": {
@@ -99,5 +98,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_13.json b/python/tests/files/datacite/datacite_doc_13.json
index ff6eb229..0cada273 100644
--- a/python/tests/files/datacite/datacite_doc_13.json
+++ b/python/tests/files/datacite/datacite_doc_13.json
@@ -1,37 +1,22 @@
 {
-  "id": "10.5169/seals-314104",
-  "type": "dois",
   "attributes": {
-    "doi": "10.5169/seals-314104",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.5169/seals-314104",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2013-03-22T14:02:08.000Z",
     "creators": [
       {
-        "name": "O.M.",
-        "affiliation": []
+        "affiliation": [],
+        "name": "O.M."
       },
       {
-        "name": "Hiltbrunner, Hermann",
-        "nameType": "Personal",
-        "givenName": "Hermann",
+        "affiliation": [],
         "familyName": "Hiltbrunner",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "[Müssen wir des Glücks uns schämen?]"
+        "givenName": "Hermann",
+        "name": "Hiltbrunner, Hermann",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "Buchdruckerei Büchler & Co.",
-    "container": {},
-    "publicationYear": 1940,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "1940-10-05",
@@ -42,39 +27,53 @@
         "dateType": "Issued"
       }
     ],
-    "language": null,
-    "types": {
-      "ris": "JOUR",
-      "bibtex": "article",
-      "citeproc": "article-journal",
-      "schemaOrg": "ScholarlyArticle",
-      "resourceType": "Journal Article",
-      "resourceTypeGeneral": "Text"
-    },
-    "relatedIdentifiers": [],
-    "sizes": [],
+    "descriptions": [],
+    "doi": "10.5169/seals-314104",
     "formats": [
       "text/html",
       "application/pdf"
     ],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
     "fundingReferences": [],
-    "url": "https://www.e-periodica.ch/digbib/view?pid=sle-001:1940-1941:45::13",
-    "contentUrl": null,
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5169/seals-314104",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
+    "language": null,
     "metadataVersion": 17,
+    "publicationYear": 1940,
+    "published": "1940",
+    "publisher": "Buchdruckerei Büchler & Co.",
+    "reason": null,
+    "registered": "2013-03-22T13:58:11.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
     "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
     "source": null,
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2013-03-22T14:02:08.000Z",
-    "registered": "2013-03-22T13:58:11.000Z",
-    "published": "1940",
-    "updated": "2019-08-02T02:22:55.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "[Müssen wir des Glücks uns schämen?]"
+      }
+    ],
+    "types": {
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "resourceType": "Journal Article",
+      "resourceTypeGeneral": "Text",
+      "ris": "JOUR",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "updated": "2019-08-02T02:22:55.000Z",
+    "url": "https://www.e-periodica.ch/digbib/view?pid=sle-001:1940-1941:45::13",
+    "version": null
   },
+  "id": "10.5169/seals-314104",
   "relationships": {
     "client": {
       "data": {
@@ -82,5 +81,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_14.json b/python/tests/files/datacite/datacite_doc_14.json
index b1e1ebf2..c0911819 100644
--- a/python/tests/files/datacite/datacite_doc_14.json
+++ b/python/tests/files/datacite/datacite_doc_14.json
@@ -1,84 +1,119 @@
 {
-  "id": "10.5517/cc7gns3",
-  "type": "dois",
   "attributes": {
-    "doi": "10.5517/cc7gns3",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.5517/cc7gns3",
-        "identifierType": "DOI"
-      },
-      {
-        "identifier": "222635",
-        "identifierType": "CCDC"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2014-03-18T07:28:28.000Z",
     "creators": [
       {
-        "name": "Stulz, E.",
-        "nameType": "Personal",
-        "givenName": "E.",
+        "affiliation": [],
         "familyName": "Stulz",
-        "affiliation": []
+        "givenName": "E.",
+        "name": "Stulz, E.",
+        "nameType": "Personal"
       },
       {
-        "name": "Scott, S.M.",
-        "nameType": "Personal",
-        "givenName": "S.M.",
+        "affiliation": [],
         "familyName": "Scott",
-        "affiliation": []
+        "givenName": "S.M.",
+        "name": "Scott, S.M.",
+        "nameType": "Personal"
       },
       {
-        "name": "Ng, Yiu-Fai",
-        "nameType": "Personal",
-        "givenName": "Yiu-Fai",
+        "affiliation": [],
         "familyName": "Ng",
-        "affiliation": []
+        "givenName": "Yiu-Fai",
+        "name": "Ng, Yiu-Fai",
+        "nameType": "Personal"
       },
       {
-        "name": "Bond, A.D.",
-        "nameType": "Personal",
-        "givenName": "A.D.",
+        "affiliation": [],
         "familyName": "Bond",
-        "affiliation": []
+        "givenName": "A.D.",
+        "name": "Bond, A.D.",
+        "nameType": "Personal"
       },
       {
-        "name": "Teat, S.J.",
-        "nameType": "Personal",
-        "givenName": "S.J.",
+        "affiliation": [],
         "familyName": "Teat",
-        "affiliation": []
+        "givenName": "S.J.",
+        "name": "Teat, S.J.",
+        "nameType": "Personal"
       },
       {
-        "name": "Darling, S.L.",
-        "nameType": "Personal",
-        "givenName": "S.L.",
+        "affiliation": [],
         "familyName": "Darling",
-        "affiliation": []
+        "givenName": "S.L.",
+        "name": "Darling, S.L.",
+        "nameType": "Personal"
       },
       {
-        "name": "Feeder, N.",
-        "nameType": "Personal",
-        "givenName": "N.",
+        "affiliation": [],
         "familyName": "Feeder",
-        "affiliation": []
+        "givenName": "N.",
+        "name": "Feeder, N.",
+        "nameType": "Personal"
       },
       {
-        "name": "Sanders, J.K.M.",
-        "nameType": "Personal",
-        "givenName": "J.K.M.",
+        "affiliation": [],
         "familyName": "Sanders",
-        "affiliation": []
+        "givenName": "J.K.M.",
+        "name": "Sanders, J.K.M.",
+        "nameType": "Personal"
       }
     ],
-    "titles": [
+    "dates": [
       {
-        "title": "CCDC 222635: Experimental Crystal Structure Determination"
+        "date": "2004",
+        "dateType": "Issued"
       }
     ],
-    "publisher": "Cambridge Crystallographic Data Centre",
-    "container": {},
+    "descriptions": [
+      {
+        "description": "Related Article: E.Stulz, S.M.Scott, Yiu-Fai Ng, A.D.Bond, S.J.Teat, S.L.Darling, N.Feeder, J.K.M.Sanders|2003|Inorg.Chem.|42|6564|doi:10.1021/ic034699w",
+        "descriptionType": "Other"
+      },
+      {
+        "description": "An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "doi": "10.5517/cc7gns3",
+    "formats": [
+      "CIF"
+    ],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.5517/cc7gns3",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "222635",
+        "identifierType": "CCDC"
+      }
+    ],
+    "isActive": true,
+    "language": "eng",
+    "metadataVersion": 2,
     "publicationYear": 2004,
+    "published": "2004",
+    "publisher": "Cambridge Crystallographic Data Centre",
+    "reason": null,
+    "registered": "2014-03-18T07:28:29.000Z",
+    "relatedIdentifiers": [
+      {
+        "relatedIdentifier": "10.1021/ic034699w",
+        "relatedIdentifierType": "DOI",
+        "relationType": "IsSupplementTo"
+      }
+    ],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
+    "source": null,
+    "state": "findable",
     "subjects": [
       {
         "subject": "Crystal Structure"
@@ -102,59 +137,23 @@
         "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
       }
     ],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2004",
-        "dateType": "Issued"
+        "title": "CCDC 222635: Experimental Crystal Structure Determination"
       }
     ],
-    "language": "eng",
     "types": {
-      "ris": "DATA",
       "bibtex": "misc",
       "citeproc": "dataset",
-      "schemaOrg": "Dataset",
-      "resourceTypeGeneral": "Dataset"
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
     },
-    "relatedIdentifiers": [
-      {
-        "relationType": "IsSupplementTo",
-        "relatedIdentifier": "10.1021/ic034699w",
-        "relatedIdentifierType": "DOI"
-      }
-    ],
-    "sizes": [],
-    "formats": [
-      "CIF"
-    ],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [
-      {
-        "description": "Related Article: E.Stulz, S.M.Scott, Yiu-Fai Ng, A.D.Bond, S.J.Teat, S.L.Darling, N.Feeder, J.K.M.Sanders|2003|Inorg.Chem.|42|6564|doi:10.1021/ic034699w",
-        "descriptionType": "Other"
-      },
-      {
-        "description": "An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
-        "descriptionType": "Abstract"
-      }
-    ],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-02T03:38:32.000Z",
     "url": "http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/cc7gns3&sid=DataCite",
-    "contentUrl": null,
-    "metadataVersion": 2,
-    "schemaVersion": "http://datacite.org/schema/kernel-3",
-    "source": null,
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2014-03-18T07:28:28.000Z",
-    "registered": "2014-03-18T07:28:29.000Z",
-    "published": "2004",
-    "updated": "2019-08-02T03:38:32.000Z"
+    "version": null
   },
+  "id": "10.5517/cc7gns3",
   "relationships": {
     "client": {
       "data": {
@@ -162,5 +161,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_15.json b/python/tests/files/datacite/datacite_doc_15.json
index 5b4ee8ec..8dc67267 100644
--- a/python/tests/files/datacite/datacite_doc_15.json
+++ b/python/tests/files/datacite/datacite_doc_15.json
@@ -1,8 +1,29 @@
 {
-  "id": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
-  "type": "dois",
   "attributes": {
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2017-02-01T18:20:04.000Z",
+    "creators": [
+      {
+        "affiliation": [],
+        "familyName": "Richardson",
+        "givenName": "David",
+        "name": "Richardson, David",
+        "nameType": "Personal"
+      }
+    ],
+    "dates": [
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "descriptions": [],
     "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
     "identifiers": [
       {
         "identifier": "https://doi.org/10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
@@ -13,61 +34,39 @@
         "identifierType": "URL"
       }
     ],
-    "creators": [
-      {
-        "name": "Richardson, David",
-        "nameType": "Personal",
-        "givenName": "David",
-        "familyName": "Richardson",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997"
-      }
-    ],
-    "publisher": "Environmental Data Initiative",
-    "container": {},
+    "isActive": true,
+    "language": null,
+    "metadataVersion": 1,
     "publicationYear": 2017,
+    "published": "2017",
+    "publisher": "Environmental Data Initiative",
+    "reason": null,
+    "registered": "2017-02-01T18:20:05.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-2.2",
+    "sizes": [],
+    "source": null,
+    "state": "findable",
     "subjects": [],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2017",
-        "dateType": "Issued"
+        "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997"
       }
     ],
-    "language": null,
     "types": {
-      "ris": "DATA",
       "bibtex": "misc",
       "citeproc": "dataset",
-      "schemaOrg": "Dataset",
       "resourceType": "dataPackage",
-      "resourceTypeGeneral": "Dataset"
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-02T14:16:49.000Z",
     "url": "https://portal.lternet.edu/nis/mapbrowse?packageid=knb-lter-vcr.102.16",
-    "contentUrl": null,
-    "metadataVersion": 1,
-    "schemaVersion": "http://datacite.org/schema/kernel-2.2",
-    "source": null,
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2017-02-01T18:20:04.000Z",
-    "registered": "2017-02-01T18:20:05.000Z",
-    "published": "2017",
-    "updated": "2019-08-02T14:16:49.000Z"
+    "version": null
   },
+  "id": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28",
   "relationships": {
     "client": {
       "data": {
@@ -75,5 +74,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_16.json b/python/tests/files/datacite/datacite_doc_16.json
index 5af7fbe1..72ad59ac 100644
--- a/python/tests/files/datacite/datacite_doc_16.json
+++ b/python/tests/files/datacite/datacite_doc_16.json
@@ -1,74 +1,73 @@
 {
-  "id": "10.6084/m9.figshare.1282478",
-  "type": "dois",
   "attributes": {
-    "doi": "10.6084/m9.figshare.1282478",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.6084/m9.figshare.1282478",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2014-12-31T15:38:16.000Z",
     "creators": [
       {
-        "name": "Sochi, Taha",
-        "nameType": "Personal",
-        "givenName": "Taha",
+        "affiliation": [],
         "familyName": "Sochi",
-        "affiliation": []
-      }
-    ],
-    "titles": [
-      {
-        "title": "Testing the Connectivity of Networks"
+        "givenName": "Taha",
+        "name": "Sochi, Taha",
+        "nameType": "Personal"
       }
     ],
-    "publisher": "Figshare",
-    "container": {},
-    "publicationYear": 2014,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "2014",
         "dateType": "Issued"
       }
     ],
+    "descriptions": [],
+    "doi": "10.6084/m9.figshare.1282478",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.6084/m9.figshare.1282478",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
     "language": null,
-    "types": {
-      "ris": "DATA",
-      "bibtex": "misc",
-      "citeproc": "dataset",
-      "schemaOrg": "Dataset",
-      "resourceType": "Paper",
-      "resourceTypeGeneral": "Dataset"
-    },
+    "metadataVersion": 0,
+    "publicationYear": 2014,
+    "published": "2014",
+    "publisher": "Figshare",
+    "reason": null,
+    "registered": "2014-12-31T15:38:18.000Z",
     "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
     "rightsList": [
       {
         "rights": "CC-BY",
         "rightsUri": "http://creativecommons.org/licenses/by/3.0/us"
       }
     ],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
-    "url": "http://figshare.com/articles/Testing_the_Connectivity_of_Networks/1282478",
-    "contentUrl": null,
-    "metadataVersion": 0,
     "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
     "source": null,
-    "isActive": true,
     "state": "findable",
-    "reason": null,
-    "created": "2014-12-31T15:38:16.000Z",
-    "registered": "2014-12-31T15:38:18.000Z",
-    "published": "2014",
-    "updated": "2019-08-02T04:52:11.000Z"
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Testing the Connectivity of Networks"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "dataset",
+      "resourceType": "Paper",
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
+    },
+    "updated": "2019-08-02T04:52:11.000Z",
+    "url": "http://figshare.com/articles/Testing_the_Connectivity_of_Networks/1282478",
+    "version": null
   },
+  "id": "10.6084/m9.figshare.1282478",
   "relationships": {
     "client": {
       "data": {
@@ -76,5 +75,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_17.json b/python/tests/files/datacite/datacite_doc_17.json
index f1363a61..93ec715e 100644
--- a/python/tests/files/datacite/datacite_doc_17.json
+++ b/python/tests/files/datacite/datacite_doc_17.json
@@ -1,66 +1,65 @@
 {
-  "id": "10.7910/dvn/tsqfwc/yytj22",
-  "type": "dois",
   "attributes": {
-    "doi": "10.7910/dvn/tsqfwc/yytj22",
-    "identifiers": [
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2018-08-22T17:36:10.000Z",
+    "creators": [
       {
-        "identifier": "https://doi.org/10.7910/dvn/tsqfwc/yytj22",
-        "identifierType": "DOI"
+        "affiliation": [],
+        "name": "Di Giovanna, Antonino Paolo (University Of Florence)",
+        "nameType": "Personal"
       }
     ],
-    "creators": [
+    "dates": [
       {
-        "name": "Di Giovanna, Antonino Paolo (University Of Florence)",
-        "nameType": "Personal",
-        "affiliation": []
+        "date": "2018",
+        "dateType": "Issued"
       }
     ],
-    "titles": [
+    "descriptions": [],
+    "doi": "10.7910/dvn/tsqfwc/yytj22",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
       {
-        "title": "gel_BSA-FITC_Markov_segmntation0343.tif"
+        "identifier": "https://doi.org/10.7910/dvn/tsqfwc/yytj22",
+        "identifierType": "DOI"
       }
     ],
-    "publisher": "Harvard Dataverse",
-    "container": {},
+    "isActive": true,
+    "language": null,
+    "metadataVersion": 0,
     "publicationYear": 2018,
+    "published": "2018",
+    "publisher": "Harvard Dataverse",
+    "reason": null,
+    "registered": "2018-08-22T17:37:30.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "sizes": [],
+    "source": "mds",
+    "state": "findable",
     "subjects": [],
-    "contributors": [],
-    "dates": [
+    "titles": [
       {
-        "date": "2018",
-        "dateType": "Issued"
+        "title": "gel_BSA-FITC_Markov_segmntation0343.tif"
       }
     ],
-    "language": null,
     "types": {
-      "ris": "DATA",
       "bibtex": "misc",
       "citeproc": "dataset",
-      "schemaOrg": "Dataset",
-      "resourceTypeGeneral": "Dataset"
+      "resourceTypeGeneral": "Dataset",
+      "ris": "DATA",
+      "schemaOrg": "Dataset"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-02T19:43:20.000Z",
     "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TSQFWC/YYTJ22",
-    "contentUrl": null,
-    "metadataVersion": 0,
-    "schemaVersion": "http://datacite.org/schema/kernel-4",
-    "source": "mds",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2018-08-22T17:36:10.000Z",
-    "registered": "2018-08-22T17:37:30.000Z",
-    "published": "2018",
-    "updated": "2019-08-02T19:43:20.000Z"
+    "version": null
   },
+  "id": "10.7910/dvn/tsqfwc/yytj22",
   "relationships": {
     "client": {
       "data": {
@@ -68,5 +67,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_18.json b/python/tests/files/datacite/datacite_doc_18.json
index f6bc81a6..b5c41b68 100644
--- a/python/tests/files/datacite/datacite_doc_18.json
+++ b/python/tests/files/datacite/datacite_doc_18.json
@@ -1,31 +1,16 @@
 {
-  "id": "10.7916/d81z522m",
-  "type": "dois",
   "attributes": {
-    "doi": "10.7916/d81z522m",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.7916/d81z522m",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2017-11-29T02:15:31.000Z",
     "creators": [
       {
-        "name": "(:Unav)",
         "affiliation": [],
+        "name": "(:Unav)",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064"
-      }
-    ],
-    "publisher": "Columbia University",
-    "container": {},
-    "publicationYear": 2017,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "2017-08-21",
@@ -40,34 +25,48 @@
         "dateType": "Issued"
       }
     ],
+    "descriptions": [],
+    "doi": "10.7916/d81z522m",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.7916/d81z522m",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
     "language": null,
+    "metadataVersion": 2,
+    "publicationYear": 2017,
+    "published": "2017",
+    "publisher": "Columbia University",
+    "reason": null,
+    "registered": "2017-11-29T02:15:32.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
+    "source": "ez",
+    "state": "findable",
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064"
+      }
+    ],
     "types": {
-      "ris": "GEN",
       "bibtex": "misc",
       "citeproc": "article",
+      "ris": "GEN",
       "schemaOrg": "CreativeWork"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-04T13:17:58.000Z",
     "url": "https://dlc.library.columbia.edu/lcaaj/cul:k3j9kd52d6",
-    "contentUrl": null,
-    "metadataVersion": 2,
-    "schemaVersion": "http://datacite.org/schema/kernel-3",
-    "source": "ez",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2017-11-29T02:15:31.000Z",
-    "registered": "2017-11-29T02:15:32.000Z",
-    "published": "2017",
-    "updated": "2019-08-04T13:17:58.000Z"
+    "version": null
   },
+  "id": "10.7916/d81z522m",
   "relationships": {
     "client": {
       "data": {
@@ -75,5 +74,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_19.json b/python/tests/files/datacite/datacite_doc_19.json
index c0bc25ba..9fbe7372 100644
--- a/python/tests/files/datacite/datacite_doc_19.json
+++ b/python/tests/files/datacite/datacite_doc_19.json
@@ -1,31 +1,16 @@
 {
-  "id": "10.7916/d86x0cg1",
-  "type": "dois",
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
-    "identifiers": [
-      {
-        "identifier": "https://doi.org/10.7916/d86x0cg1",
-        "identifierType": "DOI"
-      }
-    ],
+    "container": {},
+    "contentUrl": null,
+    "contributors": [],
+    "created": "2017-11-29T09:29:33.000Z",
     "creators": [
       {
-        "name": "(:Unav)",
         "affiliation": [],
+        "name": "(:Unav)",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092"
-      }
-    ],
-    "publisher": "Columbia University",
-    "container": {},
-    "publicationYear": 2017,
-    "subjects": [],
-    "contributors": [],
     "dates": [
       {
         "date": "2017-08-24",
@@ -40,34 +25,48 @@
         "dateType": "Issued"
       }
     ],
+    "descriptions": [],
+    "doi": "10.7916/d86x0cg1",
+    "formats": [],
+    "fundingReferences": [],
+    "geoLocations": [],
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.7916/d86x0cg1",
+        "identifierType": "DOI"
+      }
+    ],
+    "isActive": true,
     "language": null,
+    "metadataVersion": 3,
+    "publicationYear": 2017,
+    "published": "2017",
+    "publisher": "Columbia University",
+    "reason": null,
+    "registered": "2017-11-29T09:29:34.000Z",
+    "relatedIdentifiers": [],
+    "rightsList": [],
+    "schemaVersion": "http://datacite.org/schema/kernel-3",
+    "sizes": [],
+    "source": "ez",
+    "state": "findable",
+    "subjects": [],
+    "titles": [
+      {
+        "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092"
+      }
+    ],
     "types": {
-      "ris": "GEN",
       "bibtex": "misc",
       "citeproc": "article",
+      "ris": "GEN",
       "schemaOrg": "CreativeWork"
     },
-    "relatedIdentifiers": [],
-    "sizes": [],
-    "formats": [],
-    "version": null,
-    "rightsList": [],
-    "descriptions": [],
-    "geoLocations": [],
-    "fundingReferences": [],
+    "updated": "2019-08-04T23:43:40.000Z",
     "url": "https://dlc.library.columbia.edu/lcaaj/cul:44j0zpc98s",
-    "contentUrl": null,
-    "metadataVersion": 3,
-    "schemaVersion": "http://datacite.org/schema/kernel-3",
-    "source": "ez",
-    "isActive": true,
-    "state": "findable",
-    "reason": null,
-    "created": "2017-11-29T09:29:33.000Z",
-    "registered": "2017-11-29T09:29:34.000Z",
-    "published": "2017",
-    "updated": "2019-08-04T23:43:40.000Z"
+    "version": null
   },
+  "id": "10.7916/d86x0cg1",
   "relationships": {
     "client": {
       "data": {
@@ -75,5 +74,6 @@
         "type": "clients"
       }
     }
-  }
+  },
+  "type": "dois"
 }
diff --git a/python/tests/files/datacite/datacite_doc_20.json b/python/tests/files/datacite/datacite_doc_20.json
index cc6cc1fb..7126ee37 100644
--- a/python/tests/files/datacite/datacite_doc_20.json
+++ b/python/tests/files/datacite/datacite_doc_20.json
@@ -1,19 +1,12 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
     "creators": [
       {
-        "name": "(:Unav)",
         "affiliation": [],
+        "name": "(:Unav)",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "<h1>Eastern questionnaire</h1>"
-      }
-    ],
-    "publicationYear": 2017,
     "dates": [
       {
         "date": "2017-08-24",
@@ -28,14 +21,21 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
+    "isActive": true,
     "language": null,
+    "publicationYear": 2017,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "<h1>Eastern questionnaire</h1>"
+      }
+    ],
     "types": {
-      "ris": "GEN",
       "bibtex": "misc",
       "citeproc": "article",
+      "ris": "GEN",
       "schemaOrg": "CreativeWork"
-    },
-    "isActive": true,
-    "state": "findable"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_21.json b/python/tests/files/datacite/datacite_doc_21.json
index 04b196a6..248879c2 100644
--- a/python/tests/files/datacite/datacite_doc_21.json
+++ b/python/tests/files/datacite/datacite_doc_21.json
@@ -1,26 +1,12 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
     "creators": [
       {
-        "name": "(:Unav)",
         "affiliation": [],
+        "name": "(:Unav)",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "ABC"
-      }
-    ],
-    "publicationYear": 2017,
-    "language": "GERMAN",
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork"
-    },
     "dates": [
       {
         "date": "2017-08-24",
@@ -35,7 +21,21 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
     "isActive": true,
-    "state": "findable"
+    "language": "GERMAN",
+    "publicationYear": 2017,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_22.json b/python/tests/files/datacite/datacite_doc_22.json
index 365b1361..0f7c5e57 100644
--- a/python/tests/files/datacite/datacite_doc_22.json
+++ b/python/tests/files/datacite/datacite_doc_22.json
@@ -1,28 +1,14 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
     "creators": [
       {
-        "name": "Anton Welch",
         "affiliation": [
           "Department of pataphysics"
         ],
+        "name": "Anton Welch",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "ABC"
-      }
-    ],
-    "publicationYear": 2017,
-    "language": "GERMAN",
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork"
-    },
     "dates": [
       {
         "date": "2017-08-24",
@@ -37,7 +23,21 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
     "isActive": true,
-    "state": "findable"
+    "language": "GERMAN",
+    "publicationYear": 2017,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_23.json b/python/tests/files/datacite/datacite_doc_23.json
index 1dcdfc27..b755f1a5 100644
--- a/python/tests/files/datacite/datacite_doc_23.json
+++ b/python/tests/files/datacite/datacite_doc_23.json
@@ -1,28 +1,14 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1–xxx",
     "creators": [
       {
-        "name": "Anton Welch",
         "affiliation": [
           "Department of pataphysics"
         ],
+        "name": "Anton Welch",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "ABC"
-      }
-    ],
-    "publicationYear": 2017,
-    "language": "GERMAN",
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork"
-    },
     "dates": [
       {
         "date": "2017-08-24",
@@ -37,7 +23,21 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1–xxx",
     "isActive": true,
-    "state": "findable"
+    "language": "GERMAN",
+    "publicationYear": 2017,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "ABC"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_24.json b/python/tests/files/datacite/datacite_doc_24.json
index 4ea6945f..4023055b 100644
--- a/python/tests/files/datacite/datacite_doc_24.json
+++ b/python/tests/files/datacite/datacite_doc_24.json
@@ -1,32 +1,14 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
     "creators": [
       {
-        "name": "Anton Welch",
         "affiliation": [
           "Department of pataphysics"
         ],
+        "name": "Anton Welch",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "ABC"
-      },
-      {
-        "title": "DEF",
-        "titleType": "Subtitle"
-      }
-    ],
-    "publicationYear": 2016,
-    "language": "DE-CH",
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork"
-    },
     "dates": [
       {
         "date": "2017-08-24",
@@ -41,7 +23,25 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
     "isActive": true,
-    "state": "findable"
+    "language": "DE-CH",
+    "publicationYear": 2016,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "ABC"
+      },
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_25.json b/python/tests/files/datacite/datacite_doc_25.json
index 60cd0ab7..2b219728 100644
--- a/python/tests/files/datacite/datacite_doc_25.json
+++ b/python/tests/files/datacite/datacite_doc_25.json
@@ -1,32 +1,14 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
     "creators": [
       {
-        "name": "Anton Welch",
         "affiliation": [
           "Department of pataphysics"
         ],
+        "name": "Anton Welch",
         "nameIdentifiers": []
       }
     ],
-    "titles": [
-      {
-        "title": "Additional file 123: ABC"
-      },
-      {
-        "title": "DEF",
-        "titleType": "Subtitle"
-      }
-    ],
-    "publicationYear": 2016,
-    "language": "DE-CH",
-    "types": {
-      "ris": "GEN",
-      "bibtex": "misc",
-      "citeproc": "article",
-      "schemaOrg": "CreativeWork"
-    },
     "dates": [
       {
         "date": "2017-08-24",
@@ -41,7 +23,25 @@
         "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
     "isActive": true,
-    "state": "findable"
+    "language": "DE-CH",
+    "publicationYear": 2016,
+    "state": "findable",
+    "titles": [
+      {
+        "title": "Additional file 123: ABC"
+      },
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "types": {
+      "bibtex": "misc",
+      "citeproc": "article",
+      "ris": "GEN",
+      "schemaOrg": "CreativeWork"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json
index c2abb1b2..36fa565d 100644
--- a/python/tests/files/datacite/datacite_doc_26.json
+++ b/python/tests/files/datacite/datacite_doc_26.json
@@ -1,25 +1,43 @@
 {
   "attributes": {
-    "doi": "10.7916/d86x0cg1",
+    "contributors": [
+      {
+        "affiliation": [],
+        "contributorType": "Editor",
+        "familyName": "Wemmer",
+        "givenName": "David",
+        "name": "Wemmer, David",
+        "nameType": "Personal"
+      }
+    ],
     "creators": [
       {
-        "name": "Anton Welch",
         "affiliation": [
           "Department of pataphysics"
         ],
+        "name": "Anton Welch",
         "nameIdentifiers": []
       }
     ],
-    "contributors": [
+    "dates": [
       {
-        "name": "Wemmer, David",
-        "nameType": "Personal",
-        "givenName": "David",
-        "familyName": "Wemmer",
-        "affiliation": [],
-        "contributorType": "Editor"
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
       }
     ],
+    "doi": "10.7916/d86x0cg1",
+    "isActive": true,
+    "language": "DE-CH",
+    "publicationYear": 2016,
+    "state": "findable",
     "titles": [
       {
         "title": "Additional file 123: ABC"
@@ -29,29 +47,11 @@
         "titleType": "Subtitle"
       }
     ],
-    "publicationYear": 2016,
-    "language": "DE-CH",
     "types": {
-      "ris": "GEN",
       "bibtex": "misc",
       "citeproc": "article",
+      "ris": "GEN",
       "schemaOrg": "CreativeWork"
-    },
-    "dates": [
-      {
-        "date": "2017-08-24",
-        "dateType": "Created"
-      },
-      {
-        "date": "2019-08-04",
-        "dateType": "Updated"
-      },
-      {
-        "date": "2017",
-        "dateType": "Issued"
-      }
-    ],
-    "isActive": true,
-    "state": "findable"
+    }
   }
 }
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index 0a84e7bd..89450f9d 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -1,4 +1,24 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "given_name": "Qian-Jin",
+      "index": 0,
+      "raw_name": "Qian-Jin Li",
+      "role": "author",
+      "surname": "Li"
+    },
+    {
+      "given_name": "Chun-Long",
+      "index": 1,
+      "raw_name": "Chun-Long Yang",
+      "role": "author",
+      "surname": "Yang"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.1007/s10870-008-9413-z"
+  },
   "extra": {
     "container_name": "Journal of Chemical Crystallography",
     "datacite": {
@@ -7,86 +27,66 @@
           "rightsUri": "http://www.springer.com/tdm"
         }
       ],
+      "metadataVersion": 1,
       "relations": [
         {
-          "relationType": "IsPartOf",
           "relatedIdentifier": "1074-1542",
-          "resourceTypeGeneral": "Collection",
-          "relatedIdentifierType": "ISSN"
+          "relatedIdentifierType": "ISSN",
+          "relationType": "IsPartOf",
+          "resourceTypeGeneral": "Collection"
         }
       ],
       "resourceType": "JournalArticle",
       "resourceTypeGeneral": "Text",
-      "schemaVersion": "http://datacite.org/schema/kernel-4",
-      "metadataVersion": 1
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
     },
     "release_month": 5
   },
-  "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_date": "2019-05-31",
-  "release_year": 2019,
-  "ext_ids": {
-    "doi": "10.1007/s10870-008-9413-z"
-  },
-  "volume": "38",
   "issue": "12",
   "pages": "927-930",
   "publisher": "Springer Science and Business Media LLC",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Qian-Jin Li",
-      "given_name": "Qian-Jin",
-      "surname": "Li",
-      "role": "author"
-    },
-    {
-      "index": 1,
-      "raw_name": "Chun-Long Yang",
-      "given_name": "Chun-Long",
-      "surname": "Yang",
-      "role": "author"
-    }
-  ],
   "refs": [
     {
-      "index": 0,
       "extra": {
         "doi": "10.1016/j.bmcl.2005.09.033"
-      }
+      },
+      "index": 0
     },
     {
-      "index": 1,
       "extra": {
         "doi": "10.1016/s0022-1139(02)00330-5"
-      }
+      },
+      "index": 1
     },
     {
-      "index": 2,
       "extra": {
         "doi": "10.1016/s0010-8545(01)00337-x"
-      }
+      },
+      "index": 2
     },
     {
-      "index": 3,
       "extra": {
         "doi": "10.1016/j.tetlet.2005.06.135"
-      }
+      },
+      "index": 3
     },
     {
-      "index": 4,
       "extra": {
         "doi": "10.1039/p298700000s1"
-      }
+      },
+      "index": 4
     },
     {
-      "index": 5,
       "extra": {
         "doi": "10.1002/anie.199515551"
-      }
+      },
+      "index": 5
     }
   ],
-  "abstracts": []
+  "release_date": "2019-05-31",
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 2019,
+  "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea",
+  "volume": "38"
 }
diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json
index 956357b8..9fc62db4 100644
--- a/python/tests/files/datacite/datacite_result_01.json
+++ b/python/tests/files/datacite/datacite_result_01.json
@@ -1,4 +1,17 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "given_name": "G.",
+      "index": 0,
+      "raw_name": "G. Dargenty",
+      "role": "author",
+      "surname": "Dargenty"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.11588/diglit.25558.39"
+  },
   "extra": {
     "datacite": {
       "license": [
@@ -13,24 +26,11 @@
       "schemaVersion": "http://datacite.org/schema/kernel-4"
     }
   },
-  "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_year": 1887,
-  "ext_ids": {
-    "doi": "10.11588/diglit.25558.39"
-  },
-  "publisher": "University Library Heidelberg",
   "language": "fr",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "G. Dargenty",
-      "given_name": "G.",
-      "surname": "Dargenty",
-      "role": "author"
-    }
-  ],
+  "publisher": "University Library Heidelberg",
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 1887,
+  "title": "Ferdinand Gaillard, [1]: né à Paris le 16 janvier 1834, mort à Paris le 19 janvier 1887"
 }
diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json
index 322baf59..d6b9556f 100644
--- a/python/tests/files/datacite/datacite_result_02.json
+++ b/python/tests/files/datacite/datacite_result_02.json
@@ -1,4 +1,17 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "given_name": "Albert",
+      "index": 0,
+      "raw_name": "Albert Weyersberg",
+      "role": "author",
+      "surname": "Weyersberg"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.11588/diglit.37715.57"
+  },
   "extra": {
     "datacite": {
       "license": [
@@ -17,24 +30,11 @@
       "schemaVersion": "http://datacite.org/schema/kernel-4"
     }
   },
-  "title": "Solinger Schwertschmiede-Familien, [4]",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_year": 1897,
-  "ext_ids": {
-    "doi": "10.11588/diglit.37715.57"
-  },
-  "publisher": "University Library Heidelberg",
   "language": "de",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Albert Weyersberg",
-      "given_name": "Albert",
-      "surname": "Weyersberg",
-      "role": "author"
-    }
-  ],
+  "publisher": "University Library Heidelberg",
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 1897,
+  "title": "Solinger Schwertschmiede-Familien, [4]"
 }
diff --git a/python/tests/files/datacite/datacite_result_03.json b/python/tests/files/datacite/datacite_result_03.json
index 41d8d4cd..6aa65aee 100644
--- a/python/tests/files/datacite/datacite_result_03.json
+++ b/python/tests/files/datacite/datacite_result_03.json
@@ -1,16 +1,5 @@
 {
-  "extra": {
-    "datacite": {
-      "schemaVersion": "http://datacite.org/schema/kernel-3"
-    }
-  },
-  "title": "midterm ah30903",
-  "release_type": "article",
-  "release_year": 2016,
-  "ext_ids": {
-    "doi": "10.13140/rg.2.2.30434.53446"
-  },
-  "language": "ms",
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
@@ -18,6 +7,17 @@
       "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.13140/rg.2.2.30434.53446"
+  },
+  "extra": {
+    "datacite": {
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    }
+  },
+  "language": "ms",
   "refs": [],
-  "abstracts": []
+  "release_type": "article",
+  "release_year": 2016,
+  "title": "midterm ah30903"
 }
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 0976e40e..571c3f64 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -1,4 +1,23 @@
 {
+  "abstracts": [
+    {
+      "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) → K(I) and a natural transformation [formula omitted]  (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
+      "lang": "en",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "given_name": "Marc Andre",
+      "index": 0,
+      "raw_name": "Marc Andre Nicollerat",
+      "role": "author",
+      "surname": "Nicollerat"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.14288/1.0080520"
+  },
   "extra": {
     "datacite": {
       "metadataVersion": 5,
@@ -7,30 +26,11 @@
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     }
   },
-  "title": "On chain maps inducing isomorphisms in homology",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_year": 1973,
-  "ext_ids": {
-    "doi": "10.14288/1.0080520"
-  },
-  "publisher": "University of British Columbia",
   "language": "en",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Marc Andre Nicollerat",
-      "given_name": "Marc Andre",
-      "surname": "Nicollerat",
-      "role": "author"
-    }
-  ],
+  "publisher": "University of British Columbia",
   "refs": [],
-  "abstracts": [
-    {
-      "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps.  In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X˙ ε. |KA)| can be embedded in a complex I˙ ε. |K(I)| in such a way that I˙ has the same cohomology as X˙.  In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor  J : K(A) → K(I) and a natural transformation [formula omitted]  (where E : K(I) → K(A) is the embedding functor) such that [formula omitted] is  injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open.  We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A).  In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted]  In Chapter II we study the natural homomorphism [formula omitted]   where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology.  In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
-      "mimetype": "text/plain",
-      "lang": "en"
-    }
-  ]
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 1973,
+  "title": "On chain maps inducing isomorphisms in homology"
 }
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index c4e5418d..5b7b4ed2 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -1,528 +1,508 @@
 {
-  "extra": {
-    "datacite": {
-      "license": [
-        {
-          "rights": "Attribution-NonCommercial (CC BY-NC)",
-          "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
-        }
-      ],
-      "metadataVersion": 1,
-      "resourceType": "Dataset/UNITE Species Hypothesis",
-      "resourceTypeGeneral": "Dataset",
-      "schemaVersion": "http://datacite.org/schema/kernel-3"
-    },
-    "release_month": 10
-  },
-  "title": "SH409843.07FU",
-  "subtitle": "Gomphales",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_date": "2014-10-05",
-  "release_year": 2014,
-  "ext_ids": {
-    "doi": "10.15156/bio/sh409843.07fu"
-  },
-  "publisher": "UNITE Community",
-  "language": "en",
-  "license_slug": "CC-BY-NC",
+  "abstracts": [
+    {
+      "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
+      "lang": "en",
+      "mimetype": "text/plain"
+    }
+  ],
   "contribs": [
     {
+      "given_name": "Urmas",
       "index": 0,
       "raw_name": "Urmas Kõljalg",
-      "given_name": "Urmas",
-      "surname": "Kõljalg",
-      "role": "author"
+      "role": "author",
+      "surname": "Kõljalg"
     },
     {
+      "given_name": "Kessy",
       "index": 1,
       "raw_name": "Kessy Abarenkov",
-      "given_name": "Kessy",
-      "surname": "Abarenkov",
-      "role": "author"
+      "role": "author",
+      "surname": "Abarenkov"
     },
     {
+      "given_name": "R. Henrik",
       "index": 2,
       "raw_name": "R. Henrik Nilsson",
-      "given_name": "R. Henrik",
-      "surname": "Nilsson",
-      "role": "author"
+      "role": "author",
+      "surname": "Nilsson"
     },
     {
+      "given_name": "Karl-Henrik",
       "index": 3,
       "raw_name": "Karl-Henrik Larsson",
-      "given_name": "Karl-Henrik",
-      "surname": "Larsson",
-      "role": "author"
+      "role": "author",
+      "surname": "Larsson"
     },
     {
+      "given_name": "Anders Bjørnsgard",
       "index": 4,
       "raw_name": "Anders Bjørnsgard Aas",
-      "given_name": "Anders Bjørnsgard",
-      "surname": "Aas",
-      "role": "author"
+      "role": "author",
+      "surname": "Aas"
     },
     {
+      "given_name": "Rachel",
       "index": 5,
       "raw_name": "Rachel Adams",
-      "given_name": "Rachel",
-      "surname": "Adams",
-      "role": "author"
+      "role": "author",
+      "surname": "Adams"
     },
     {
+      "given_name": "Artur",
       "index": 6,
       "raw_name": "Artur Alves",
-      "given_name": "Artur",
-      "surname": "Alves",
-      "role": "author"
+      "role": "author",
+      "surname": "Alves"
     },
     {
+      "given_name": "Joseph F.",
       "index": 7,
       "raw_name": "Joseph F. Ammirati",
-      "given_name": "Joseph F.",
-      "surname": "Ammirati",
-      "role": "author"
+      "role": "author",
+      "surname": "Ammirati"
     },
     {
+      "given_name": "A. Elizabeth",
       "index": 8,
       "raw_name": "A. Elizabeth Arnold",
-      "given_name": "A. Elizabeth",
-      "surname": "Arnold",
-      "role": "author"
+      "role": "author",
+      "surname": "Arnold"
     },
     {
+      "given_name": "Mohammad",
       "index": 9,
       "raw_name": "Mohammad Bahram",
-      "given_name": "Mohammad",
-      "surname": "Bahram",
-      "role": "author"
+      "role": "author",
+      "surname": "Bahram"
     },
     {
+      "given_name": "Johan",
       "index": 10,
       "raw_name": "Johan Bengtsson-Palme",
-      "given_name": "Johan",
-      "surname": "Bengtsson-Palme",
-      "role": "author"
+      "role": "author",
+      "surname": "Bengtsson-Palme"
     },
     {
+      "given_name": "Anna",
       "index": 11,
       "raw_name": "Anna Berlin",
-      "given_name": "Anna",
-      "surname": "Berlin",
-      "role": "author"
+      "role": "author",
+      "surname": "Berlin"
     },
     {
+      "given_name": "Synnøve",
       "index": 12,
       "raw_name": "Synnøve Botnen",
-      "given_name": "Synnøve",
-      "surname": "Botnen",
-      "role": "author"
+      "role": "author",
+      "surname": "Botnen"
     },
     {
+      "given_name": "Sarah",
       "index": 13,
       "raw_name": "Sarah Bourlat",
-      "given_name": "Sarah",
-      "surname": "Bourlat",
-      "role": "author"
+      "role": "author",
+      "surname": "Bourlat"
     },
     {
+      "given_name": "Tanya",
       "index": 14,
       "raw_name": "Tanya Cheeke",
-      "given_name": "Tanya",
-      "surname": "Cheeke",
-      "role": "author"
+      "role": "author",
+      "surname": "Cheeke"
     },
     {
+      "given_name": "Bálint",
       "index": 15,
       "raw_name": "Bálint Dima",
-      "given_name": "Bálint",
-      "surname": "Dima",
-      "role": "author"
+      "role": "author",
+      "surname": "Dima"
     },
     {
+      "given_name": "Rein",
       "index": 16,
       "raw_name": "Rein Drenkhan",
-      "given_name": "Rein",
-      "surname": "Drenkhan",
-      "role": "author"
+      "role": "author",
+      "surname": "Drenkhan"
     },
     {
+      "given_name": "Camila",
       "index": 17,
       "raw_name": "Camila Duarte",
-      "given_name": "Camila",
-      "surname": "Duarte",
-      "role": "author"
+      "role": "author",
+      "surname": "Duarte"
     },
     {
+      "given_name": "Margarita",
       "index": 18,
       "raw_name": "Margarita Dueñas",
-      "given_name": "Margarita",
-      "surname": "Dueñas",
-      "role": "author"
+      "role": "author",
+      "surname": "Dueñas"
     },
     {
+      "given_name": "Ursula",
       "index": 19,
       "raw_name": "Ursula Eberhardt",
-      "given_name": "Ursula",
-      "surname": "Eberhardt",
-      "role": "author"
+      "role": "author",
+      "surname": "Eberhardt"
     },
     {
+      "given_name": "Hanna",
       "index": 20,
       "raw_name": "Hanna Friberg",
-      "given_name": "Hanna",
-      "surname": "Friberg",
-      "role": "author"
+      "role": "author",
+      "surname": "Friberg"
     },
     {
+      "given_name": "Tobias G.",
       "index": 21,
       "raw_name": "Tobias G. Frøslev",
-      "given_name": "Tobias G.",
-      "surname": "Frøslev",
-      "role": "author"
+      "role": "author",
+      "surname": "Frøslev"
     },
     {
+      "given_name": "Sigisfredo",
       "index": 22,
       "raw_name": "Sigisfredo Garnica",
-      "given_name": "Sigisfredo",
-      "surname": "Garnica",
-      "role": "author"
+      "role": "author",
+      "surname": "Garnica"
     },
     {
+      "given_name": "József",
       "index": 23,
       "raw_name": "József Geml",
-      "given_name": "József",
-      "surname": "Geml",
-      "role": "author"
+      "role": "author",
+      "surname": "Geml"
     },
     {
+      "given_name": "Masoomeh",
       "index": 24,
       "raw_name": "Masoomeh Ghobad-Nejhad",
-      "given_name": "Masoomeh",
-      "surname": "Ghobad-Nejhad",
-      "role": "author"
+      "role": "author",
+      "surname": "Ghobad-Nejhad"
     },
     {
+      "given_name": "Tine",
       "index": 25,
       "raw_name": "Tine Grebenc",
-      "given_name": "Tine",
-      "surname": "Grebenc",
-      "role": "author"
+      "role": "author",
+      "surname": "Grebenc"
     },
     {
+      "given_name": "Gareth W.",
       "index": 26,
       "raw_name": "Gareth W. Griffith",
-      "given_name": "Gareth W.",
-      "surname": "Griffith",
-      "role": "author"
+      "role": "author",
+      "surname": "Griffith"
     },
     {
+      "given_name": "Felix",
       "index": 27,
       "raw_name": "Felix Hampe",
-      "given_name": "Felix",
-      "surname": "Hampe",
-      "role": "author"
+      "role": "author",
+      "surname": "Hampe"
     },
     {
+      "given_name": "Peter",
       "index": 28,
       "raw_name": "Peter Kennedy",
-      "given_name": "Peter",
-      "surname": "Kennedy",
-      "role": "author"
+      "role": "author",
+      "surname": "Kennedy"
     },
     {
+      "given_name": "Maryia",
       "index": 29,
       "raw_name": "Maryia Khomich",
-      "given_name": "Maryia",
-      "surname": "Khomich",
-      "role": "author"
+      "role": "author",
+      "surname": "Khomich"
     },
     {
+      "given_name": "Petr",
       "index": 30,
       "raw_name": "Petr Kohout",
-      "given_name": "Petr",
-      "surname": "Kohout",
-      "role": "author"
+      "role": "author",
+      "surname": "Kohout"
     },
     {
+      "given_name": "Anu",
       "index": 31,
       "raw_name": "Anu Kollom",
-      "given_name": "Anu",
-      "surname": "Kollom",
-      "role": "author"
+      "role": "author",
+      "surname": "Kollom"
     },
     {
+      "given_name": "Ellen",
       "index": 32,
       "raw_name": "Ellen Larsson",
-      "given_name": "Ellen",
-      "surname": "Larsson",
-      "role": "author"
+      "role": "author",
+      "surname": "Larsson"
     },
     {
+      "given_name": "Irinyi",
       "index": 33,
       "raw_name": "Irinyi Laszlo",
-      "given_name": "Irinyi",
-      "surname": "Laszlo",
-      "role": "author"
+      "role": "author",
+      "surname": "Laszlo"
     },
     {
+      "given_name": "Steven",
       "index": 34,
       "raw_name": "Steven Leavitt",
-      "given_name": "Steven",
-      "surname": "Leavitt",
-      "role": "author"
+      "role": "author",
+      "surname": "Leavitt"
     },
     {
+      "given_name": "Kare",
       "index": 35,
       "raw_name": "Kare Liimatainen",
-      "given_name": "Kare",
-      "surname": "Liimatainen",
-      "role": "author"
+      "role": "author",
+      "surname": "Liimatainen"
     },
     {
+      "given_name": "Björn",
       "index": 36,
       "raw_name": "Björn Lindahl",
-      "given_name": "Björn",
-      "surname": "Lindahl",
-      "role": "author"
+      "role": "author",
+      "surname": "Lindahl"
     },
     {
+      "given_name": "Deborah J.",
       "index": 37,
       "raw_name": "Deborah J. Lodge",
-      "given_name": "Deborah J.",
-      "surname": "Lodge",
-      "role": "author"
+      "role": "author",
+      "surname": "Lodge"
     },
     {
+      "given_name": "Helge Thorsten",
       "index": 38,
       "raw_name": "Helge Thorsten Lumbsch",
-      "given_name": "Helge Thorsten",
-      "surname": "Lumbsch",
-      "role": "author"
+      "role": "author",
+      "surname": "Lumbsch"
     },
     {
+      "given_name": "María Paz",
       "index": 39,
       "raw_name": "María Paz Martín Esteban",
-      "given_name": "María Paz",
-      "surname": "Martín Esteban",
-      "role": "author"
+      "role": "author",
+      "surname": "Martín Esteban"
     },
     {
+      "given_name": "Wieland",
       "index": 40,
       "raw_name": "Wieland Meyer",
-      "given_name": "Wieland",
-      "surname": "Meyer",
-      "role": "author"
+      "role": "author",
+      "surname": "Meyer"
     },
     {
+      "given_name": "Otto",
       "index": 41,
       "raw_name": "Otto Miettinen",
-      "given_name": "Otto",
-      "surname": "Miettinen",
-      "role": "author"
+      "role": "author",
+      "surname": "Miettinen"
     },
     {
+      "given_name": "Nhu",
       "index": 42,
       "raw_name": "Nhu Nguyen",
-      "given_name": "Nhu",
-      "surname": "Nguyen",
-      "role": "author"
+      "role": "author",
+      "surname": "Nguyen"
     },
     {
+      "given_name": "Tuula",
       "index": 43,
       "raw_name": "Tuula Niskanen",
-      "given_name": "Tuula",
-      "surname": "Niskanen",
-      "role": "author"
+      "role": "author",
+      "surname": "Niskanen"
     },
     {
+      "given_name": "Ryoko",
       "index": 44,
       "raw_name": "Ryoko Oono",
-      "given_name": "Ryoko",
-      "surname": "Oono",
-      "role": "author"
+      "role": "author",
+      "surname": "Oono"
     },
     {
+      "given_name": "Maarja",
       "index": 45,
       "raw_name": "Maarja Öpik",
-      "given_name": "Maarja",
-      "surname": "Öpik",
-      "role": "author"
+      "role": "author",
+      "surname": "Öpik"
     },
     {
+      "given_name": "Alexander",
       "index": 46,
       "raw_name": "Alexander Ordynets",
-      "given_name": "Alexander",
-      "surname": "Ordynets",
-      "role": "author"
+      "role": "author",
+      "surname": "Ordynets"
     },
     {
+      "given_name": "Julia",
       "index": 47,
       "raw_name": "Julia Pawłowska",
-      "given_name": "Julia",
-      "surname": "Pawłowska",
-      "role": "author"
+      "role": "author",
+      "surname": "Pawłowska"
     },
     {
+      "given_name": "Ursula",
       "index": 48,
       "raw_name": "Ursula Peintner",
-      "given_name": "Ursula",
-      "surname": "Peintner",
-      "role": "author"
+      "role": "author",
+      "surname": "Peintner"
     },
     {
+      "given_name": "Olinto Liparini",
       "index": 49,
       "raw_name": "Olinto Liparini Pereira",
-      "given_name": "Olinto Liparini",
-      "surname": "Pereira",
-      "role": "author"
+      "role": "author",
+      "surname": "Pereira"
     },
     {
+      "given_name": "Danilo Batista",
       "index": 50,
       "raw_name": "Danilo Batista Pinho",
-      "given_name": "Danilo Batista",
-      "surname": "Pinho",
-      "role": "author"
+      "role": "author",
+      "surname": "Pinho"
     },
     {
+      "given_name": "Kadri",
       "index": 51,
       "raw_name": "Kadri Põldmaa",
-      "given_name": "Kadri",
-      "surname": "Põldmaa",
-      "role": "author"
+      "role": "author",
+      "surname": "Põldmaa"
     },
     {
+      "given_name": "Kadri",
       "index": 52,
       "raw_name": "Kadri Runnel",
-      "given_name": "Kadri",
-      "surname": "Runnel",
-      "role": "author"
+      "role": "author",
+      "surname": "Runnel"
     },
     {
+      "given_name": "Martin",
       "index": 53,
       "raw_name": "Martin Ryberg",
-      "given_name": "Martin",
-      "surname": "Ryberg",
-      "role": "author"
+      "role": "author",
+      "surname": "Ryberg"
     },
     {
+      "given_name": "Irja",
       "index": 54,
       "raw_name": "Irja Saar",
-      "given_name": "Irja",
-      "surname": "Saar",
-      "role": "author"
+      "role": "author",
+      "surname": "Saar"
     },
     {
+      "given_name": "Kemal",
       "index": 55,
       "raw_name": "Kemal Sanli",
-      "given_name": "Kemal",
-      "surname": "Sanli",
-      "role": "author"
+      "role": "author",
+      "surname": "Sanli"
     },
     {
+      "given_name": "James",
       "index": 56,
       "raw_name": "James Scott",
-      "given_name": "James",
-      "surname": "Scott",
-      "role": "author"
+      "role": "author",
+      "surname": "Scott"
     },
     {
+      "given_name": "Viacheslav",
       "index": 57,
       "raw_name": "Viacheslav Spirin",
-      "given_name": "Viacheslav",
-      "surname": "Spirin",
-      "role": "author"
+      "role": "author",
+      "surname": "Spirin"
     },
     {
+      "given_name": "Ave",
       "index": 58,
       "raw_name": "Ave Suija",
-      "given_name": "Ave",
-      "surname": "Suija",
-      "role": "author"
+      "role": "author",
+      "surname": "Suija"
     },
     {
+      "given_name": "Sten",
       "index": 59,
       "raw_name": "Sten Svantesson",
-      "given_name": "Sten",
-      "surname": "Svantesson",
-      "role": "author"
+      "role": "author",
+      "surname": "Svantesson"
     },
     {
+      "given_name": "Mariusz",
       "index": 60,
       "raw_name": "Mariusz Tadych",
-      "given_name": "Mariusz",
-      "surname": "Tadych",
-      "role": "author"
+      "role": "author",
+      "surname": "Tadych"
     },
     {
+      "given_name": "Susumu",
       "index": 61,
       "raw_name": "Susumu Takamatsu",
-      "given_name": "Susumu",
-      "surname": "Takamatsu",
-      "role": "author"
+      "role": "author",
+      "surname": "Takamatsu"
     },
     {
+      "given_name": "Heidi",
       "index": 62,
       "raw_name": "Heidi Tamm",
-      "given_name": "Heidi",
-      "surname": "Tamm",
-      "role": "author"
+      "role": "author",
+      "surname": "Tamm"
     },
     {
+      "given_name": "AFS.",
       "index": 63,
       "raw_name": "AFS. Taylor",
-      "given_name": "AFS.",
-      "surname": "Taylor",
-      "role": "author"
+      "role": "author",
+      "surname": "Taylor"
     },
     {
+      "given_name": "Leho",
       "index": 64,
       "raw_name": "Leho Tedersoo",
-      "given_name": "Leho",
-      "surname": "Tedersoo",
-      "role": "author"
+      "role": "author",
+      "surname": "Tedersoo"
     },
     {
+      "given_name": "M.T.",
       "index": 65,
       "raw_name": "M.T. Telleria",
-      "given_name": "M.T.",
-      "surname": "Telleria",
-      "role": "author"
+      "role": "author",
+      "surname": "Telleria"
     },
     {
+      "given_name": "Dhanushka",
       "index": 66,
       "raw_name": "Dhanushka Udayanga",
-      "given_name": "Dhanushka",
-      "surname": "Udayanga",
-      "role": "author"
+      "role": "author",
+      "surname": "Udayanga"
     },
     {
+      "given_name": "Martin",
       "index": 67,
       "raw_name": "Martin Unterseher",
-      "given_name": "Martin",
-      "surname": "Unterseher",
-      "role": "author"
+      "role": "author",
+      "surname": "Unterseher"
     },
     {
+      "given_name": "Sergey",
       "index": 68,
       "raw_name": "Sergey Volobuev",
-      "given_name": "Sergey",
-      "surname": "Volobuev",
-      "role": "author"
+      "role": "author",
+      "surname": "Volobuev"
     },
     {
+      "given_name": "Michael",
       "index": 69,
       "raw_name": "Michael Weiss",
-      "given_name": "Michael",
-      "surname": "Weiss",
-      "role": "author"
+      "role": "author",
+      "surname": "Weiss"
     },
     {
+      "given_name": "Christian",
       "index": 70,
       "raw_name": "Christian Wurzbacher",
-      "given_name": "Christian",
-      "surname": "Wurzbacher",
-      "role": "author"
+      "role": "author",
+      "surname": "Wurzbacher"
     },
     {
       "raw_name": "Kessy Abarenkov"
@@ -531,12 +511,32 @@
       "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
     }
   ],
+  "ext_ids": {
+    "doi": "10.15156/bio/sh409843.07fu"
+  },
+  "extra": {
+    "datacite": {
+      "license": [
+        {
+          "rights": "Attribution-NonCommercial (CC BY-NC)",
+          "rightsUri": "http://creativecommons.org/licenses/by-nc/4.0"
+        }
+      ],
+      "metadataVersion": 1,
+      "resourceType": "Dataset/UNITE Species Hypothesis",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    },
+    "release_month": 10
+  },
+  "language": "en",
+  "license_slug": "CC-BY-NC",
+  "publisher": "UNITE Community",
   "refs": [],
-  "abstracts": [
-    {
-      "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
-      "mimetype": "text/plain",
-      "lang": "en"
-    }
-  ]
+  "release_date": "2014-10-05",
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2014,
+  "subtitle": "Gomphales",
+  "title": "SH409843.07FU"
 }
diff --git a/python/tests/files/datacite/datacite_result_06.json b/python/tests/files/datacite/datacite_result_06.json
index 18880100..4f6cae94 100644
--- a/python/tests/files/datacite/datacite_result_06.json
+++ b/python/tests/files/datacite/datacite_result_06.json
@@ -1,4 +1,15 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Crispijn De Passe (Der Ältere) (1564-1637)",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.16903/ethz-grs-d_006220"
+  },
   "extra": {
     "datacite": {
       "license": [
@@ -11,19 +22,8 @@
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     }
   },
-  "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\"",
+  "refs": [],
   "release_type": "article",
   "release_year": 1590,
-  "ext_ids": {
-    "doi": "10.16903/ethz-grs-d_006220"
-  },
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Crispijn De Passe (Der Ältere) (1564-1637)",
-      "role": "author"
-    }
-  ],
-  "refs": [],
-  "abstracts": []
+  "title": "Der Eifer (Sedulitas), Blatt 7 der Folge \"Die Tugenden\""
 }
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
index 23b63d50..2f500925 100644
--- a/python/tests/files/datacite/datacite_result_07.json
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -1,6 +1,46 @@
 {
+  "abstracts": [
+    {
+      "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
+      "lang": "en",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "given_name": "E.",
+      "index": 0,
+      "raw_name": "E. ROTHUIZEN",
+      "role": "author",
+      "surname": "ROTHUIZEN"
+    },
+    {
+      "given_name": "B.",
+      "index": 1,
+      "raw_name": "B. ELMEGAARD",
+      "role": "author",
+      "surname": "ELMEGAARD"
+    },
+    {
+      "given_name": "B.",
+      "index": 2,
+      "raw_name": "B. MARKUSSEN W.",
+      "role": "author",
+      "surname": "MARKUSSEN W."
+    },
+    {
+      "index": 3,
+      "raw_name": "Et Al.",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.18462/iir.icr.2015.0926"
+  },
   "extra": {
     "datacite": {
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset",
       "subjects": [
         {
           "subject": "HEAT PUMP"
@@ -23,54 +63,14 @@
         {
           "subject": "MODEL"
         }
-      ],
-      "resourceType": "Dataset",
-      "resourceTypeGeneral": "Dataset"
+      ]
     }
   },
-  "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation.",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_year": 2015,
-  "ext_ids": {
-    "doi": "10.18462/iir.icr.2015.0926"
-  },
-  "publisher": "International Institute of Refrigeration (IIR)",
   "language": "en",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "E. ROTHUIZEN",
-      "given_name": "E.",
-      "surname": "ROTHUIZEN",
-      "role": "author"
-    },
-    {
-      "index": 1,
-      "raw_name": "B. ELMEGAARD",
-      "given_name": "B.",
-      "surname": "ELMEGAARD",
-      "role": "author"
-    },
-    {
-      "index": 2,
-      "raw_name": "B. MARKUSSEN W.",
-      "given_name": "B.",
-      "surname": "MARKUSSEN W.",
-      "role": "author"
-    },
-    {
-      "index": 3,
-      "raw_name": "Et Al.",
-      "role": "author"
-    }
-  ],
+  "publisher": "International Institute of Refrigeration (IIR)",
   "refs": [],
-  "abstracts": [
-    {
-      "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
-      "mimetype": "text/plain",
-      "lang": "en"
-    }
-  ]
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2015,
+  "title": "High efficient heat pump system using storage tanks to increase cop by means of the ISEC concept. 1: model validation."
 }
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index ff942d0a..70237280 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -1,6 +1,35 @@
 {
+  "abstracts": [
+    {
+      "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
+      "lang": "en",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "given_name": "Kei",
+      "index": 0,
+      "raw_name": "Kei Kajisa",
+      "role": "author",
+      "surname": "Kajisa"
+    },
+    {
+      "given_name": "Kei",
+      "index": 1,
+      "raw_name": "Kei Kajisa",
+      "role": "author",
+      "surname": "Kajisa"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.22004/ag.econ.284864"
+  },
   "extra": {
     "datacite": {
+      "metadataVersion": 1,
+      "resourceType": "Text",
+      "resourceTypeGeneral": "Text",
       "subjects": [
         {
           "subject": "Land Economics/Use"
@@ -17,41 +46,12 @@
           "subject": "collective action",
           "subjectScheme": "keyword"
         }
-      ],
-      "metadataVersion": 1,
-      "resourceType": "Text",
-      "resourceTypeGeneral": "Text"
+      ]
     }
   },
-  "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India",
-  "release_type": "article-journal",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.22004/ag.econ.284864"
-  },
   "language": "en",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Kei Kajisa",
-      "given_name": "Kei",
-      "surname": "Kajisa",
-      "role": "author"
-    },
-    {
-      "index": 1,
-      "raw_name": "Kei Kajisa",
-      "given_name": "Kei",
-      "surname": "Kajisa",
-      "role": "author"
-    }
-  ],
   "refs": [],
-  "abstracts": [
-    {
-      "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
-      "mimetype": "text/plain",
-      "lang": "en"
-    }
-  ]
+  "release_type": "article-journal",
+  "release_year": 2017,
+  "title": "Irrigation Policies under Rapid Industrialization and Labor Migration: Lessons from Japan, China and India"
 }
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index c93dc769..79571360 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -1,37 +1,12 @@
 {
-  "extra": {
-    "datacite": {
-      "subjects": [
-        {
-          "subject": "Direktdiodenlasersysteme"
-        },
-        {
-          "subject": "Physics",
-          "subjectScheme": "linsearch"
-        }
-      ],
-      "metadataVersion": 9,
-      "resourceType": "Report",
-      "resourceTypeGeneral": "Text",
-      "schemaVersion": "http://datacite.org/schema/kernel-4"
-    }
-  },
-  "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht",
-  "release_type": "report",
-  "release_stage": "published",
-  "release_year": 2016,
-  "ext_ids": {
-    "doi": "10.2314/gbv:880813733"
-  },
-  "publisher": "[Lumics GmbH]",
-  "language": "de",
+  "abstracts": [],
   "contribs": [
     {
+      "given_name": "Nils",
       "index": 0,
       "raw_name": "Nils Kirstaedter",
-      "given_name": "Nils",
-      "surname": "Kirstaedter",
-      "role": "author"
+      "role": "author",
+      "surname": "Kirstaedter"
     },
     {
       "extra": {
@@ -39,13 +14,38 @@
       }
     },
     {
-      "raw_name": "Technische Informationsbibliothek (TIB)",
       "extra": {
         "type": "DataManager"
-      }
+      },
+      "raw_name": "Technische Informationsbibliothek (TIB)"
     }
   ],
+  "ext_ids": {
+    "doi": "10.2314/gbv:880813733"
+  },
+  "extra": {
+    "datacite": {
+      "metadataVersion": 9,
+      "resourceType": "Report",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-4",
+      "subjects": [
+        {
+          "subject": "Direktdiodenlasersysteme"
+        },
+        {
+          "subject": "Physics",
+          "subjectScheme": "linsearch"
+        }
+      ]
+    }
+  },
+  "language": "de",
+  "publisher": "[Lumics GmbH]",
   "refs": [],
-  "abstracts": [],
+  "release_stage": "published",
+  "release_type": "report",
+  "release_year": 2016,
+  "title": "BrightLas : TP3.3. Module für Direktdiodenstrahlquellen bis 4kW und Untersuchungen zur Leistungsskalierung (Diodemodul) : zum Verbundvorhaben Direktdiodenlaseranlagen und -systeme (VP3) im Förderschwerpunkt innovative regionale Wachstumskerne, BMBF : Abschlussbericht",
   "version": "1.0"
 }
diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json
index 8dea8957..1d39feb0 100644
--- a/python/tests/files/datacite/datacite_result_10.json
+++ b/python/tests/files/datacite/datacite_result_10.json
@@ -1,6 +1,20 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Unknown",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.25549/wpacards-m6171"
+  },
   "extra": {
     "datacite": {
+      "resourceType": "Dataset",
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-4",
       "subjects": [
         {
           "subject": "housing areas"
@@ -8,28 +22,14 @@
         {
           "subject": "Dwellings"
         }
-      ],
-      "resourceType": "Dataset",
-      "resourceTypeGeneral": "Dataset",
-      "schemaVersion": "http://datacite.org/schema/kernel-4"
+      ]
     }
   },
-  "title": "WPA household census for 210 E VERNON, Los Angeles",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_year": 2012,
-  "ext_ids": {
-    "doi": "10.25549/wpacards-m6171"
-  },
-  "publisher": "University of Southern California Digital Library (USC.DL)",
   "language": "en",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Unknown",
-      "role": "author"
-    }
-  ],
+  "publisher": "University of Southern California Digital Library (USC.DL)",
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2012,
+  "title": "WPA household census for 210 E VERNON, Los Angeles"
 }
diff --git a/python/tests/files/datacite/datacite_result_11.json b/python/tests/files/datacite/datacite_result_11.json
index 944ca718..761a99c9 100644
--- a/python/tests/files/datacite/datacite_result_11.json
+++ b/python/tests/files/datacite/datacite_result_11.json
@@ -1,4 +1,15 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Comet Photo AG (Zürich)",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.3932/ethz-a-000055869"
+  },
   "extra": {
     "datacite": {
       "metadataVersion": 6,
@@ -6,22 +17,11 @@
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     }
   },
-  "title": "N1 bei Safenwil",
-  "release_type": "graphic",
-  "release_stage": "published",
-  "release_year": 1965,
-  "ext_ids": {
-    "doi": "10.3932/ethz-a-000055869"
-  },
-  "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
   "language": "de",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Comet Photo AG (Zürich)",
-      "role": "author"
-    }
-  ],
+  "publisher": "ETH-Bibliothek Zürich, Bildarchiv",
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "graphic",
+  "release_year": 1965,
+  "title": "N1 bei Safenwil"
 }
diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json
index 6977ecea..4e966d6c 100644
--- a/python/tests/files/datacite/datacite_result_12.json
+++ b/python/tests/files/datacite/datacite_result_12.json
@@ -1,49 +1,49 @@
 {
-  "extra": {
-    "datacite": {
-      "resourceTypeGeneral": "Text"
-    },
-    "release_month": 6
-  },
-  "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_date": "2019-06-14",
-  "release_year": 2019,
-  "ext_ids": {
-    "doi": "10.5167/uzh-171449"
-  },
-  "publisher": "MDPI Publishing",
+  "abstracts": [],
   "contribs": [
     {
+      "given_name": "Charalampos",
       "index": 0,
       "raw_name": "Charalampos Spanias",
-      "given_name": "Charalampos",
-      "surname": "Spanias",
-      "role": "author"
+      "role": "author",
+      "surname": "Spanias"
     },
     {
+      "given_name": "Pantelis T",
       "index": 1,
       "raw_name": "Pantelis T Nikolaidis",
-      "given_name": "Pantelis T",
-      "surname": "Nikolaidis",
-      "role": "author"
+      "role": "author",
+      "surname": "Nikolaidis"
     },
     {
+      "given_name": "Thomas",
       "index": 2,
       "raw_name": "Thomas Rosemann",
-      "given_name": "Thomas",
-      "surname": "Rosemann",
-      "role": "author"
+      "role": "author",
+      "surname": "Rosemann"
     },
     {
+      "given_name": "Beat",
       "index": 3,
       "raw_name": "Beat Knechtle",
-      "given_name": "Beat",
-      "surname": "Knechtle",
-      "role": "author"
+      "role": "author",
+      "surname": "Knechtle"
     }
   ],
+  "ext_ids": {
+    "doi": "10.5167/uzh-171449"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "Text"
+    },
+    "release_month": 6
+  },
+  "publisher": "MDPI Publishing",
   "refs": [],
-  "abstracts": []
+  "release_date": "2019-06-14",
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 2019,
+  "title": "Anthropometric and Physiological Profile of Mixed Martial Art Athletes: A Brief Review"
 }
diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json
index 91126c5a..923f2ea8 100644
--- a/python/tests/files/datacite/datacite_result_13.json
+++ b/python/tests/files/datacite/datacite_result_13.json
@@ -1,22 +1,5 @@
 {
-  "extra": {
-    "datacite": {
-      "metadataVersion": 17,
-      "resourceType": "Journal Article",
-      "resourceTypeGeneral": "Text",
-      "schemaVersion": "http://datacite.org/schema/kernel-3"
-    },
-    "release_month": 10
-  },
-  "title": "[Müssen wir des Glücks uns schämen?]",
-  "release_type": "article-journal",
-  "release_stage": "published",
-  "release_date": "1940-10-05",
-  "release_year": 1940,
-  "ext_ids": {
-    "doi": "10.5169/seals-314104"
-  },
-  "publisher": "Buchdruckerei Büchler & Co.",
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
@@ -24,13 +7,30 @@
       "role": "author"
     },
     {
+      "given_name": "Hermann",
       "index": 1,
       "raw_name": "Hermann Hiltbrunner",
-      "given_name": "Hermann",
-      "surname": "Hiltbrunner",
-      "role": "author"
+      "role": "author",
+      "surname": "Hiltbrunner"
     }
   ],
+  "ext_ids": {
+    "doi": "10.5169/seals-314104"
+  },
+  "extra": {
+    "datacite": {
+      "metadataVersion": 17,
+      "resourceType": "Journal Article",
+      "resourceTypeGeneral": "Text",
+      "schemaVersion": "http://datacite.org/schema/kernel-3"
+    },
+    "release_month": 10
+  },
+  "publisher": "Buchdruckerei Büchler & Co.",
   "refs": [],
-  "abstracts": []
+  "release_date": "1940-10-05",
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "release_year": 1940,
+  "title": "[Müssen wir des Glücks uns schämen?]"
 }
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index 20f6bfd4..2ce68d29 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -1,114 +1,114 @@
 {
-  "extra": {
-    "datacite": {
-      "subjects": [
-        {
-          "subject": "Crystal Structure"
-        },
-        {
-          "subject": "Experimental 3D Coordinates"
-        },
-        {
-          "subject": "Crystal System"
-        },
-        {
-          "subject": "Space Group"
-        },
-        {
-          "subject": "Cell Parameters"
-        },
-        {
-          "subject": "Crystallography"
-        },
-        {
-          "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
-        }
-      ],
-      "relations": [
-        {
-          "relationType": "IsSupplementTo",
-          "relatedIdentifier": "10.1021/ic034699w",
-          "relatedIdentifierType": "DOI"
-        }
-      ],
-      "metadataVersion": 2,
-      "resourceTypeGeneral": "Dataset",
-      "schemaVersion": "http://datacite.org/schema/kernel-3"
+  "abstracts": [
+    {
+      "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
+      "lang": "en",
+      "mimetype": "text/plain"
     }
-  },
-  "title": "CCDC 222635: Experimental Crystal Structure Determination",
-  "release_type": "entry",
-  "release_stage": "published",
-  "release_year": 2004,
-  "ext_ids": {
-    "doi": "10.5517/cc7gns3"
-  },
-  "publisher": "Cambridge Crystallographic Data Centre",
-  "language": "en",
+  ],
   "contribs": [
     {
+      "given_name": "E.",
       "index": 0,
       "raw_name": "E. Stulz",
-      "given_name": "E.",
-      "surname": "Stulz",
-      "role": "author"
+      "role": "author",
+      "surname": "Stulz"
     },
     {
+      "given_name": "S.M.",
       "index": 1,
       "raw_name": "S.M. Scott",
-      "given_name": "S.M.",
-      "surname": "Scott",
-      "role": "author"
+      "role": "author",
+      "surname": "Scott"
     },
     {
+      "given_name": "Yiu-Fai",
       "index": 2,
       "raw_name": "Yiu-Fai Ng",
-      "given_name": "Yiu-Fai",
-      "surname": "Ng",
-      "role": "author"
+      "role": "author",
+      "surname": "Ng"
     },
     {
+      "given_name": "A.D.",
       "index": 3,
       "raw_name": "A.D. Bond",
-      "given_name": "A.D.",
-      "surname": "Bond",
-      "role": "author"
+      "role": "author",
+      "surname": "Bond"
     },
     {
+      "given_name": "S.J.",
       "index": 4,
       "raw_name": "S.J. Teat",
-      "given_name": "S.J.",
-      "surname": "Teat",
-      "role": "author"
+      "role": "author",
+      "surname": "Teat"
     },
     {
+      "given_name": "S.L.",
       "index": 5,
       "raw_name": "S.L. Darling",
-      "given_name": "S.L.",
-      "surname": "Darling",
-      "role": "author"
+      "role": "author",
+      "surname": "Darling"
     },
     {
+      "given_name": "N.",
       "index": 6,
       "raw_name": "N. Feeder",
-      "given_name": "N.",
-      "surname": "Feeder",
-      "role": "author"
+      "role": "author",
+      "surname": "Feeder"
     },
     {
+      "given_name": "J.K.M.",
       "index": 7,
       "raw_name": "J.K.M. Sanders",
-      "given_name": "J.K.M.",
-      "surname": "Sanders",
-      "role": "author"
+      "role": "author",
+      "surname": "Sanders"
     }
   ],
-  "refs": [],
-  "abstracts": [
-    {
-      "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
-      "mimetype": "text/plain",
-      "lang": "en"
+  "ext_ids": {
+    "doi": "10.5517/cc7gns3"
+  },
+  "extra": {
+    "datacite": {
+      "metadataVersion": 2,
+      "relations": [
+        {
+          "relatedIdentifier": "10.1021/ic034699w",
+          "relatedIdentifierType": "DOI",
+          "relationType": "IsSupplementTo"
+        }
+      ],
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-3",
+      "subjects": [
+        {
+          "subject": "Crystal Structure"
+        },
+        {
+          "subject": "Experimental 3D Coordinates"
+        },
+        {
+          "subject": "Crystal System"
+        },
+        {
+          "subject": "Space Group"
+        },
+        {
+          "subject": "Cell Parameters"
+        },
+        {
+          "subject": "Crystallography"
+        },
+        {
+          "subject": "bis(mu~2~-5-(3,5-Di-t-butylphenyl)-15-(4-(2-(diphenylphosphino)ethynyl)phenyl)-2,8,12,18-tetrahexyl-3,7,13,17-tetramethylporphyrinato)-(5,15-bis(3,5-di-t-butylphenyl)-2,8,12,18-tetraethyl-3,7,13,17-tetramethylporphyrinato)-di-nickel-ruthenium chloroform solvate"
+        }
+      ]
     }
-  ]
+  },
+  "language": "en",
+  "publisher": "Cambridge Crystallographic Data Centre",
+  "refs": [],
+  "release_stage": "published",
+  "release_type": "entry",
+  "release_year": 2004,
+  "title": "CCDC 222635: Experimental Crystal Structure Determination"
 }
diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json
index 3a03dfb6..5e7180c4 100644
--- a/python/tests/files/datacite/datacite_result_15.json
+++ b/python/tests/files/datacite/datacite_result_15.json
@@ -1,4 +1,17 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "given_name": "David",
+      "index": 0,
+      "raw_name": "David Richardson",
+      "role": "author",
+      "surname": "Richardson"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28"
+  },
   "extra": {
     "datacite": {
       "metadataVersion": 1,
@@ -7,23 +20,10 @@
       "schemaVersion": "http://datacite.org/schema/kernel-2.2"
     }
   },
-  "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.6073/pasta/95296d8416aae24f3d39b4ecb27f0b28"
-  },
   "publisher": "Environmental Data Initiative",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "David Richardson",
-      "given_name": "David",
-      "surname": "Richardson",
-      "role": "author"
-    }
-  ],
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2017,
+  "title": "Parramore Island of the Virginia Coast Reserve Permanent Plot Resurvey: Tree data 1997"
 }
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
index 8cf762b6..dc9d18af 100644
--- a/python/tests/files/datacite/datacite_result_16.json
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -1,4 +1,17 @@
 {
+  "abstracts": [],
+  "contribs": [
+    {
+      "given_name": "Taha",
+      "index": 0,
+      "raw_name": "Taha Sochi",
+      "role": "author",
+      "surname": "Sochi"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.6084/m9.figshare.1282478"
+  },
   "extra": {
     "datacite": {
       "license": [
@@ -12,23 +25,10 @@
       "schemaVersion": "http://datacite.org/schema/kernel-3"
     }
   },
-  "title": "Testing the Connectivity of Networks",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_year": 2014,
-  "ext_ids": {
-    "doi": "10.6084/m9.figshare.1282478"
-  },
   "publisher": "Figshare",
-  "contribs": [
-    {
-      "index": 0,
-      "raw_name": "Taha Sochi",
-      "given_name": "Taha",
-      "surname": "Sochi",
-      "role": "author"
-    }
-  ],
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2014,
+  "title": "Testing the Connectivity of Networks"
 }
diff --git a/python/tests/files/datacite/datacite_result_17.json b/python/tests/files/datacite/datacite_result_17.json
index 6e8c4e34..0f768179 100644
--- a/python/tests/files/datacite/datacite_result_17.json
+++ b/python/tests/files/datacite/datacite_result_17.json
@@ -1,18 +1,5 @@
 {
-  "extra": {
-    "datacite": {
-      "resourceTypeGeneral": "Dataset",
-      "schemaVersion": "http://datacite.org/schema/kernel-4"
-    }
-  },
-  "title": "gel_BSA-FITC_Markov_segmntation0343.tif",
-  "release_type": "dataset",
-  "release_stage": "published",
-  "release_year": 2018,
-  "ext_ids": {
-    "doi": "10.7910/dvn/tsqfwc/yytj22"
-  },
-  "publisher": "Harvard Dataverse",
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
@@ -20,6 +7,19 @@
       "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.7910/dvn/tsqfwc/yytj22"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "Dataset",
+      "schemaVersion": "http://datacite.org/schema/kernel-4"
+    }
+  },
+  "publisher": "Harvard Dataverse",
   "refs": [],
-  "abstracts": []
+  "release_stage": "published",
+  "release_type": "dataset",
+  "release_year": 2018,
+  "title": "gel_BSA-FITC_Markov_segmntation0343.tif"
 }
diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json
index 6e69bad2..7f2d2792 100644
--- a/python/tests/files/datacite/datacite_result_18.json
+++ b/python/tests/files/datacite/datacite_result_18.json
@@ -1,4 +1,9 @@
 {
+  "abstracts": [],
+  "contribs": [],
+  "ext_ids": {
+    "doi": "10.7916/d81z522m"
+  },
   "extra": {
     "datacite": {
       "metadataVersion": 2,
@@ -6,16 +11,11 @@
     },
     "release_month": 8
   },
-  "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-21",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d81z522m"
-  },
   "publisher": "Columbia University",
-  "contribs": [],
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-21",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "title": "Eastern questionnaire, answer sheet for Interviewee 53215, page 064"
 }
diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json
index 2f2f217e..4ff00a56 100644
--- a/python/tests/files/datacite/datacite_result_19.json
+++ b/python/tests/files/datacite/datacite_result_19.json
@@ -1,4 +1,9 @@
 {
+  "abstracts": [],
+  "contribs": [],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
   "extra": {
     "datacite": {
       "metadataVersion": 3,
@@ -6,16 +11,11 @@
     },
     "release_month": 8
   },
-  "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
   "publisher": "Columbia University",
-  "contribs": [],
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "title": "Eastern questionnaire, answer sheet for Interviewee 55236, page 092"
 }
diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json
index 0f99e2a2..5a6d3473 100644
--- a/python/tests/files/datacite/datacite_result_20.json
+++ b/python/tests/files/datacite/datacite_result_20.json
@@ -1,17 +1,17 @@
 {
+  "abstracts": [],
+  "contribs": [],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
   "extra": {
     "datacite": {},
     "release_month": 8
   },
-  "title": "<h1>Eastern questionnaire</h1>",
-  "release_type": "article",
-  "release_stage": "published",
+  "refs": [],
   "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
   "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
-  "contribs": [],
-  "refs": [],
-  "abstracts": []
+  "title": "<h1>Eastern questionnaire</h1>"
 }
diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json
index 3dfcf1bf..54c22538 100644
--- a/python/tests/files/datacite/datacite_result_21.json
+++ b/python/tests/files/datacite/datacite_result_21.json
@@ -1,18 +1,18 @@
 {
+  "abstracts": [],
+  "contribs": [],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
   "extra": {
     "datacite": {},
     "release_month": 8
   },
-  "title": "ABC",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
   "language": "de",
-  "contribs": [],
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "title": "ABC"
 }
diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json
index bd88c358..913fbbb6 100644
--- a/python/tests/files/datacite/datacite_result_22.json
+++ b/python/tests/files/datacite/datacite_result_22.json
@@ -1,25 +1,25 @@
 {
-  "extra": {
-    "datacite": {},
-    "release_month": 8
-  },
-  "title": "ABC",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
-  "language": "de",
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
+      "raw_affiliation": "Department of pataphysics",
       "raw_name": "Anton Welch",
-      "role": "author",
-      "raw_affiliation": "Department of pataphysics"
+      "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
+  "language": "de",
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "title": "ABC"
 }
diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json
index e82925af..0ac6a06d 100644
--- a/python/tests/files/datacite/datacite_result_23.json
+++ b/python/tests/files/datacite/datacite_result_23.json
@@ -1,25 +1,25 @@
 {
-  "extra": {
-    "datacite": {},
-    "release_month": 8
-  },
-  "title": "ABC",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1-xxx"
-  },
-  "language": "de",
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
+      "raw_affiliation": "Department of pataphysics",
       "raw_name": "Anton Welch",
-      "role": "author",
-      "raw_affiliation": "Department of pataphysics"
+      "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1-xxx"
+  },
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
+  "language": "de",
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "title": "ABC"
 }
diff --git a/python/tests/files/datacite/datacite_result_24.json b/python/tests/files/datacite/datacite_result_24.json
index 2d95d300..cd9898f9 100644
--- a/python/tests/files/datacite/datacite_result_24.json
+++ b/python/tests/files/datacite/datacite_result_24.json
@@ -1,25 +1,25 @@
 {
-  "extra": {
-    "datacite": {},
-    "release_month": 8
-  },
-  "title": "ABC",
-  "subtitle": "DEF",
-  "release_type": "article",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
+      "raw_affiliation": "Department of pataphysics",
       "raw_name": "Anton Welch",
-      "role": "author",
-      "raw_affiliation": "Department of pataphysics"
+      "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "article",
+  "release_year": 2017,
+  "subtitle": "DEF",
+  "title": "ABC"
 }
diff --git a/python/tests/files/datacite/datacite_result_25.json b/python/tests/files/datacite/datacite_result_25.json
index aad6d17e..6a29e8de 100644
--- a/python/tests/files/datacite/datacite_result_25.json
+++ b/python/tests/files/datacite/datacite_result_25.json
@@ -1,25 +1,25 @@
 {
-  "extra": {
-    "datacite": {},
-    "release_month": 8
-  },
-  "title": "Additional file 123: ABC",
-  "subtitle": "DEF",
-  "release_type": "stub",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
+      "raw_affiliation": "Department of pataphysics",
       "raw_name": "Anton Welch",
-      "role": "author",
-      "raw_affiliation": "Department of pataphysics"
+      "role": "author"
     }
   ],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "stub",
+  "release_year": 2017,
+  "subtitle": "DEF",
+  "title": "Additional file 123: ABC"
 }
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
index 8d26197c..267eb9c2 100644
--- a/python/tests/files/datacite/datacite_result_26.json
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -1,31 +1,33 @@
 {
-  "extra": {
-    "datacite": {},
-    "release_month": 8
-  },
-  "title": "Additional file 123: ABC",
-  "subtitle": "DEF",
-  "release_type": "stub",
-  "release_stage": "published",
-  "release_date": "2017-08-24",
-  "release_year": 2017,
-  "ext_ids": {
-    "doi": "10.7916/d86x0cg1"
-  },
+  "abstracts": [],
   "contribs": [
     {
       "index": 0,
+      "raw_affiliation": "Department of pataphysics",
       "raw_name": "Anton Welch",
-      "role": "author",
-      "raw_affiliation": "Department of pataphysics"
+      "role": "author"
     },
-      {
-        "extra": {"type": "Editor"},
-        "raw_name": "David Wemmer",
-        "given_name": "David",
-        "surname": "Wemmer"
-      }
+    {
+      "extra": {
+        "type": "Editor"
+      },
+      "given_name": "David",
+      "raw_name": "David Wemmer",
+      "surname": "Wemmer"
+    }
   ],
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
   "refs": [],
-  "abstracts": []
+  "release_date": "2017-08-24",
+  "release_stage": "published",
+  "release_type": "stub",
+  "release_year": 2017,
+  "subtitle": "DEF",
+  "title": "Additional file 123: ABC"
 }
-- 
cgit v1.2.3


From 277bd183d7139bb1a8857bc2a48c0aa92012455d Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 8 Jan 2020 23:19:48 +0100
Subject: datacite: pass in doi into factored out method

---
 python/fatcat_tools/importers/datacite.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 9ca72758..b1862b44 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -307,7 +307,7 @@ class DataciteImporter(EntityImporter):
         creators = attributes.get('creators', []) or []
         contributors = attributes.get('contributors', []) or []  # Much fewer than creators.
 
-        contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False)
+        contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -690,10 +690,11 @@ class DataciteImporter(EntityImporter):
                     extra=self.editgroup_extra),
                 entity_list=batch))
 
-    def parse_datacite_creators(self, creators, role='author', set_index=True):
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
         """
         Parses a list of creators into a list of ReleaseContrib objects. Set
         set_index to False, if the index contrib field should be left blank.
+        The doi parameter is only used for debugging.
         """
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
-- 
cgit v1.2.3