From fcc6f24a95a7b77bda4ec813daecc2b737a82412 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 7 Jul 2020 02:08:26 +0200 Subject: datacite: address duplicated contributor issue Use string comparison. * https://fatcat.wiki/release/spjysmrnsrgyzgq6ise5o44rlu/contribs * https://api.datacite.org/dois/10.25940/roper-31098406 --- python/fatcat_tools/importers/datacite.py | 16 ++++++ python/tests/files/datacite/datacite_doc_33.json | 62 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_05.json | 3 -- .../tests/files/datacite/datacite_result_08.json | 7 --- .../tests/files/datacite/datacite_result_33.json | 31 +++++++++++ python/tests/import_datacite.py | 2 +- 6 files changed, 110 insertions(+), 11 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_33.json create mode 100644 python/tests/files/datacite/datacite_result_33.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 434a2941..66ec2023 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -298,6 +298,9 @@ class DataciteImporter(EntityImporter): contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + # Address duplicated author names; use raw_name string comparison; refs #59. + contribs = unique_contributors(contribs) + # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" titles = attributes.get('titles', []) or [] @@ -823,6 +826,19 @@ class DataciteImporter(EntityImporter): return contribs +def unique_contributors(contribs): + """ + Given a list of ReleaseContrib items, return a list of unique + ReleaseContribs, refs GH #59. + """ + unique_names, unique_contribs = set(), [] + for rc in contribs: + if rc.raw_name and rc.raw_name in unique_names: + continue + unique_names.add(rc.raw_name) + unique_contribs.append(rc) + return unique_contribs + def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. 
diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json new file mode 100644 index 00000000..571d1220 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_33.json @@ -0,0 +1,62 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 79c2a8fb..d634490d 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -504,9 +504,6 @@ "role": "author", "surname": "Wurzbacher" }, - { - "raw_name": "Kessy Abarenkov" - }, { "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" } diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 70237280..5a46ef50 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -13,13 +13,6 @@ "raw_name": "Kei Kajisa", "role": "author", "surname": "Kajisa" - }, - { - "given_name": "Kei", - "index": 1, - "raw_name": "Kei Kajisa", - "role": "author", - "surname": "Kajisa" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json new file mode 100644 index 00000000..bcb72469 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_33.json @@ -0,0 +1,31 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "ABC News", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 20c1eaf8..1472b8ea 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -288,7 +288,7 @@ def test_datacite_conversions(datacite_importer): for now. 
""" datacite_importer.debug = True - for i in range(33): + for i in range(34): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) with open(src, 'r') as f: -- cgit v1.2.3 From 40f77b78aa331ca67b510dfece77e6a6000f8c2f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 00:50:34 +0200 Subject: wip: contrib, GH59 --- python/tests/files/datacite/datacite_doc_34.json | 61 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_05.json | 3 +- .../tests/files/datacite/datacite_result_09.json | 3 +- .../tests/files/datacite/datacite_result_26.json | 3 +- .../tests/files/datacite/datacite_result_34.json | 38 ++++++++++++++ 5 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_34.json create mode 100644 python/tests/files/datacite/datacite_result_34.json diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json new file mode 100644 index 00000000..5dcf65f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_34.json @@ -0,0 +1,61 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "illustrator" + } ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index d634490d..c91f3a7f 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -505,7 +505,8 @@ "surname": "Wurzbacher" }, { - "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 09e02fc7..f6ec524a 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -17,7 +17,8 @@ "extra": { "type": "DataManager" }, - "raw_name": "Technische Informationsbibliothek (TIB)" + "raw_name": "Technische Informationsbibliothek (TIB)", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json index 267eb9c2..f6e589ef 100644 --- a/python/tests/files/datacite/datacite_result_26.json 
+++ b/python/tests/files/datacite/datacite_result_26.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json new file mode 100644 index 00000000..8e087ab5 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_34.json @@ -0,0 +1,38 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "author" + }, + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "illustrator" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} -- cgit v1.2.3 From df8dcde8d5eaf530e35f1467951271bff7475e64 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 00:50:42 +0200 Subject: wip: contrib, GH59 --- python/fatcat_tools/importers/datacite.py | 38 +- python/tests/import_datacite.py | 590 ++++++++++++++++++------------ 2 files changed, 383 insertions(+), 245 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 66ec2023..7797812f 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -292,14 +292,17 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + contribs = self.parse_datacite_creators(creators, doi=doi) + contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) - # Address duplicated author names; use raw_name string comparison; refs #59. - contribs = unique_contributors(contribs) + # Unfortunately, creators and contributors might overlap, refs GH59. 
+ for cc in contribs_extra_contributors: + if contributor_list_contains_contributor(contribs, cc): + continue + contribs.append(cc) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -800,8 +803,7 @@ class DataciteImporter(EntityImporter): if contributorType: extra = {'type': contributorType} - contribs.append( - fatcat_openapi_client.ReleaseContrib( + rc = fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, raw_name=name, @@ -810,7 +812,9 @@ class DataciteImporter(EntityImporter): role=role, raw_affiliation=raw_affiliation, extra=extra, - )) + ) + if not contributor_list_contains_contributor(contribs, rc): + contribs.append(rc) elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -826,18 +830,20 @@ class DataciteImporter(EntityImporter): return contribs -def unique_contributors(contribs): +def contributor_list_contains_contributor(contributor_list, contributor): """ - Given a list of ReleaseContrib items, return a list of unique - ReleaseContribs, refs GH #59. + Given a list of contributors, determine, whether contrib is in that list. """ - unique_names, unique_contribs = set(), [] - for rc in contribs: - if rc.raw_name and rc.raw_name in unique_names: + for cc in contributor_list: + if cc.raw_name != contributor.raw_name: + continue + cc_role = cc.role or 'author' + contributor_role = contributor.role or 'author' + if cc_role != contributor_role: continue - unique_names.add(rc.raw_name) - unique_contribs.append(rc) - return unique_contribs + return True + return False + def lookup_license_slug(raw): """ diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 1472b8ea..b01a11e6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -7,33 +7,54 @@ import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name, lookup_license_slug +from fatcat_tools.importers.datacite import ( + find_original_language_title, + parse_datacite_titles, + parse_datacite_dates, + clean_doi, + index_form_to_display_name, + lookup_license_slug, + contributor_list_contains_contributor, +) from fatcat_tools.transforms import entity_to_dict +import fatcat_openapi_client from fixtures import api import json @pytest.fixture(scope="function") def datacite_importer(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=True) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=True, + ) + @pytest.fixture(scope="function") def datacite_importer_existing(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=False) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=False, + ) + @pytest.mark.skip(reason="larger datacite import slows tests down") def test_datacite_importer_huge(datacite_importer): last_index = 
datacite_importer.api.get_changelog(limit=1)[0].index - with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 998 - change = datacite_importer.api.get_changelog_entry(index=last_index+1) - release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert counts["insert"] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) + release = datacite_importer.api.get_release( + change.editgroup.edits.releases[0].ident + ) assert len(release.contribs) == 3 @@ -41,122 +62,161 @@ def test_find_original_language_title(): """ Original language might be included, in various ways. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('defaults to None', {}, None), - Case('ignore unknown keys', {'broken': 'kv'}, None), - Case('just a title', {'title': 'Noise Reduction'}, None), - Case('same title should be ignored', { - 'title': 'Noise Reduction', - 'original_language_title': 'Noise Reduction' - }, None), - Case('empty subdict is ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {}, - }, None), - Case('unknown subdict keys are ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {'broken': 'kv'}, - }, None), - Case('original string', { - 'title': 'Noise Reduction', - 'original_language_title': 'Подавление шума', - }, 'Подавление шума'), - Case('language tag is ignored, since its broken', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Noise Reduction' + Case("defaults to None", {}, None), + Case("ignore unknown keys", {"broken": "kv"}, None), + Case("just a title", {"title": "Noise Reduction"}, None), + Case( + "same title should be ignored", + {"title": "Noise Reduction", "original_language_title": "Noise Reduction"}, + None, + ), + Case( + "empty subdict is ignored", + {"title": "Noise Reduction", "original_language_title": {},}, + None, + ), + Case( + "unknown subdict keys are ignored", + {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},}, + None, + ), + Case( + "original string", + {"title": "Noise Reduction", "original_language_title": "Подавление шума",}, + "Подавление шума", + ), + Case( + "language tag is ignored, since its broken", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Noise Reduction", + }, }, - }, None), - Case('do not care about language', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Rauschunterdrückung', + None, + ), + Case( + "do not care about language", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Rauschunterdrückung", + }, }, - }, 'Rauschunterdrückung'), - Case('ignore excessive questionmarks', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': '???? However', + "Rauschunterdrückung", + ), + Case( + "ignore excessive questionmarks", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "???? 
However", + }, }, - }, None), + None, + ), ] for case in cases: result = find_original_language_title(case.input) assert result == case.result + def test_parse_datacite_titles(): """ Given a list of titles, find title, original_language_title and subtitle. Result is a 3-tuple of title, original_language_title, subtitle. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('handle None', None, (None, None, None)), - Case('empty list', [], (None, None, None)), - Case('empty item', [{}], (None, None, None)), - Case('broken keys', [{'broken': 'kv'}], (None, None, None)), - Case('title only', [{'title': 'Total carbon dioxide'}], - ('Total carbon dioxide', None, None), - ), - Case('title and subtitle', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, subtitle order does not matter', [ - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Total carbon dioxide'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, first wins', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - ], - ('Total carbon dioxide', None, None), - ), - Case('multiple titles, plus sub', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, multiple subs', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), - ), - Case('title, original same as title, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Total carbon dioxide', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original dict, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Всего углекислого газа', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + Case("handle None", None, (None, None, None)), + Case("empty list", [], (None, None, None)), + Case("empty item", [{}], (None, None, None)), + Case("broken keys", [{"broken": "kv"}], (None, None, None)), + Case( + "title only", + [{"title": "Total carbon dioxide"}], + ("Total carbon dioxide", None, None), + ), + Case( + "title and subtitle", + [ + {"title": "Total carbon dioxide"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, subtitle order does not matter", + [ + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Total carbon dioxide"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, first wins", + [{"title": "Total carbon dioxide"}, {"title": "Meeting 
Heterogeneity"},], + ("Total carbon dioxide", None, None), + ), + Case( + "multiple titles, plus sub", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, multiple subs", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Some other subtitle", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": "Всего углекислого газа", + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), + ), + Case( + "title, original same as title, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": {"__content__": "Total carbon dioxide",}, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original dict, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": { + "__content__": "Всего углекислого газа", + }, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), ), ] @@ -164,91 +224,128 @@ def test_parse_datacite_titles(): result = parse_datacite_titles(case.input) assert result == case.result, case.about + def test_parse_datacite_dates(): """ Test datacite date parsing. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('None is None', None, (None, None, None)), - Case('empty list is None', [], (None, None, None)), - Case('empty item is None', [{}], (None, None, None)), - Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), - Case('int year', [{'date': 2019}], (None, None, 2019)), - Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), - Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), - Case('first with type', [ - {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} - ], (None, None, 2019)), - Case('full date', [ - {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), 12, 2019)), - Case('date type prio', [ - {'date': '2000-12-01', 'dateType': 'Valid'}, - {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('date type prio, Available > Updated', [ - {'date': '2010-01-01', 'dateType': 'Updated'}, - {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow fuzzy date formats, Available > Updated', [ - {'date': '2010', 'dateType': 'Updated'}, - {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - 
Case('fuzzy year only', [ - {'date': 'Year 2010', 'dateType': 'Issued'}, - ], (None, None, 2010)), - Case('fuzzy year and month', [ - {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, - ], (None, 2, 2010)), - Case('fuzzy year, month, day', [ - {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, - ], (datetime.date(2010, 2, 24), 2, 2010)), - Case('ignore broken date', [ - {'date': 'Febrrr 45', 'dateType': 'Updated'}, - ], (None, None, None)), + Case("None is None", None, (None, None, None)), + Case("empty list is None", [], (None, None, None)), + Case("empty item is None", [{}], (None, None, None)), + Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)), + Case("int year", [{"date": 2019}], (None, None, 2019)), + Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)), + Case( + "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020) + ), + Case( + "first with type", + [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}], + (None, None, 2019), + ), + Case( + "full date", + [{"date": "2019-12-01", "dateType": "Valid"},], + (datetime.date(2019, 12, 1), 12, 2019), + ), + Case( + "date type prio", + [ + {"date": "2000-12-01", "dateType": "Valid"}, + {"date": "2010-01-01", "dateType": "Updated"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "date type prio, Available > Updated", + [ + {"date": "2010-01-01", "dateType": "Updated"}, + {"date": "2000-12-01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00Z", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow fuzzy date formats, Available > Updated", + [ + {"date": "2010", "dateType": "Updated"}, + {"date": "2000 Dec 01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "fuzzy year only", + [{"date": "Year 2010", "dateType": "Issued"},], + (None, None, 2010), + ), + Case( + "fuzzy year and month", + [{"date": "Year 2010 Feb", "dateType": "Issued"},], + (None, 2, 2010), + ), + Case( + "fuzzy year, month, day", + [{"date": "Year 2010 Feb 24", "dateType": "Issued"},], + (datetime.date(2010, 2, 24), 2, 2010), + ), + Case( + "ignore broken date", + [{"date": "Febrrr 45", "dateType": "Updated"},], + (None, None, None), + ), ] for case in cases: result = parse_datacite_dates(case.input) assert result == case.result, case.about + def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 1 - assert counts['exists'] == 0 - assert counts['skip'] == 0 + assert counts["insert"] == 1 + assert counts["exists"] == 0 + assert counts["skip"] == 0 # fetch most recent editgroup - change = datacite_importer.api.get_changelog_entry(index=last_index+1) + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) eg = change.editgroup assert eg.description assert "datacite" in 
eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + assert eg.extra["git_rev"] + assert "fatcat_tools.DataciteImporter" in eg.extra["agent"] last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = False datacite_importer.reset() counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 0 - assert counts['exists'] == 1 - assert counts['skip'] == 0 + assert counts["insert"] == 0 + assert counts["exists"] == 1 + assert counts["skip"] == 0 assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + def test_datacite_dict_parse(datacite_importer): - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: raw = json.load(f) r = datacite_importer.parse_record(raw) # ensure the API server is ok with format @@ -256,7 +353,9 @@ def test_datacite_dict_parse(datacite_importer): print(r.extra) assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" - assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert ( + r.publisher == "International Centre for Agricultural Research in Dry Areas" + ) assert r.release_type == "article" assert r.release_stage == "published" assert r.license_slug == None @@ -267,13 +366,15 @@ def test_datacite_dict_parse(datacite_importer): assert r.subtitle == None assert r.release_date == None assert r.release_year == 1986 - assert 'subtitle' not in r.extra - assert 'subtitle' not in r.extra['datacite'] - assert 'funder' not in r.extra - assert 'funder' not in r.extra['datacite'] + assert "subtitle" not in r.extra + assert "subtitle" not in r.extra["datacite"] + assert "funder" not in r.extra + assert "funder" not in r.extra["datacite"] # matched by ISSN, so shouldn't be in there - #assert extra['container_name'] == "International Journal of Quantum Chemistry" - assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + # assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra["datacite"]["subjects"] == [ + {"subject": "Plant Genetic Resource for Food and Agriculture"} + ] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 assert len(r.contribs) == 2 @@ -282,34 +383,41 @@ def test_datacite_dict_parse(datacite_importer): assert r.contribs[0].surname == None assert len(r.refs) == 0 + def test_datacite_conversions(datacite_importer): """ Datacite JSON to release entity JSON representation. The count is hardcoded for now. 
""" datacite_importer.debug = True - for i in range(34): - src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) - dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - with open(src, 'r') as f: + for i in range(35): + src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) + dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) + with open(src, "r") as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) - with open(dst, 'r') as f: - expected = json.loads(f.read()) + with open(dst, "r") as f: + expected = json.loads(f.read()) + + assert result == expected, "output mismatch in {}".format(dst) - assert result == expected, 'output mismatch in {}'.format(dst) def test_index_form_to_display_name(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('', ''), - Case('ABC', 'ABC'), - Case('International Space Station', 'International Space Station'), - Case('Jin, Shan', 'Shan Jin'), - Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), - Case('Solomon, P. M.', 'P. M. Solomon'), - Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), - Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + Case("", ""), + Case("ABC", "ABC"), + Case("International Space Station", "International Space Station"), + Case("Jin, Shan", "Shan Jin"), + Case( + "Volkshochschule Der Bundesstadt Bonn", + "Volkshochschule Der Bundesstadt Bonn", + ), + Case("Solomon, P. M.", "P. M. Solomon"), + Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"), + Case( + "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler" + ), ] for c in cases: @@ -317,45 +425,69 @@ def test_index_form_to_display_name(): def test_lookup_license_slug(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('https://opensource.org/licenses/MIT', 'MIT'), - Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'), - Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'), - Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'), - Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'), - Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'), - Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'), - Case('http://www.springer.com/tdm', 'SPRINGER-TDM'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'), - Case('https://creativecommons.org/public-domain/cc0', 'CC-0'), - Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'), - Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'), - Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'), - Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'), - Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'), - Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'), - Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'), - 
Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'), - Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'), - Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'), - Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/MIT.json', 'MIT'), - Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'), + Case("https://opensource.org/licenses/MIT", "MIT"), + Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"), + Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"), + Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"), + Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"), + Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"), + Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"), + Case("http://www.springer.com/tdm", "SPRINGER-TDM"), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml", + "ADS-UK", + ), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK" + ), + Case("https://creativecommons.org/public-domain/cc0", "CC-0"), + Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"), + Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"), + Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"), + Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"), + Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"), + Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"), + Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"), + Case( + "http://journals.sagepub.com/page/policies/text-and-data-mining-license", + "SAGE-TDM", + ), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"), + Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"), + Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"), + Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/MIT.json", "MIT"), + 
Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"), ] for c in cases: got = lookup_license_slug(c.input) - assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output) + assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output) + + +def test_contributor_list_contains_contributor(): + Case = collections.namedtuple("Case", "contrib_list contrib want") + cases = [ + Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False), + ] + for c in cases: + got = contributor_list_contains_contributor(c.contrib_list, c.contrib) + assert got == c.want -- cgit v1.2.3 From 2411bad315b48b99c19958ea3c393dc4d09d6486 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:00 +0200 Subject: datacite: document contributor types --- python/fatcat_tools/importers/datacite.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 7797812f..797ccf19 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -296,6 +296,31 @@ class DataciteImporter(EntityImporter): contributors = attributes.get('contributors', []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) + + # Beside creators, we have contributors in datacite. Sample: + # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, + # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, + # ProjectMember, RelatedPerson, ResearchGroup, Researcher, + # RightsHolder, Sponsor, Supervisor + # + # Datacite schema: + # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 + # -- could be used as a form of controlled vocab? + # + # Currently (07/2020) in release_contrib: + # + # select count(*), role from release_contrib group by role; + # count | role + # -----------+------------ + # 500269665 | author + # 4386563 | editor + # 17871 | translator + # 10870584 | + # (4 rows) + # + # Related: https://guide.fatcat.wiki/entity_release.html -- role + # (string, of a set): the type of contribution, from a controlled + # vocabulary. TODO: vocabulary needs review. contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) # Unfortunately, creators and contributors might overlap, refs GH59. -- cgit v1.2.3 From d2bcd77f73c6496a2ffdd865d2348f33f4fb17f1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:31 +0200 Subject: datacite: there should be no index gaps --- python/fatcat_tools/importers/datacite.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 797ccf19..962d80c6 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -758,9 +758,10 @@ class DataciteImporter(EntityImporter): # Names, that should be ignored right away. name_blacklist = set(('Occdownload Gbif.Org',)) - for i, c in enumerate(creators): + i = 0 + for c in creators: if not set_index: - i = None + i = None nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None @@ -838,8 +839,11 @@ class DataciteImporter(EntityImporter): raw_affiliation=raw_affiliation, extra=extra, ) + # Filter out duplicates early. 
if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) + if i is not None: + i += 1 elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -849,6 +853,8 @@ class DataciteImporter(EntityImporter): extra = {'organization': name} contribs.append(fatcat_openapi_client.ReleaseContrib( index=i, extra=extra)) + if i is not None: + i += 1 else: print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) -- cgit v1.2.3 From fdf1028c19b0623e30b91e49ffa65ed130dcfdc1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:47 +0200 Subject: datacite: adjust tests --- python/tests/files/datacite/datacite_result_27.json | 3 ++- python/tests/files/datacite/datacite_result_28.json | 3 ++- python/tests/files/datacite/datacite_result_29.json | 3 ++- python/tests/files/datacite/datacite_result_34.json | 7 ------- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json index 3d033e6a..e934fb41 100644 --- a/python/tests/files/datacite/datacite_result_27.json +++ b/python/tests/files/datacite/datacite_result_27.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_28.json +++ b/python/tests/files/datacite/datacite_result_28.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_29.json +++ b/python/tests/files/datacite/datacite_result_29.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json index 8e087ab5..4a52e22c 100644 --- a/python/tests/files/datacite/datacite_result_34.json +++ b/python/tests/files/datacite/datacite_result_34.json @@ -12,13 +12,6 @@ "index": 0, "raw_name": "Paul Katz", "role": "author" - }, - { - "given_name": "", - "surname": "", - "index": 0, - "raw_name": "Paul Katz", - "role": "illustrator" } ], "ext_ids": { -- cgit v1.2.3
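
Note on the deduplication logic introduced in this series: contributor_list_contains_contributor treats two entries as duplicates only if both the raw_name and the effective role match, where a missing role defaults to "author". The sketch below is a minimal, self-contained illustration of that comparison; it stands in for fatcat_openapi_client.ReleaseContrib with a plain namedtuple carrying only the raw_name and role fields the check inspects, so the type here is a substitute for illustration, not the real client class.

import collections

# Stand-in for fatcat_openapi_client.ReleaseContrib: the duplicate check
# only ever looks at raw_name and role.
Contrib = collections.namedtuple("Contrib", "raw_name role")

def contributor_list_contains_contributor(contributor_list, contributor):
    # Mirrors the patched helper: same raw_name and same effective role
    # (None is normalized to "author") means the entry is already present.
    for cc in contributor_list:
        if cc.raw_name != contributor.raw_name:
            continue
        if (cc.role or "author") != (contributor.role or "author"):
            continue
        return True
    return False

# An empty list never contains anything (the case covered by
# test_contributor_list_contains_contributor).
assert contributor_list_contains_contributor([], Contrib("Paul Katz", None)) is False

# A repeated name with the same effective role is a duplicate...
contribs = [Contrib("Paul Katz", "author")]
assert contributor_list_contains_contributor(contribs, Contrib("Paul Katz", None)) is True

# ...while the same name under a different role is kept as a separate entry.
assert contributor_list_contains_contributor(contribs, Contrib("Paul Katz", "illustrator")) is False

Comparing on the (raw_name, effective role) pair rather than on raw_name alone keeps legitimate double listings (the same person credited once as author and once in another role) while dropping the exact author/author repeats reported in GH #59.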