datacite: factor out contributor handling

Use values from both attributes.creators[] and attributes.contributors[].
author: Martin Czygan <martin.czygan@gmail.com> 2020-01-08 22:33:58 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-01-08 22:33:58 +0100
commit: 62d6a7e48d6bea1bc7f451c6043f38aee2051f9b (patch)
tree: c28079d6f3a12f106607fb6fc1fa9a4a5c83d9de
parent: b7a325360ca8ae3107411e9e1966d93b999bbb52 (diff)
download: fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.tar.gz
fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.zip
6 files changed, 210 insertions, 82 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fc986994..9ca72758 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter):
             print('[{}] skipping non-ascii doi for now'.format(doi))
             return None
 
-        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
-        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
-        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
-        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
-        contribs = []
-
-        # Names, that should be ignored right away.
-        name_blacklist = set(('Occdownload Gbif.Org',))
-
-        for i, c in enumerate(attributes['creators']):
-            nameType = c.get('nameType', '') or ''
-            if nameType in ('', 'Personal'):
-                creator_id = None
-                for nid in c.get('nameIdentifiers', []):
-                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
-                    if not name_scheme.lower() == "orcid":
-                        continue
-                    orcid = nid.get('nameIdentifier',
-                                    '').replace('https://orcid.org/', '')
-                    if not orcid:
-                        continue
-                    creator_id = self.lookup_orcid(orcid)
-                    # TODO(martin): If creator_id is None, should we create creators?
-
-                # If there are multiple affiliation strings, use the first one.
-                affiliations = c.get('affiliation', []) or []
-                raw_affiliation = None
-                if len(affiliations) == 0:
-                    raw_affiliation = None
-                else:
-                    raw_affiliation = clean(affiliations[0])
-
-                name = c.get('name')
-                given_name = c.get('givenName')
-                surname = c.get('familyName')
-
-                if name:
-                    name = clean(name)
-
-                if name in name_blacklist:
-                    continue
-
-                if given_name:
-                    given_name = clean(given_name)
-
-                if surname:
-                    surname = clean(surname)
-
-                if not name:
-                    continue
-
-                if raw_affiliation == '':
-                    continue
 
-                if name.lower() in UNKNOWN_MARKERS:
-                    continue
+        creators = attributes.get('creators', []) or []
+        contributors = attributes.get('contributors', []) or []  # Much fewer than creators.
 
-                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
-                if name:
-                    name = index_form_to_display_name(name)
-
-                contribs.append(
-                    fatcat_openapi_client.ReleaseContrib(
-                        creator_id=creator_id,
-                        index=i,
-                        raw_name=name,
-                        given_name=given_name,
-                        surname=surname,
-                        role='author',
-                        raw_affiliation=raw_affiliation,
-                    ))
-            elif nameType == 'Organizational':
-                name = c.get('name', '') or ''
-                if name in UNKNOWN_MARKERS:
-                    continue
-                if len(name) < 3:
-                    continue
-                extra = {'organization': name}
-                contribs.append(fatcat_openapi_client.ReleaseContrib(
-                    index=i, extra=extra))
-            else:
-                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+        contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter):
                     extra=self.editgroup_extra),
                 entity_list=batch))
 
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+        """
+        Parses a list of DataCite creators or contributors into a list of
+        ReleaseContrib objects.
+
+        Entries with nameType 'Personal' (or none) yield contribs with the
+        given role, unless the name is missing or filtered; 'Organizational'
+        entries yield contribs with only index and extra['organization'];
+        any other nameType is reported on stderr and skipped.
+
+        Set set_index to False, if the index contrib field should be left
+        blank. The optional doi is only used to label the stderr message.
+        """
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
+        contribs = []
+
+        # Names, that should be ignored right away.
+        name_blacklist = set(('Occdownload Gbif.Org',))
+
+        for i, c in enumerate(creators):
+            if not set_index:
+                i = None
+            nameType = c.get('nameType', '') or ''
+            if nameType in ('', 'Personal'):
+                creator_id = None
+                for nid in c.get('nameIdentifiers', []) or []:
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if name_scheme.lower() != "orcid":
+                        continue
+                    orcid = (nid.get('nameIdentifier', '') or '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = clean(affiliations[0]) if affiliations else None
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A Razis'.
+                name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                # NOTE(review): an affiliation that cleans to '' drops the whole
+                # contrib; kept as in the previous inline code - confirm intent.
+                if raw_affiliation == '':
+                    continue
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+                extra = {'type': contributorType} if contributorType else None
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name in UNKNOWN_MARKERS or len(name) < 3:
+                    continue
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                # doi is a parameter here: this method has no doi local to fall back on.
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
 
 def lookup_license_slug(raw):
     """
@@ -971,6 +992,8 @@ def index_form_to_display_name(s):
     if s.count(',') > 1:
         # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
         return s
+
+    # Not names, but sprinkled in fields where authors live.
     stopwords = [s.lower() for s in (
         'Archive',
         'Collection',
diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json
new file mode 100644
index 00000000..c2abb1b2
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_26.json
@@ -0,0 +1,57 @@
+{
+  "attributes": {
+    "doi": "10.7916/d86x0cg1",
+    "creators": [
+      {
+        "name": "Anton Welch",
+        "affiliation": [
+          "Department of pataphysics"
+        ],
+        "nameIdentifiers": []
+      }
+    ],
+    "contributors": [
+      {
+        "name": "Wemmer, David",
+        "nameType": "Personal",
+        "givenName": "David",
+        "familyName": "Wemmer",
+        "affiliation": [],
+        "contributorType": "Editor"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Additional file 123: ABC"
+      },
+      {
+        "title": "DEF",
+        "titleType": "Subtitle"
+      }
+    ],
+    "publicationYear": 2016,
+    "language": "DE-CH",
+    "types": {
+      "ris": "GEN",
+      "bibtex": "misc",
+      "citeproc": "article",
+      "schemaOrg": "CreativeWork"
+    },
+    "dates": [
+      {
+        "date": "2017-08-24",
+        "dateType": "Created"
+      },
+      {
+        "date": "2019-08-04",
+        "dateType": "Updated"
+      },
+      {
+        "date": "2017",
+        "dateType": "Issued"
+      }
+    ],
+    "isActive": true,
+    "state": "findable"
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 22542a10..c4e5418d 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -523,6 +523,12 @@
       "given_name": "Christian",
       "surname": "Wurzbacher",
       "role": "author"
+    },
+    {
+      "raw_name": "Kessy Abarenkov"
+    },
+    {
+      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
     }
   ],
   "refs": [],
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index fd873309..c93dc769 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -32,6 +32,17 @@
       "given_name": "Nils",
       "surname": "Kirstaedter",
       "role": "author"
+    },
+    {
+      "extra": {
+        "organization": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover"
+      }
+    },
+    {
+      "raw_name": "Technische Informationsbibliothek (TIB)",
+      "extra": {
+        "type": "DataManager"
+      }
     }
   ],
   "refs": [],
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
new file mode 100644
index 00000000..8d26197c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -0,0 +1,31 @@
+{
+  "extra": {
+    "datacite": {},
+    "release_month": 8
+  },
+  "title": "Additional file 123: ABC",
+  "subtitle": "DEF",
+  "release_type": "stub",
+  "release_stage": "published",
+  "release_date": "2017-08-24",
+  "release_year": 2017,
+  "ext_ids": {
+    "doi": "10.7916/d86x0cg1"
+  },
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "Anton Welch",
+      "role": "author",
+      "raw_affiliation": "Department of pataphysics"
+    },
+      {
+        "extra": {"type": "Editor"},
+        "raw_name": "David Wemmer",
+        "given_name": "David",
+        "surname": "Wemmer"
+      }
+  ],
+  "refs": [],
+  "abstracts": []
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 7293ecac..5ad7ef2c 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -275,7 +275,7 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
         assert len(r.abstracts) == 1
         assert len(r.abstracts[0].content) == 421
-        assert len(r.contribs) == 1
+        assert len(r.contribs) == 2
         assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
         assert r.contribs[0].given_name == None
         assert r.contribs[0].surname == None
@@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(26):
+    for i in range(27):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
         print('testing mapping from {} => {}'.format(src, dst))
author	Martin Czygan <martin.czygan@gmail.com>	2020-01-08 22:33:58 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-01-08 22:33:58 +0100
commit	62d6a7e48d6bea1bc7f451c6043f38aee2051f9b (patch)
tree	c28079d6f3a12f106607fb6fc1fa9a4a5c83d9de
parent	b7a325360ca8ae3107411e9e1966d93b999bbb52 (diff)
download	fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.tar.gz fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.zip