Merge branch 'martin-datacite-duplicated-author-gh-59' into 'master'

datacite: address duplicated contributor issue See merge request webgroup/fatcat!65
author: bnewbold <bnewbold@archive.org> 2020-07-11 00:31:47 +0000
committer: bnewbold <bnewbold@archive.org> 2020-07-11 00:31:47 +0000
commit: f5aefab6a6431ab9db99761457fd47b36b920b8c (patch)
tree: d144988d310aeecf8521cfc33aca9f0667dfedbc
parent: 26b455ffad566bef58684a78654a2719c409588a (diff)
parent: 3c266e07771271241aa8cff3e3199a45109362af (diff)
download: fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz
fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip
13 files changed, 619 insertions, 251 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 785107ee..ebb29feb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter):
         creators = attributes.get('creators', []) or []
         contributors = attributes.get('contributors', []) or []  # Much fewer than creators.
 
-        contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
+        contribs = self.parse_datacite_creators(creators, doi=doi)
+
+        # Besides creators, datacite also has contributors. Sample contributor types:
+        # ContactPerson, DataCollector, DataCurator, DataManager, Distributor,
+        # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader,
+        # ProjectMember, RelatedPerson, ResearchGroup, Researcher,
+        # RightsHolder, Sponsor, Supervisor
+        #
+        # Datacite schema:
+        # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32
+        # -- could be used as a form of controlled vocab?
+        #
+        # Currently (07/2020) in release_contrib:
+        #
+        # select count(*), role from release_contrib group by role;
+        #    count   |    role
+        # -----------+------------
+        #  500269665 | author
+        #    4386563 | editor
+        #      17871 | translator
+        #   10870584 |
+        # (4 rows)
+        #
+        # Related: https://guide.fatcat.wiki/entity_release.html -- role
+        # (string, of a set): the type of contribution, from a controlled
+        # vocabulary. TODO: vocabulary needs review.
+        contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+
+        # Unfortunately, creators and contributors might overlap, refs GH59.
+        for cc in contribs_extra_contributors:
+            if contributor_list_contains_contributor(contribs, cc):
+                continue
+            contribs.append(cc)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter):
         # Names, that should be ignored right away.
         name_blacklist = set(('Occdownload Gbif.Org',))
 
-        for i, c in enumerate(creators):
+        i = 0
+        for c in creators:
             if not set_index:
-                i = None
+               i = None
             nameType = c.get('nameType', '') or ''
             if nameType in ('', 'Personal'):
                 creator_id = None
@@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter):
                 if contributorType:
                     extra = {'type': contributorType}
 
-                contribs.append(
-                    fatcat_openapi_client.ReleaseContrib(
+                rc = fatcat_openapi_client.ReleaseContrib(
                         creator_id=creator_id,
                         index=i,
                         raw_name=name,
@@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter):
                         role=role,
                         raw_affiliation=raw_affiliation,
                         extra=extra,
-                    ))
+                    )
+                # Filter out duplicates early.
+                if not contributor_list_contains_contributor(contribs, rc):
+                    contribs.append(rc)
+                    if i is not None:
+                        i += 1
             elif nameType == 'Organizational':
                 name = c.get('name', '') or ''
                 if name in UNKNOWN_MARKERS:
@@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter):
                 extra = {'organization': name}
                 contribs.append(fatcat_openapi_client.ReleaseContrib(
                     index=i, extra=extra))
+                if i is not None:
+                    i += 1
             else:
                 print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
 
         return contribs
 
 
+def contributor_list_contains_contributor(contributor_list, contributor):
+    """
+    Given a list of contributors, determine whether contributor is in that list.
+    """
+    for cc in contributor_list:
+        if cc.raw_name != contributor.raw_name:
+            continue
+        cc_role = cc.role or 'author'
+        contributor_role = contributor.role or 'author'
+        if cc_role != contributor_role:
+            continue
+        return True
+    return False
+
+
 def lookup_license_slug(raw):
     """
     Resolve a variety of strings into a some pseudo-canonical form, e.g.
diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json
new file mode 100644
index 00000000..571d1220
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_33.json
@@ -0,0 +1,62 @@
+{
+  "id": "10.17912/micropub.biology.000143",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.17912/micropub.biology.000143",
+    "identifiers": null,
+    "creators": [
+      {
+        "name": "ABC News",
+        "givenName": "",
+        "familyName": "",
+        "affiliation": [],
+        "role": "author"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Sample"
+      }
+    ],
+    "publisher": "microPublication Biology",
+    "publicationYear": 2019,
+    "types": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": 1234567890,
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+    "created": "2019-08-19T14:43:08.000Z",
+    "registered": "2019-08-19T14:43:09.000Z",
+    "published": "2019",
+    "updated": "2019-11-09T12:32:02.000Z",
+    "contributors": [
+      {
+        "name": "ABC News",
+        "givenName": "",
+        "familyName": "",
+        "affiliation": [],
+        "role": ""
+      }
+    ]
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "caltech.micropub",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json
new file mode 100644
index 00000000..5dcf65f4
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_34.json
@@ -0,0 +1,61 @@
+{
+  "id": "10.17912/micropub.biology.000143",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.17912/micropub.biology.000143",
+    "identifiers": null,
+    "creators": [
+      {
+        "name": "Paul Katz",
+        "givenName": "",
+        "familyName": "",
+        "affiliation": [],
+        "role": "author"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Sample"
+      }
+    ],
+    "publisher": "microPublication Biology",
+    "publicationYear": 2019,
+    "types": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": 1234567890,
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+    "created": "2019-08-19T14:43:08.000Z",
+    "registered": "2019-08-19T14:43:09.000Z",
+    "published": "2019",
+    "updated": "2019-11-09T12:32:02.000Z",
+    "contributors": [
+      {
+        "name": "Paul Katz",
+        "givenName": "",
+        "familyName": "",
+        "affiliation": [],
+        "role": "illustrator"
+      } ]
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "caltech.micropub",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 79c2a8fb..c91f3a7f 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -505,10 +505,8 @@
       "surname": "Wurzbacher"
     },
     {
-      "raw_name": "Kessy Abarenkov"
-    },
-    {
-      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
+      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 70237280..5a46ef50 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -13,13 +13,6 @@
       "raw_name": "Kei Kajisa",
       "role": "author",
       "surname": "Kajisa"
-    },
-    {
-      "given_name": "Kei",
-      "index": 1,
-      "raw_name": "Kei Kajisa",
-      "role": "author",
-      "surname": "Kajisa"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index 09e02fc7..f6ec524a 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -17,7 +17,8 @@
       "extra": {
         "type": "DataManager"
       },
-      "raw_name": "Technische Informationsbibliothek (TIB)"
+      "raw_name": "Technische Informationsbibliothek (TIB)",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
index 267eb9c2..f6e589ef 100644
--- a/python/tests/files/datacite/datacite_result_26.json
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -13,7 +13,8 @@
       },
       "given_name": "David",
       "raw_name": "David Wemmer",
-      "surname": "Wemmer"
+      "surname": "Wemmer",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json
index 3d033e6a..e934fb41 100644
--- a/python/tests/files/datacite/datacite_result_27.json
+++ b/python/tests/files/datacite/datacite_result_27.json
@@ -13,7 +13,8 @@
       },
       "given_name": "David",
       "raw_name": "David Wemmer",
-      "surname": "Wemmer"
+      "surname": "Wemmer",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json
index 84bed9c8..bcb1caaf 100644
--- a/python/tests/files/datacite/datacite_result_28.json
+++ b/python/tests/files/datacite/datacite_result_28.json
@@ -13,7 +13,8 @@
       },
       "given_name": "David",
       "raw_name": "David Wemmer",
-      "surname": "Wemmer"
+      "surname": "Wemmer",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json
index 84bed9c8..bcb1caaf 100644
--- a/python/tests/files/datacite/datacite_result_29.json
+++ b/python/tests/files/datacite/datacite_result_29.json
@@ -13,7 +13,8 @@
       },
       "given_name": "David",
       "raw_name": "David Wemmer",
-      "surname": "Wemmer"
+      "surname": "Wemmer",
+      "role": "author"
     }
   ],
   "ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json
new file mode 100644
index 00000000..bcb72469
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_33.json
@@ -0,0 +1,31 @@
+{
+  "abstracts": [
+    {
+      "content": "1234567890",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "given_name": "",
+      "surname": "",
+      "index": 0,
+      "raw_name": "ABC News",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.17912/micropub.biology.000143"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "container_name": "microPublication Biology"
+  },
+  "refs": [],
+  "release_stage": "published",
+  "release_year": 2019,
+  "publisher": "microPublication Biology",
+  "title": "Sample"
+}
diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json
new file mode 100644
index 00000000..4a52e22c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_34.json
@@ -0,0 +1,31 @@
+{
+  "abstracts": [
+    {
+      "content": "1234567890",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "given_name": "",
+      "surname": "",
+      "index": 0,
+      "raw_name": "Paul Katz",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.17912/micropub.biology.000143"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "container_name": "microPublication Biology"
+  },
+  "refs": [],
+  "release_stage": "published",
+  "release_year": 2019,
+  "publisher": "microPublication Biology",
+  "title": "Sample"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 8fb2d079..b94b6bc5 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -10,32 +10,54 @@ import collections
 import pytest
 
 from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, index_form_to_display_name, lookup_license_slug
+from fatcat_tools.importers.datacite import (
+    find_original_language_title,
+    parse_datacite_titles,
+    parse_datacite_dates,
+    clean_doi,
+    index_form_to_display_name,
+    lookup_license_slug,
+    contributor_list_contains_contributor,
+)
 from fatcat_tools.transforms import entity_to_dict
-from fixtures import *
+import fatcat_openapi_client
+from fixtures import api
+import json
 
 
 @pytest.fixture(scope="function")
 def datacite_importer(api):
-    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
-                               bezerk_mode=True)
+    with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+        yield DataciteImporter(
+            api,
+            issn_file,
+            extid_map_file="tests/files/example_map.sqlite3",
+            bezerk_mode=True,
+        )
+
 
 @pytest.fixture(scope="function")
 def datacite_importer_existing(api):
-    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
-                               bezerk_mode=False)
+    with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+        yield DataciteImporter(
+            api,
+            issn_file,
+            extid_map_file="tests/files/example_map.sqlite3",
+            bezerk_mode=False,
+        )
+
 
 @pytest.mark.skip(reason="larger datacite import slows tests down")
 def test_datacite_importer_huge(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
-    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+    with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f:
         datacite_importer.bezerk_mode = True
         counts = JsonLinePusher(datacite_importer, f).run()
-    assert counts['insert'] == 998
-    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
-    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert counts["insert"] == 998
+    change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
+    release = datacite_importer.api.get_release(
+        change.editgroup.edits.releases[0].ident
+    )
     assert len(release.contribs) == 3
 
 
@@ -43,122 +65,161 @@ def test_find_original_language_title():
     """
     Original language might be included, in various ways.
     """
-    Case = collections.namedtuple('Case', 'about input result')
+    Case = collections.namedtuple("Case", "about input result")
     cases = [
-        Case('defaults to None', {}, None),
-        Case('ignore unknown keys', {'broken': 'kv'}, None),
-        Case('just a title', {'title': 'Noise Reduction'}, None),
-        Case('same title should be ignored', {
-            'title': 'Noise Reduction',
-            'original_language_title': 'Noise Reduction'
-        }, None),
-        Case('empty subdict is ignored', {
-            'title': 'Noise Reduction',
-            'original_language_title': {},
-        }, None),
-        Case('unknown subdict keys are ignored', {
-            'title': 'Noise Reduction',
-            'original_language_title': {'broken': 'kv'},
-        }, None),
-        Case('original string', {
-            'title': 'Noise Reduction',
-            'original_language_title': 'Подавление шума',
-        }, 'Подавление шума'),
-        Case('language tag is ignored, since its broken', {
-            'title': 'Noise Reduction',
-            'original_language_title': {
-                'language': 'ja',
-                '__content__': 'Noise Reduction'
+        Case("defaults to None", {}, None),
+        Case("ignore unknown keys", {"broken": "kv"}, None),
+        Case("just a title", {"title": "Noise Reduction"}, None),
+        Case(
+            "same title should be ignored",
+            {"title": "Noise Reduction", "original_language_title": "Noise Reduction"},
+            None,
+        ),
+        Case(
+            "empty subdict is ignored",
+            {"title": "Noise Reduction", "original_language_title": {},},
+            None,
+        ),
+        Case(
+            "unknown subdict keys are ignored",
+            {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},},
+            None,
+        ),
+        Case(
+            "original string",
+            {"title": "Noise Reduction", "original_language_title": "Подавление шума",},
+            "Подавление шума",
+        ),
+        Case(
+            "language tag is ignored, since its broken",
+            {
+                "title": "Noise Reduction",
+                "original_language_title": {
+                    "language": "ja",
+                    "__content__": "Noise Reduction",
+                },
             },
-        }, None),
-        Case('do not care about language', {
-            'title': 'Noise Reduction',
-            'original_language_title': {
-                'language': 'ja',
-                '__content__': 'Rauschunterdrückung',
+            None,
+        ),
+        Case(
+            "do not care about language",
+            {
+                "title": "Noise Reduction",
+                "original_language_title": {
+                    "language": "ja",
+                    "__content__": "Rauschunterdrückung",
+                },
             },
-        }, 'Rauschunterdrückung'),
-        Case('ignore excessive questionmarks', {
-            'title': 'Noise Reduction',
-            'original_language_title': {
-                'language': 'ja',
-                '__content__': '???? However',
+            "Rauschunterdrückung",
+        ),
+        Case(
+            "ignore excessive questionmarks",
+            {
+                "title": "Noise Reduction",
+                "original_language_title": {
+                    "language": "ja",
+                    "__content__": "???? However",
+                },
             },
-        }, None),
+            None,
+        ),
     ]
 
     for case in cases:
         result = find_original_language_title(case.input)
         assert result == case.result
 
+
 def test_parse_datacite_titles():
     """
     Given a list of titles, find title, original_language_title and subtitle.
     Result is a 3-tuple of title, original_language_title, subtitle.
     """
-    Case = collections.namedtuple('Case', 'about input result')
+    Case = collections.namedtuple("Case", "about input result")
     cases = [
-        Case('handle None', None, (None, None, None)),
-        Case('empty list', [], (None, None, None)),
-        Case('empty item', [{}], (None, None, None)),
-        Case('broken keys', [{'broken': 'kv'}], (None, None, None)),
-        Case('title only', [{'title': 'Total carbon dioxide'}],
-             ('Total carbon dioxide', None, None),
-        ),
-        Case('title and subtitle', [
-            {'title': 'Total carbon dioxide'},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', None, 'Station TT043_7-9'),
-        ),
-        Case('title, subtitle order does not matter', [
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-            {'title': 'Total carbon dioxide'},
-        ],
-             ('Total carbon dioxide', None, 'Station TT043_7-9'),
-        ),
-        Case('multiple titles, first wins', [
-            {'title': 'Total carbon dioxide'},
-            {'title': 'Meeting Heterogeneity'},
-        ],
-             ('Total carbon dioxide', None, None),
-        ),
-        Case('multiple titles, plus sub', [
-            {'title': 'Total carbon dioxide'},
-            {'title': 'Meeting Heterogeneity'},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', None, 'Station TT043_7-9'),
-        ),
-        Case('multiple titles, multiple subs', [
-            {'title': 'Total carbon dioxide'},
-            {'title': 'Meeting Heterogeneity'},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-            {'title': 'Some other subtitle', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', None, 'Station TT043_7-9'),
-        ),
-        Case('title, original, sub', [
-            {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
-        ),
-        Case('title, original same as title, sub', [
-            {'title': 'Total carbon dioxide', 'original_language_title': {
-                '__content__': 'Total carbon dioxide',
-            }},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', None, 'Station TT043_7-9'),
-        ),
-        Case('title, original dict, sub', [
-            {'title': 'Total carbon dioxide', 'original_language_title': {
-                '__content__': 'Всего углекислого газа',
-            }},
-            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
-        ],
-             ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
+        Case("handle None", None, (None, None, None)),
+        Case("empty list", [], (None, None, None)),
+        Case("empty item", [{}], (None, None, None)),
+        Case("broken keys", [{"broken": "kv"}], (None, None, None)),
+        Case(
+            "title only",
+            [{"title": "Total carbon dioxide"}],
+            ("Total carbon dioxide", None, None),
+        ),
+        Case(
+            "title and subtitle",
+            [
+                {"title": "Total carbon dioxide"},
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", None, "Station TT043_7-9"),
+        ),
+        Case(
+            "title, subtitle order does not matter",
+            [
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+                {"title": "Total carbon dioxide"},
+            ],
+            ("Total carbon dioxide", None, "Station TT043_7-9"),
+        ),
+        Case(
+            "multiple titles, first wins",
+            [{"title": "Total carbon dioxide"}, {"title": "Meeting Heterogeneity"},],
+            ("Total carbon dioxide", None, None),
+        ),
+        Case(
+            "multiple titles, plus sub",
+            [
+                {"title": "Total carbon dioxide"},
+                {"title": "Meeting Heterogeneity"},
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", None, "Station TT043_7-9"),
+        ),
+        Case(
+            "multiple titles, multiple subs",
+            [
+                {"title": "Total carbon dioxide"},
+                {"title": "Meeting Heterogeneity"},
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+                {"title": "Some other subtitle", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", None, "Station TT043_7-9"),
+        ),
+        Case(
+            "title, original, sub",
+            [
+                {
+                    "title": "Total carbon dioxide",
+                    "original_language_title": "Всего углекислого газа",
+                },
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
+        ),
+        Case(
+            "title, original same as title, sub",
+            [
+                {
+                    "title": "Total carbon dioxide",
+                    "original_language_title": {"__content__": "Total carbon dioxide",},
+                },
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", None, "Station TT043_7-9"),
+        ),
+        Case(
+            "title, original dict, sub",
+            [
+                {
+                    "title": "Total carbon dioxide",
+                    "original_language_title": {
+                        "__content__": "Всего углекислого газа",
+                    },
+                },
+                {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+            ],
+            ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
         ),
     ]
 
@@ -166,91 +227,128 @@ def test_parse_datacite_titles():
         result = parse_datacite_titles(case.input)
         assert result == case.result, case.about
 
+
 def test_parse_datacite_dates():
     """
     Test datacite date parsing.
     """
-    Case = collections.namedtuple('Case', 'about input result')
+    Case = collections.namedtuple("Case", "about input result")
     cases = [
-        Case('None is None', None, (None, None, None)),
-        Case('empty list is None', [], (None, None, None)),
-        Case('empty item is None', [{}], (None, None, None)),
-        Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
-        Case('int year', [{'date': 2019}], (None, None, 2019)),
-        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
-        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
-        Case('first with type', [
-            {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
-        ], (None, None, 2019)),
-        Case('full date', [
-            {'date': '2019-12-01', 'dateType': 'Valid'},
-        ], (datetime.date(2019, 12, 1), 12, 2019)),
-        Case('date type prio', [
-            {'date': '2000-12-01', 'dateType': 'Valid'},
-            {'date': '2010-01-01', 'dateType': 'Updated'},
-        ], (datetime.date(2000, 12, 1), 12, 2000)),
-        Case('date type prio, Available > Updated', [
-            {'date': '2010-01-01', 'dateType': 'Updated'},
-            {'date': '2000-12-01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 12, 2000)),
-        Case('allow different date formats, Available > Updated', [
-            {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
-            {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 12, 2000)),
-        Case('allow different date formats, Available > Updated', [
-            {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
-            {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 12, 2000)),
-        Case('allow fuzzy date formats, Available > Updated', [
-            {'date': '2010', 'dateType': 'Updated'},
-            {'date': '2000 Dec 01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 12, 2000)),
-        Case('fuzzy year only', [
-            {'date': 'Year 2010', 'dateType': 'Issued'},
-        ], (None, None, 2010)),
-        Case('fuzzy year and month', [
-            {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
-        ], (None, 2, 2010)),
-        Case('fuzzy year, month, day', [
-            {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
-        ], (datetime.date(2010, 2, 24), 2, 2010)),
-        Case('ignore broken date', [
-            {'date': 'Febrrr 45', 'dateType': 'Updated'},
-        ], (None, None, None)),
+        Case("None is None", None, (None, None, None)),
+        Case("empty list is None", [], (None, None, None)),
+        Case("empty item is None", [{}], (None, None, None)),
+        Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)),
+        Case("int year", [{"date": 2019}], (None, None, 2019)),
+        Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)),
+        Case(
+            "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020)
+        ),
+        Case(
+            "first with type",
+            [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}],
+            (None, None, 2019),
+        ),
+        Case(
+            "full date",
+            [{"date": "2019-12-01", "dateType": "Valid"},],
+            (datetime.date(2019, 12, 1), 12, 2019),
+        ),
+        Case(
+            "date type prio",
+            [
+                {"date": "2000-12-01", "dateType": "Valid"},
+                {"date": "2010-01-01", "dateType": "Updated"},
+            ],
+            (datetime.date(2000, 12, 1), 12, 2000),
+        ),
+        Case(
+            "date type prio, Available > Updated",
+            [
+                {"date": "2010-01-01", "dateType": "Updated"},
+                {"date": "2000-12-01", "dateType": "Available"},
+            ],
+            (datetime.date(2000, 12, 1), 12, 2000),
+        ),
+        Case(
+            "allow different date formats, Available > Updated",
+            [
+                {"date": "2010-01-01T10:00:00", "dateType": "Updated"},
+                {"date": "2000-12-01T10:00:00", "dateType": "Available"},
+            ],
+            (datetime.date(2000, 12, 1), 12, 2000),
+        ),
+        Case(
+            "allow different date formats, Available > Updated",
+            [
+                {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"},
+                {"date": "2000-12-01T10:00:00Z", "dateType": "Available"},
+            ],
+            (datetime.date(2000, 12, 1), 12, 2000),
+        ),
+        Case(
+            "allow fuzzy date formats, Available > Updated",
+            [
+                {"date": "2010", "dateType": "Updated"},
+                {"date": "2000 Dec 01", "dateType": "Available"},
+            ],
+            (datetime.date(2000, 12, 1), 12, 2000),
+        ),
+        Case(
+            "fuzzy year only",
+            [{"date": "Year 2010", "dateType": "Issued"},],
+            (None, None, 2010),
+        ),
+        Case(
+            "fuzzy year and month",
+            [{"date": "Year 2010 Feb", "dateType": "Issued"},],
+            (None, 2, 2010),
+        ),
+        Case(
+            "fuzzy year, month, day",
+            [{"date": "Year 2010 Feb 24", "dateType": "Issued"},],
+            (datetime.date(2010, 2, 24), 2, 2010),
+        ),
+        Case(
+            "ignore broken date",
+            [{"date": "Febrrr 45", "dateType": "Updated"},],
+            (None, None, None),
+        ),
     ]
     for case in cases:
         result = parse_datacite_dates(case.input)
         assert result == case.result, case.about
 
+
 def test_datacite_importer(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
-    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+    with open("tests/files/datacite_sample.jsonl", "r") as f:
         datacite_importer.bezerk_mode = True
         counts = JsonLinePusher(datacite_importer, f).run()
-    assert counts['insert'] == 1
-    assert counts['exists'] == 0
-    assert counts['skip'] == 0
+    assert counts["insert"] == 1
+    assert counts["exists"] == 0
+    assert counts["skip"] == 0
 
     # fetch most recent editgroup
-    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+    change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
     eg = change.editgroup
     assert eg.description
     assert "datacite" in eg.description.lower()
-    assert eg.extra['git_rev']
-    assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+    assert eg.extra["git_rev"]
+    assert "fatcat_tools.DataciteImporter" in eg.extra["agent"]
 
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
-    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+    with open("tests/files/datacite_sample.jsonl", "r") as f:
         datacite_importer.bezerk_mode = False
         datacite_importer.reset()
         counts = JsonLinePusher(datacite_importer, f).run()
-    assert counts['insert'] == 0
-    assert counts['exists'] == 1
-    assert counts['skip'] == 0
+    assert counts["insert"] == 0
+    assert counts["exists"] == 1
+    assert counts["skip"] == 0
     assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
 
+
 def test_datacite_dict_parse(datacite_importer):
-    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+    with open("tests/files/datacite_sample.jsonl", "r") as f:
         raw = json.load(f)
         r = datacite_importer.parse_record(raw)
         # ensure the API server is ok with format
@@ -258,7 +356,9 @@ def test_datacite_dict_parse(datacite_importer):
 
         print(r.extra)
         assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
-        assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+        assert (
+            r.publisher == "International Centre for Agricultural Research in Dry Areas"
+        )
         assert r.release_type == "article"
         assert r.release_stage == "published"
         assert r.license_slug == None
@@ -269,13 +369,15 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.subtitle == None
         assert r.release_date == None
         assert r.release_year == 1986
-        assert 'subtitle' not in r.extra
-        assert 'subtitle' not in r.extra['datacite']
-        assert 'funder' not in r.extra
-        assert 'funder' not in r.extra['datacite']
+        assert "subtitle" not in r.extra
+        assert "subtitle" not in r.extra["datacite"]
+        assert "funder" not in r.extra
+        assert "funder" not in r.extra["datacite"]
         # matched by ISSN, so shouldn't be in there
-        #assert extra['container_name'] == "International Journal of Quantum Chemistry"
-        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+        # assert extra['container_name'] == "International Journal of Quantum Chemistry"
+        assert r.extra["datacite"]["subjects"] == [
+            {"subject": "Plant Genetic Resource for Food and Agriculture"}
+        ]
         assert len(r.abstracts) == 1
         assert len(r.abstracts[0].content) == 421
         assert len(r.contribs) == 2
@@ -284,34 +386,41 @@ def test_datacite_dict_parse(datacite_importer):
         assert r.contribs[0].surname == None
         assert len(r.refs) == 0
 
+
 def test_datacite_conversions(datacite_importer):
     """
     Datacite JSON to release entity JSON representation. The count is hardcoded
     for now.
     """
     datacite_importer.debug = True
-    for i in range(33):
-        src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
-        dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
-        with open(src, 'r') as f:
+    for i in range(35):
+        src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
+        dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
+        with open(src, "r") as f:
             re = datacite_importer.parse_record(json.load(f))
             result = entity_to_dict(re)
-        with open(dst, 'r') as f:
+        with open(dst, "r") as f:
             expected = json.loads(f.read())
 
-        assert result == expected, 'output mismatch in {}'.format(dst)
+        assert result == expected, "output mismatch in {}".format(dst)
+
 
 def test_index_form_to_display_name():
-    Case = collections.namedtuple('Case', 'input output')
+    Case = collections.namedtuple("Case", "input output")
     cases = [
-        Case('', ''),
-        Case('ABC', 'ABC'),
-        Case('International Space Station', 'International Space Station'),
-        Case('Jin, Shan', 'Shan Jin'),
-        Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'),
-        Case('Solomon, P. M.', 'P. M. Solomon'),
-        Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'),
-        Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'),
+        Case("", ""),
+        Case("ABC", "ABC"),
+        Case("International Space Station", "International Space Station"),
+        Case("Jin, Shan", "Shan Jin"),
+        Case(
+            "Volkshochschule Der Bundesstadt Bonn",
+            "Volkshochschule Der Bundesstadt Bonn",
+        ),
+        Case("Solomon, P. M.", "P. M. Solomon"),
+        Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"),
+        Case(
+            "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler"
+        ),
     ]
 
     for c in cases:
@@ -319,45 +428,69 @@ def test_index_form_to_display_name():
 
 
 def test_lookup_license_slug():
-    Case = collections.namedtuple('Case', 'input output')
+    Case = collections.namedtuple("Case", "input output")
     cases = [
-        Case('https://opensource.org/licenses/MIT', 'MIT'),
-        Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'),
-        Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'),
-        Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'),
-        Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'),
-        Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'),
-        Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'),
-        Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'),
-        Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'),
-        Case('http://www.springer.com/tdm', 'SPRINGER-TDM'),
-        Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'),
-        Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'),
-        Case('https://creativecommons.org/public-domain/cc0', 'CC-0'),
-        Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'),
-        Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'),
-        Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'),
-        Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'),
-        Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'),
-        Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'),
-        Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'),
-        Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'),
-        Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
-        Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
-        Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
-        Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'),
-        Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
-        Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'),
-        Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'),
-        Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'),
-        Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'),
-        Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'),
-        Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'),
-        Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'),
-        Case('http://spdx.org/licenses/MIT.json', 'MIT'),
-        Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'),
+        Case("https://opensource.org/licenses/MIT", "MIT"),
+        Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"),
+        Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"),
+        Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"),
+        Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"),
+        Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"),
+        Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"),
+        Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"),
+        Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"),
+        Case("http://www.springer.com/tdm", "SPRINGER-TDM"),
+        Case(
+            "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml",
+            "ADS-UK",
+        ),
+        Case(
+            "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK"
+        ),
+        Case("https://creativecommons.org/public-domain/cc0", "CC-0"),
+        Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"),
+        Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"),
+        Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"),
+        Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"),
+        Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"),
+        Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"),
+        Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"),
+        Case(
+            "http://journals.sagepub.com/page/policies/text-and-data-mining-license",
+            "SAGE-TDM",
+        ),
+        Case(
+            "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+            "CC-PUBLICDOMAIN",
+        ),
+        Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+        Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+        Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"),
+        Case(
+            "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+            "CC-PUBLICDOMAIN",
+        ),
+        Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"),
+        Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"),
+        Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"),
+        Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"),
+        Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"),
+        Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"),
+        Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"),
+        Case("http://spdx.org/licenses/MIT.json", "MIT"),
+        Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"),
     ]
 
     for c in cases:
         got = lookup_license_slug(c.input)
-        assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output)
+        assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output)
+
+
+def test_contributor_list_contains_contributor():
+    Case = collections.namedtuple("Case", "contrib_list contrib want")
+    cases = [
+        Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False),
+    ]
+    for c in cases:
+        got = contributor_list_contains_contributor(c.contrib_list, c.contrib)
+        assert got == c.want
author	bnewbold <bnewbold@archive.org>	2020-07-11 00:31:47 +0000
committer	bnewbold <bnewbold@archive.org>	2020-07-11 00:31:47 +0000
commit	f5aefab6a6431ab9db99761457fd47b36b920b8c (patch)
tree	d144988d310aeecf8521cfc33aca9f0667dfedbc
parent	26b455ffad566bef58684a78654a2719c409588a (diff)
parent	3c266e07771271241aa8cff3e3199a45109362af (diff)
download	fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip