diff options
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 66 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_doc_33.json | 62 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_doc_34.json | 61 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_05.json | 6 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_08.json | 7 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_09.json | 3 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_26.json | 3 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_27.json | 3 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_28.json | 3 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_29.json | 3 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_33.json | 31 | ||||
| -rw-r--r-- | python/tests/files/datacite/datacite_result_34.json | 31 | ||||
| -rw-r--r-- | python/tests/import_datacite.py | 591 | 
13 files changed, 619 insertions, 251 deletions
| diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 785107ee..ebb29feb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter):          creators = attributes.get('creators', []) or []          contributors = attributes.get('contributors', []) or []  # Much fewer than creators. -        contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) +        contribs = self.parse_datacite_creators(creators, doi=doi) + +        # Beside creators, we have contributors in datacite. Sample: +        # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, +        # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, +        # ProjectMember, RelatedPerson, ResearchGroup, Researcher, +        # RightsHolder, Sponsor, Supervisor +        # +        # Datacite schema: +        # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 +        # -- could be used as a form of controlled vocab? +        # +        # Currently (07/2020) in release_contrib: +        # +        # select count(*), role from release_contrib group by role; +        #    count   |    role +        # -----------+------------ +        #  500269665 | author +        #    4386563 | editor +        #      17871 | translator +        #   10870584 | +        # (4 rows) +	# +        # Related: https://guide.fatcat.wiki/entity_release.html -- role +        # (string, of a set): the type of contribution, from a controlled +        # vocabulary. TODO: vocabulary needs review. +        contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + +        # Unfortunately, creators and contributors might overlap, refs GH59. +        for cc in contribs_extra_contributors: +            if contributor_list_contains_contributor(contribs, cc): +                continue +            contribs.append(cc)          # Title, may come with "attributes.titles[].titleType", like          # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter):          # Names, that should be ignored right away.          name_blacklist = set(('Occdownload Gbif.Org',)) -        for i, c in enumerate(creators): +        i = 0 +        for c in creators:              if not set_index: -                i = None +               i = None              nameType = c.get('nameType', '') or ''              if nameType in ('', 'Personal'):                  creator_id = None @@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter):                  if contributorType:                      extra = {'type': contributorType} -                contribs.append( -                    fatcat_openapi_client.ReleaseContrib( +                rc = fatcat_openapi_client.ReleaseContrib(                          creator_id=creator_id,                          index=i,                          raw_name=name, @@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter):                          role=role,                          raw_affiliation=raw_affiliation,                          extra=extra, -                    )) +                    ) +                # Filter out duplicates early. +                if not contributor_list_contains_contributor(contribs, rc): +                    contribs.append(rc) +                    if i is not None: +                        i += 1              elif nameType == 'Organizational':                  name = c.get('name', '') or ''                  if name in UNKNOWN_MARKERS: @@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter):                  extra = {'organization': name}                  contribs.append(fatcat_openapi_client.ReleaseContrib(                      index=i, extra=extra)) +                if i is not None: +                    i += 1              else:                  print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)          return contribs +def contributor_list_contains_contributor(contributor_list, contributor): +    """ +    Given a list of contributors, determine, whether contrib is in that list. +    """ +    for cc in contributor_list: +        if cc.raw_name != contributor.raw_name: +            continue +        cc_role = cc.role or 'author' +        contributor_role = contributor.role or 'author' +        if cc_role != contributor_role: +            continue +        return True +    return False + +  def lookup_license_slug(raw):      """      Resolve a variety of strings into a some pseudo-canonical form, e.g. diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json new file mode 100644 index 00000000..571d1220 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_33.json @@ -0,0 +1,62 @@ +{ +  "id": "10.17912/micropub.biology.000143", +  "type": "dois", +  "attributes": { +    "doi": "10.17912/micropub.biology.000143", +    "identifiers": null, +    "creators": [ +      { +        "name": "ABC News", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "author" +      } +    ], +    "titles": [ +      { +        "title": "Sample" +      } +    ], +    "publisher": "microPublication Biology", +    "publicationYear": 2019, +    "types": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "relatedIdentifiers": [], +    "sizes": [], +    "formats": [], +    "version": null, +    "rightsList": [], +    "descriptions": [ +      { +        "description": 1234567890, +        "descriptionType": "Abstract" +      } +    ], +    "geoLocations": [], +    "fundingReferences": [], +    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", +    "created": "2019-08-19T14:43:08.000Z", +    "registered": "2019-08-19T14:43:09.000Z", +    "published": "2019", +    "updated": "2019-11-09T12:32:02.000Z", +    "contributors": [ +      { +        "name": "ABC News", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "" +      } +    ] +  }, +  "relationships": { +    "client": { +      "data": { +        "id": "caltech.micropub", +        "type": "clients" +      } +    } +  } +} diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json new file mode 100644 index 00000000..5dcf65f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_34.json @@ -0,0 +1,61 @@ +{ +  "id": "10.17912/micropub.biology.000143", +  "type": "dois", +  "attributes": { +    "doi": "10.17912/micropub.biology.000143", +    "identifiers": null, +    "creators": [ +      { +        "name": "Paul Katz", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "author" +      } +    ], +    "titles": [ +      { +        "title": "Sample" +      } +    ], +    "publisher": "microPublication Biology", +    "publicationYear": 2019, +    "types": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "relatedIdentifiers": [], +    "sizes": [], +    "formats": [], +    "version": null, +    "rightsList": [], +    "descriptions": [ +      { +        "description": 1234567890, +        "descriptionType": "Abstract" +      } +    ], +    "geoLocations": [], +    "fundingReferences": [], +    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", +    "created": "2019-08-19T14:43:08.000Z", +    "registered": "2019-08-19T14:43:09.000Z", +    "published": "2019", +    "updated": "2019-11-09T12:32:02.000Z", +    "contributors": [ +      { +        "name": "Paul Katz", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "illustrator" +      } ] +  }, +  "relationships": { +    "client": { +      "data": { +        "id": "caltech.micropub", +        "type": "clients" +      } +    } +  } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 79c2a8fb..c91f3a7f 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -505,10 +505,8 @@        "surname": "Wurzbacher"      },      { -      "raw_name": "Kessy Abarenkov" -    }, -    { -      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" +      "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 70237280..5a46ef50 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -13,13 +13,6 @@        "raw_name": "Kei Kajisa",        "role": "author",        "surname": "Kajisa" -    }, -    { -      "given_name": "Kei", -      "index": 1, -      "raw_name": "Kei Kajisa", -      "role": "author", -      "surname": "Kajisa"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 09e02fc7..f6ec524a 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -17,7 +17,8 @@        "extra": {          "type": "DataManager"        }, -      "raw_name": "Technische Informationsbibliothek (TIB)" +      "raw_name": "Technische Informationsbibliothek (TIB)", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json index 267eb9c2..f6e589ef 100644 --- a/python/tests/files/datacite/datacite_result_26.json +++ b/python/tests/files/datacite/datacite_result_26.json @@ -13,7 +13,8 @@        },        "given_name": "David",        "raw_name": "David Wemmer", -      "surname": "Wemmer" +      "surname": "Wemmer", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json index 3d033e6a..e934fb41 100644 --- a/python/tests/files/datacite/datacite_result_27.json +++ b/python/tests/files/datacite/datacite_result_27.json @@ -13,7 +13,8 @@        },        "given_name": "David",        "raw_name": "David Wemmer", -      "surname": "Wemmer" +      "surname": "Wemmer", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_28.json +++ b/python/tests/files/datacite/datacite_result_28.json @@ -13,7 +13,8 @@        },        "given_name": "David",        "raw_name": "David Wemmer", -      "surname": "Wemmer" +      "surname": "Wemmer", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_29.json +++ b/python/tests/files/datacite/datacite_result_29.json @@ -13,7 +13,8 @@        },        "given_name": "David",        "raw_name": "David Wemmer", -      "surname": "Wemmer" +      "surname": "Wemmer", +      "role": "author"      }    ],    "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json new file mode 100644 index 00000000..bcb72469 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_33.json @@ -0,0 +1,31 @@ +{ +  "abstracts": [ +    { +      "content": "1234567890", +      "mimetype": "text/plain" +    } +  ], +  "contribs": [ +    { +      "given_name": "", +      "surname": "", +      "index": 0, +      "raw_name": "ABC News", +      "role": "author" +    } +  ], +  "ext_ids": { +    "doi": "10.17912/micropub.biology.000143" +  }, +  "extra": { +    "datacite": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "container_name": "microPublication Biology" +  }, +  "refs": [], +  "release_stage": "published", +  "release_year": 2019, +  "publisher": "microPublication Biology", +  "title": "Sample" +} diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json new file mode 100644 index 00000000..4a52e22c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_34.json @@ -0,0 +1,31 @@ +{ +  "abstracts": [ +    { +      "content": "1234567890", +      "mimetype": "text/plain" +    } +  ], +  "contribs": [ +    { +      "given_name": "", +      "surname": "", +      "index": 0, +      "raw_name": "Paul Katz", +      "role": "author" +    } +  ], +  "ext_ids": { +    "doi": "10.17912/micropub.biology.000143" +  }, +  "extra": { +    "datacite": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "container_name": "microPublication Biology" +  }, +  "refs": [], +  "release_stage": "published", +  "release_year": 2019, +  "publisher": "microPublication Biology", +  "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 8fb2d079..b94b6bc5 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -10,32 +10,54 @@ import collections  import pytest  from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, index_form_to_display_name, lookup_license_slug +from fatcat_tools.importers.datacite import ( +    find_original_language_title, +    parse_datacite_titles, +    parse_datacite_dates, +    clean_doi, +    index_form_to_display_name, +    lookup_license_slug, +    contributor_list_contains_contributor, +)  from fatcat_tools.transforms import entity_to_dict -from fixtures import * +import fatcat_openapi_client +from fixtures import api +import json  @pytest.fixture(scope="function")  def datacite_importer(api): -    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: -        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', -                               bezerk_mode=True) +    with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: +        yield DataciteImporter( +            api, +            issn_file, +            extid_map_file="tests/files/example_map.sqlite3", +            bezerk_mode=True, +        ) +  @pytest.fixture(scope="function")  def datacite_importer_existing(api): -    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: -        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', -                               bezerk_mode=False) +    with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: +        yield DataciteImporter( +            api, +            issn_file, +            extid_map_file="tests/files/example_map.sqlite3", +            bezerk_mode=False, +        ) +  @pytest.mark.skip(reason="larger datacite import slows tests down")  def test_datacite_importer_huge(datacite_importer):      last_index = datacite_importer.api.get_changelog(limit=1)[0].index -    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: +    with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f:          datacite_importer.bezerk_mode = True          counts = JsonLinePusher(datacite_importer, f).run() -    assert counts['insert'] == 998 -    change = datacite_importer.api.get_changelog_entry(index=last_index+1) -    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) +    assert counts["insert"] == 998 +    change = datacite_importer.api.get_changelog_entry(index=last_index + 1) +    release = datacite_importer.api.get_release( +        change.editgroup.edits.releases[0].ident +    )      assert len(release.contribs) == 3 @@ -43,122 +65,161 @@ def test_find_original_language_title():      """      Original language might be included, in various ways.      """ -    Case = collections.namedtuple('Case', 'about input result') +    Case = collections.namedtuple("Case", "about input result")      cases = [ -        Case('defaults to None', {}, None), -        Case('ignore unknown keys', {'broken': 'kv'}, None), -        Case('just a title', {'title': 'Noise Reduction'}, None), -        Case('same title should be ignored', { -            'title': 'Noise Reduction', -            'original_language_title': 'Noise Reduction' -        }, None), -        Case('empty subdict is ignored', { -            'title': 'Noise Reduction', -            'original_language_title': {}, -        }, None), -        Case('unknown subdict keys are ignored', { -            'title': 'Noise Reduction', -            'original_language_title': {'broken': 'kv'}, -        }, None), -        Case('original string', { -            'title': 'Noise Reduction', -            'original_language_title': 'Подавление шума', -        }, 'Подавление шума'), -        Case('language tag is ignored, since its broken', { -            'title': 'Noise Reduction', -            'original_language_title': { -                'language': 'ja', -                '__content__': 'Noise Reduction' +        Case("defaults to None", {}, None), +        Case("ignore unknown keys", {"broken": "kv"}, None), +        Case("just a title", {"title": "Noise Reduction"}, None), +        Case( +            "same title should be ignored", +            {"title": "Noise Reduction", "original_language_title": "Noise Reduction"}, +            None, +        ), +        Case( +            "empty subdict is ignored", +            {"title": "Noise Reduction", "original_language_title": {},}, +            None, +        ), +        Case( +            "unknown subdict keys are ignored", +            {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},}, +            None, +        ), +        Case( +            "original string", +            {"title": "Noise Reduction", "original_language_title": "Подавление шума",}, +            "Подавление шума", +        ), +        Case( +            "language tag is ignored, since its broken", +            { +                "title": "Noise Reduction", +                "original_language_title": { +                    "language": "ja", +                    "__content__": "Noise Reduction", +                },              }, -        }, None), -        Case('do not care about language', { -            'title': 'Noise Reduction', -            'original_language_title': { -                'language': 'ja', -                '__content__': 'Rauschunterdrückung', +            None, +        ), +        Case( +            "do not care about language", +            { +                "title": "Noise Reduction", +                "original_language_title": { +                    "language": "ja", +                    "__content__": "Rauschunterdrückung", +                },              }, -        }, 'Rauschunterdrückung'), -        Case('ignore excessive questionmarks', { -            'title': 'Noise Reduction', -            'original_language_title': { -                'language': 'ja', -                '__content__': '???? However', +            "Rauschunterdrückung", +        ), +        Case( +            "ignore excessive questionmarks", +            { +                "title": "Noise Reduction", +                "original_language_title": { +                    "language": "ja", +                    "__content__": "???? However", +                },              }, -        }, None), +            None, +        ),      ]      for case in cases:          result = find_original_language_title(case.input)          assert result == case.result +  def test_parse_datacite_titles():      """      Given a list of titles, find title, original_language_title and subtitle.      Result is a 3-tuple of title, original_language_title, subtitle.      """ -    Case = collections.namedtuple('Case', 'about input result') +    Case = collections.namedtuple("Case", "about input result")      cases = [ -        Case('handle None', None, (None, None, None)), -        Case('empty list', [], (None, None, None)), -        Case('empty item', [{}], (None, None, None)), -        Case('broken keys', [{'broken': 'kv'}], (None, None, None)), -        Case('title only', [{'title': 'Total carbon dioxide'}], -             ('Total carbon dioxide', None, None), -        ), -        Case('title and subtitle', [ -            {'title': 'Total carbon dioxide'}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', None, 'Station TT043_7-9'), -        ), -        Case('title, subtitle order does not matter', [ -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -            {'title': 'Total carbon dioxide'}, -        ], -             ('Total carbon dioxide', None, 'Station TT043_7-9'), -        ), -        Case('multiple titles, first wins', [ -            {'title': 'Total carbon dioxide'}, -            {'title': 'Meeting Heterogeneity'}, -        ], -             ('Total carbon dioxide', None, None), -        ), -        Case('multiple titles, plus sub', [ -            {'title': 'Total carbon dioxide'}, -            {'title': 'Meeting Heterogeneity'}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', None, 'Station TT043_7-9'), -        ), -        Case('multiple titles, multiple subs', [ -            {'title': 'Total carbon dioxide'}, -            {'title': 'Meeting Heterogeneity'}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -            {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', None, 'Station TT043_7-9'), -        ), -        Case('title, original, sub', [ -            {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), -        ), -        Case('title, original same as title, sub', [ -            {'title': 'Total carbon dioxide', 'original_language_title': { -                '__content__': 'Total carbon dioxide', -            }}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', None, 'Station TT043_7-9'), -        ), -        Case('title, original dict, sub', [ -            {'title': 'Total carbon dioxide', 'original_language_title': { -                '__content__': 'Всего углекислого газа', -            }}, -            {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, -        ], -             ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), +        Case("handle None", None, (None, None, None)), +        Case("empty list", [], (None, None, None)), +        Case("empty item", [{}], (None, None, None)), +        Case("broken keys", [{"broken": "kv"}], (None, None, None)), +        Case( +            "title only", +            [{"title": "Total carbon dioxide"}], +            ("Total carbon dioxide", None, None), +        ), +        Case( +            "title and subtitle", +            [ +                {"title": "Total carbon dioxide"}, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", None, "Station TT043_7-9"), +        ), +        Case( +            "title, subtitle order does not matter", +            [ +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +                {"title": "Total carbon dioxide"}, +            ], +            ("Total carbon dioxide", None, "Station TT043_7-9"), +        ), +        Case( +            "multiple titles, first wins", +            [{"title": "Total carbon dioxide"}, {"title": "Meeting Heterogeneity"},], +            ("Total carbon dioxide", None, None), +        ), +        Case( +            "multiple titles, plus sub", +            [ +                {"title": "Total carbon dioxide"}, +                {"title": "Meeting Heterogeneity"}, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", None, "Station TT043_7-9"), +        ), +        Case( +            "multiple titles, multiple subs", +            [ +                {"title": "Total carbon dioxide"}, +                {"title": "Meeting Heterogeneity"}, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +                {"title": "Some other subtitle", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", None, "Station TT043_7-9"), +        ), +        Case( +            "title, original, sub", +            [ +                { +                    "title": "Total carbon dioxide", +                    "original_language_title": "Всего углекислого газа", +                }, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), +        ), +        Case( +            "title, original same as title, sub", +            [ +                { +                    "title": "Total carbon dioxide", +                    "original_language_title": {"__content__": "Total carbon dioxide",}, +                }, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", None, "Station TT043_7-9"), +        ), +        Case( +            "title, original dict, sub", +            [ +                { +                    "title": "Total carbon dioxide", +                    "original_language_title": { +                        "__content__": "Всего углекислого газа", +                    }, +                }, +                {"title": "Station TT043_7-9", "titleType": "Subtitle"}, +            ], +            ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),          ),      ] @@ -166,91 +227,128 @@ def test_parse_datacite_titles():          result = parse_datacite_titles(case.input)          assert result == case.result, case.about +  def test_parse_datacite_dates():      """      Test datacite date parsing.      """ -    Case = collections.namedtuple('Case', 'about input result') +    Case = collections.namedtuple("Case", "about input result")      cases = [ -        Case('None is None', None, (None, None, None)), -        Case('empty list is None', [], (None, None, None)), -        Case('empty item is None', [{}], (None, None, None)), -        Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), -        Case('int year', [{'date': 2019}], (None, None, 2019)), -        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), -        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), -        Case('first with type', [ -            {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} -        ], (None, None, 2019)), -        Case('full date', [ -            {'date': '2019-12-01', 'dateType': 'Valid'}, -        ], (datetime.date(2019, 12, 1), 12, 2019)), -        Case('date type prio', [ -            {'date': '2000-12-01', 'dateType': 'Valid'}, -            {'date': '2010-01-01', 'dateType': 'Updated'}, -        ], (datetime.date(2000, 12, 1), 12, 2000)), -        Case('date type prio, Available > Updated', [ -            {'date': '2010-01-01', 'dateType': 'Updated'}, -            {'date': '2000-12-01', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), 12, 2000)), -        Case('allow different date formats, Available > Updated', [ -            {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, -            {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), 12, 2000)), -        Case('allow different date formats, Available > Updated', [ -            {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, -            {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), 12, 2000)), -        Case('allow fuzzy date formats, Available > Updated', [ -            {'date': '2010', 'dateType': 'Updated'}, -            {'date': '2000 Dec 01', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), 12, 2000)), -        Case('fuzzy year only', [ -            {'date': 'Year 2010', 'dateType': 'Issued'}, -        ], (None, None, 2010)), -        Case('fuzzy year and month', [ -            {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, -        ], (None, 2, 2010)), -        Case('fuzzy year, month, day', [ -            {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, -        ], (datetime.date(2010, 2, 24), 2, 2010)), -        Case('ignore broken date', [ -            {'date': 'Febrrr 45', 'dateType': 'Updated'}, -        ], (None, None, None)), +        Case("None is None", None, (None, None, None)), +        Case("empty list is None", [], (None, None, None)), +        Case("empty item is None", [{}], (None, None, None)), +        Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)), +        Case("int year", [{"date": 2019}], (None, None, 2019)), +        Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)), +        Case( +            "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020) +        ), +        Case( +            "first with type", +            [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}], +            (None, None, 2019), +        ), +        Case( +            "full date", +            [{"date": "2019-12-01", "dateType": "Valid"},], +            (datetime.date(2019, 12, 1), 12, 2019), +        ), +        Case( +            "date type prio", +            [ +                {"date": "2000-12-01", "dateType": "Valid"}, +                {"date": "2010-01-01", "dateType": "Updated"}, +            ], +            (datetime.date(2000, 12, 1), 12, 2000), +        ), +        Case( +            "date type prio, Available > Updated", +            [ +                {"date": "2010-01-01", "dateType": "Updated"}, +                {"date": "2000-12-01", "dateType": "Available"}, +            ], +            (datetime.date(2000, 12, 1), 12, 2000), +        ), +        Case( +            "allow different date formats, Available > Updated", +            [ +                {"date": "2010-01-01T10:00:00", "dateType": "Updated"}, +                {"date": "2000-12-01T10:00:00", "dateType": "Available"}, +            ], +            (datetime.date(2000, 12, 1), 12, 2000), +        ), +        Case( +            "allow different date formats, Available > Updated", +            [ +                {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"}, +                {"date": "2000-12-01T10:00:00Z", "dateType": "Available"}, +            ], +            (datetime.date(2000, 12, 1), 12, 2000), +        ), +        Case( +            "allow fuzzy date formats, Available > Updated", +            [ +                {"date": "2010", "dateType": "Updated"}, +                {"date": "2000 Dec 01", "dateType": "Available"}, +            ], +            (datetime.date(2000, 12, 1), 12, 2000), +        ), +        Case( +            "fuzzy year only", +            [{"date": "Year 2010", "dateType": "Issued"},], +            (None, None, 2010), +        ), +        Case( +            "fuzzy year and month", +            [{"date": "Year 2010 Feb", "dateType": "Issued"},], +            (None, 2, 2010), +        ), +        Case( +            "fuzzy year, month, day", +            [{"date": "Year 2010 Feb 24", "dateType": "Issued"},], +            (datetime.date(2010, 2, 24), 2, 2010), +        ), +        Case( +            "ignore broken date", +            [{"date": "Febrrr 45", "dateType": "Updated"},], +            (None, None, None), +        ),      ]      for case in cases:          result = parse_datacite_dates(case.input)          assert result == case.result, case.about +  def test_datacite_importer(datacite_importer):      last_index = datacite_importer.api.get_changelog(limit=1)[0].index -    with open('tests/files/datacite_sample.jsonl', 'r') as f: +    with open("tests/files/datacite_sample.jsonl", "r") as f:          datacite_importer.bezerk_mode = True          counts = JsonLinePusher(datacite_importer, f).run() -    assert counts['insert'] == 1 -    assert counts['exists'] == 0 -    assert counts['skip'] == 0 +    assert counts["insert"] == 1 +    assert counts["exists"] == 0 +    assert counts["skip"] == 0      # fetch most recent editgroup -    change = datacite_importer.api.get_changelog_entry(index=last_index+1) +    change = datacite_importer.api.get_changelog_entry(index=last_index + 1)      eg = change.editgroup      assert eg.description      assert "datacite" in eg.description.lower() -    assert eg.extra['git_rev'] -    assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] +    assert eg.extra["git_rev"] +    assert "fatcat_tools.DataciteImporter" in eg.extra["agent"]      last_index = datacite_importer.api.get_changelog(limit=1)[0].index -    with open('tests/files/datacite_sample.jsonl', 'r') as f: +    with open("tests/files/datacite_sample.jsonl", "r") as f:          datacite_importer.bezerk_mode = False          datacite_importer.reset()          counts = JsonLinePusher(datacite_importer, f).run() -    assert counts['insert'] == 0 -    assert counts['exists'] == 1 -    assert counts['skip'] == 0 +    assert counts["insert"] == 0 +    assert counts["exists"] == 1 +    assert counts["skip"] == 0      assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index +  def test_datacite_dict_parse(datacite_importer): -    with open('tests/files/datacite_sample.jsonl', 'r') as f: +    with open("tests/files/datacite_sample.jsonl", "r") as f:          raw = json.load(f)          r = datacite_importer.parse_record(raw)          # ensure the API server is ok with format @@ -258,7 +356,9 @@ def test_datacite_dict_parse(datacite_importer):          print(r.extra)          assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" -        assert r.publisher == "International Centre for Agricultural Research in Dry Areas" +        assert ( +            r.publisher == "International Centre for Agricultural Research in Dry Areas" +        )          assert r.release_type == "article"          assert r.release_stage == "published"          assert r.license_slug == None @@ -269,13 +369,15 @@ def test_datacite_dict_parse(datacite_importer):          assert r.subtitle == None          assert r.release_date == None          assert r.release_year == 1986 -        assert 'subtitle' not in r.extra -        assert 'subtitle' not in r.extra['datacite'] -        assert 'funder' not in r.extra -        assert 'funder' not in r.extra['datacite'] +        assert "subtitle" not in r.extra +        assert "subtitle" not in r.extra["datacite"] +        assert "funder" not in r.extra +        assert "funder" not in r.extra["datacite"]          # matched by ISSN, so shouldn't be in there -        #assert extra['container_name'] == "International Journal of Quantum Chemistry" -        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] +        # assert extra['container_name'] == "International Journal of Quantum Chemistry" +        assert r.extra["datacite"]["subjects"] == [ +            {"subject": "Plant Genetic Resource for Food and Agriculture"} +        ]          assert len(r.abstracts) == 1          assert len(r.abstracts[0].content) == 421          assert len(r.contribs) == 2 @@ -284,34 +386,41 @@ def test_datacite_dict_parse(datacite_importer):          assert r.contribs[0].surname == None          assert len(r.refs) == 0 +  def test_datacite_conversions(datacite_importer):      """      Datacite JSON to release entity JSON representation. The count is hardcoded      for now.      """      datacite_importer.debug = True -    for i in range(33): -        src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) -        dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) -        with open(src, 'r') as f: +    for i in range(35): +        src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) +        dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) +        with open(src, "r") as f:              re = datacite_importer.parse_record(json.load(f))              result = entity_to_dict(re) -        with open(dst, 'r') as f: +        with open(dst, "r") as f:              expected = json.loads(f.read()) -        assert result == expected, 'output mismatch in {}'.format(dst) +        assert result == expected, "output mismatch in {}".format(dst) +  def test_index_form_to_display_name(): -    Case = collections.namedtuple('Case', 'input output') +    Case = collections.namedtuple("Case", "input output")      cases = [ -        Case('', ''), -        Case('ABC', 'ABC'), -        Case('International Space Station', 'International Space Station'), -        Case('Jin, Shan', 'Shan Jin'), -        Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), -        Case('Solomon, P. M.', 'P. M. Solomon'), -        Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), -        Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), +        Case("", ""), +        Case("ABC", "ABC"), +        Case("International Space Station", "International Space Station"), +        Case("Jin, Shan", "Shan Jin"), +        Case( +            "Volkshochschule Der Bundesstadt Bonn", +            "Volkshochschule Der Bundesstadt Bonn", +        ), +        Case("Solomon, P. M.", "P. M. Solomon"), +        Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"), +        Case( +            "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler" +        ),      ]      for c in cases: @@ -319,45 +428,69 @@ def test_index_form_to_display_name():  def test_lookup_license_slug(): -    Case = collections.namedtuple('Case', 'input output') +    Case = collections.namedtuple("Case", "input output")      cases = [ -        Case('https://opensource.org/licenses/MIT', 'MIT'), -        Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'), -        Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'), -        Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'), -        Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'), -        Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'), -        Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'), -        Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'), -        Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'), -        Case('http://www.springer.com/tdm', 'SPRINGER-TDM'), -        Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'), -        Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'), -        Case('https://creativecommons.org/public-domain/cc0', 'CC-0'), -        Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'), -        Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'), -        Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'), -        Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'), -        Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'), -        Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'), -        Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'), -        Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'), -        Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), -        Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), -        Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), -        Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'), -        Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), -        Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'), -        Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'), -        Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'), -        Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'), -        Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'), -        Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'), -        Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'), -        Case('http://spdx.org/licenses/MIT.json', 'MIT'), -        Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'), +        Case("https://opensource.org/licenses/MIT", "MIT"), +        Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"), +        Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"), +        Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"), +        Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"), +        Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"), +        Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"), +        Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"), +        Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"), +        Case("http://www.springer.com/tdm", "SPRINGER-TDM"), +        Case( +            "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml", +            "ADS-UK", +        ), +        Case( +            "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK" +        ), +        Case("https://creativecommons.org/public-domain/cc0", "CC-0"), +        Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"), +        Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"), +        Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"), +        Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"), +        Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"), +        Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"), +        Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"), +        Case( +            "http://journals.sagepub.com/page/policies/text-and-data-mining-license", +            "SAGE-TDM", +        ), +        Case( +            "https://creativecommons.org/publicdomain/mark/1.0/deed.de", +            "CC-PUBLICDOMAIN", +        ), +        Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), +        Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), +        Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"), +        Case( +            "https://creativecommons.org/publicdomain/mark/1.0/deed.de", +            "CC-PUBLICDOMAIN", +        ), +        Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"), +        Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"), +        Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"), +        Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"), +        Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"), +        Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"), +        Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"), +        Case("http://spdx.org/licenses/MIT.json", "MIT"), +        Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"),      ]      for c in cases:          got = lookup_license_slug(c.input) -        assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output) +        assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output) + + +def test_contributor_list_contains_contributor(): +    Case = collections.namedtuple("Case", "contrib_list contrib want") +    cases = [ +        Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False), +    ] +    for c in cases: +        got = contributor_list_contains_contributor(c.contrib_list, c.contrib) +        assert got == c.want | 
