aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2020-07-11 00:31:47 +0000
committerbnewbold <bnewbold@archive.org>2020-07-11 00:31:47 +0000
commitf5aefab6a6431ab9db99761457fd47b36b920b8c (patch)
treed144988d310aeecf8521cfc33aca9f0667dfedbc
parent26b455ffad566bef58684a78654a2719c409588a (diff)
parent3c266e07771271241aa8cff3e3199a45109362af (diff)
downloadfatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz
fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip
Merge branch 'martin-datacite-duplicated-author-gh-59' into 'master'
datacite: address duplicated contributor issue See merge request webgroup/fatcat!65
-rw-r--r--python/fatcat_tools/importers/datacite.py66
-rw-r--r--python/tests/files/datacite/datacite_doc_33.json62
-rw-r--r--python/tests/files/datacite/datacite_doc_34.json61
-rw-r--r--python/tests/files/datacite/datacite_result_05.json6
-rw-r--r--python/tests/files/datacite/datacite_result_08.json7
-rw-r--r--python/tests/files/datacite/datacite_result_09.json3
-rw-r--r--python/tests/files/datacite/datacite_result_26.json3
-rw-r--r--python/tests/files/datacite/datacite_result_27.json3
-rw-r--r--python/tests/files/datacite/datacite_result_28.json3
-rw-r--r--python/tests/files/datacite/datacite_result_29.json3
-rw-r--r--python/tests/files/datacite/datacite_result_33.json31
-rw-r--r--python/tests/files/datacite/datacite_result_34.json31
-rw-r--r--python/tests/import_datacite.py591
13 files changed, 619 insertions, 251 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 785107ee..ebb29feb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter):
creators = attributes.get('creators', []) or []
contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
- contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
+ contribs = self.parse_datacite_creators(creators, doi=doi)
+
+    # Besides creators, we have contributors in datacite. Sample:
+ # ContactPerson, DataCollector, DataCurator, DataManager, Distributor,
+ # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader,
+ # ProjectMember, RelatedPerson, ResearchGroup, Researcher,
+ # RightsHolder, Sponsor, Supervisor
+ #
+ # Datacite schema:
+ # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32
+ # -- could be used as a form of controlled vocab?
+ #
+ # Currently (07/2020) in release_contrib:
+ #
+ # select count(*), role from release_contrib group by role;
+ # count | role
+ # -----------+------------
+ # 500269665 | author
+ # 4386563 | editor
+ # 17871 | translator
+ # 10870584 |
+ # (4 rows)
+ #
+ # Related: https://guide.fatcat.wiki/entity_release.html -- role
+ # (string, of a set): the type of contribution, from a controlled
+ # vocabulary. TODO: vocabulary needs review.
+ contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+
+ # Unfortunately, creators and contributors might overlap, refs GH59.
+ for cc in contribs_extra_contributors:
+ if contributor_list_contains_contributor(contribs, cc):
+ continue
+ contribs.append(cc)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter):
# Names, that should be ignored right away.
name_blacklist = set(('Occdownload Gbif.Org',))
- for i, c in enumerate(creators):
+ i = 0
+ for c in creators:
if not set_index:
- i = None
+ i = None
nameType = c.get('nameType', '') or ''
if nameType in ('', 'Personal'):
creator_id = None
@@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter):
if contributorType:
extra = {'type': contributorType}
- contribs.append(
- fatcat_openapi_client.ReleaseContrib(
+ rc = fatcat_openapi_client.ReleaseContrib(
creator_id=creator_id,
index=i,
raw_name=name,
@@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter):
role=role,
raw_affiliation=raw_affiliation,
extra=extra,
- ))
+ )
+ # Filter out duplicates early.
+ if not contributor_list_contains_contributor(contribs, rc):
+ contribs.append(rc)
+ if i is not None:
+ i += 1
elif nameType == 'Organizational':
name = c.get('name', '') or ''
if name in UNKNOWN_MARKERS:
@@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter):
extra = {'organization': name}
contribs.append(fatcat_openapi_client.ReleaseContrib(
index=i, extra=extra))
+ if i is not None:
+ i += 1
else:
print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
return contribs
+def contributor_list_contains_contributor(contributor_list, contributor):
+ """
+    Given a list of contributors, determine whether the given contributor is in that list.
+ """
+ for cc in contributor_list:
+ if cc.raw_name != contributor.raw_name:
+ continue
+ cc_role = cc.role or 'author'
+ contributor_role = contributor.role or 'author'
+ if cc_role != contributor_role:
+ continue
+ return True
+ return False
+
+
def lookup_license_slug(raw):
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json
new file mode 100644
index 00000000..571d1220
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_33.json
@@ -0,0 +1,62 @@
+{
+ "id": "10.17912/micropub.biology.000143",
+ "type": "dois",
+ "attributes": {
+ "doi": "10.17912/micropub.biology.000143",
+ "identifiers": null,
+ "creators": [
+ {
+ "name": "ABC News",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": "author"
+ }
+ ],
+ "titles": [
+ {
+ "title": "Sample"
+ }
+ ],
+ "publisher": "microPublication Biology",
+ "publicationYear": 2019,
+ "types": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "relatedIdentifiers": [],
+ "sizes": [],
+ "formats": [],
+ "version": null,
+ "rightsList": [],
+ "descriptions": [
+ {
+ "description": 1234567890,
+ "descriptionType": "Abstract"
+ }
+ ],
+ "geoLocations": [],
+ "fundingReferences": [],
+ "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+ "created": "2019-08-19T14:43:08.000Z",
+ "registered": "2019-08-19T14:43:09.000Z",
+ "published": "2019",
+ "updated": "2019-11-09T12:32:02.000Z",
+ "contributors": [
+ {
+ "name": "ABC News",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": ""
+ }
+ ]
+ },
+ "relationships": {
+ "client": {
+ "data": {
+ "id": "caltech.micropub",
+ "type": "clients"
+ }
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json
new file mode 100644
index 00000000..5dcf65f4
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_34.json
@@ -0,0 +1,61 @@
+{
+ "id": "10.17912/micropub.biology.000143",
+ "type": "dois",
+ "attributes": {
+ "doi": "10.17912/micropub.biology.000143",
+ "identifiers": null,
+ "creators": [
+ {
+ "name": "Paul Katz",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": "author"
+ }
+ ],
+ "titles": [
+ {
+ "title": "Sample"
+ }
+ ],
+ "publisher": "microPublication Biology",
+ "publicationYear": 2019,
+ "types": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "relatedIdentifiers": [],
+ "sizes": [],
+ "formats": [],
+ "version": null,
+ "rightsList": [],
+ "descriptions": [
+ {
+ "description": 1234567890,
+ "descriptionType": "Abstract"
+ }
+ ],
+ "geoLocations": [],
+ "fundingReferences": [],
+ "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+ "created": "2019-08-19T14:43:08.000Z",
+ "registered": "2019-08-19T14:43:09.000Z",
+ "published": "2019",
+ "updated": "2019-11-09T12:32:02.000Z",
+ "contributors": [
+ {
+ "name": "Paul Katz",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": "illustrator"
+ } ]
+ },
+ "relationships": {
+ "client": {
+ "data": {
+ "id": "caltech.micropub",
+ "type": "clients"
+ }
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 79c2a8fb..c91f3a7f 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -505,10 +505,8 @@
"surname": "Wurzbacher"
},
{
- "raw_name": "Kessy Abarenkov"
- },
- {
- "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
+ "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 70237280..5a46ef50 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -13,13 +13,6 @@
"raw_name": "Kei Kajisa",
"role": "author",
"surname": "Kajisa"
- },
- {
- "given_name": "Kei",
- "index": 1,
- "raw_name": "Kei Kajisa",
- "role": "author",
- "surname": "Kajisa"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index 09e02fc7..f6ec524a 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -17,7 +17,8 @@
"extra": {
"type": "DataManager"
},
- "raw_name": "Technische Informationsbibliothek (TIB)"
+ "raw_name": "Technische Informationsbibliothek (TIB)",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
index 267eb9c2..f6e589ef 100644
--- a/python/tests/files/datacite/datacite_result_26.json
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -13,7 +13,8 @@
},
"given_name": "David",
"raw_name": "David Wemmer",
- "surname": "Wemmer"
+ "surname": "Wemmer",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json
index 3d033e6a..e934fb41 100644
--- a/python/tests/files/datacite/datacite_result_27.json
+++ b/python/tests/files/datacite/datacite_result_27.json
@@ -13,7 +13,8 @@
},
"given_name": "David",
"raw_name": "David Wemmer",
- "surname": "Wemmer"
+ "surname": "Wemmer",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json
index 84bed9c8..bcb1caaf 100644
--- a/python/tests/files/datacite/datacite_result_28.json
+++ b/python/tests/files/datacite/datacite_result_28.json
@@ -13,7 +13,8 @@
},
"given_name": "David",
"raw_name": "David Wemmer",
- "surname": "Wemmer"
+ "surname": "Wemmer",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json
index 84bed9c8..bcb1caaf 100644
--- a/python/tests/files/datacite/datacite_result_29.json
+++ b/python/tests/files/datacite/datacite_result_29.json
@@ -13,7 +13,8 @@
},
"given_name": "David",
"raw_name": "David Wemmer",
- "surname": "Wemmer"
+ "surname": "Wemmer",
+ "role": "author"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json
new file mode 100644
index 00000000..bcb72469
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_33.json
@@ -0,0 +1,31 @@
+{
+ "abstracts": [
+ {
+ "content": "1234567890",
+ "mimetype": "text/plain"
+ }
+ ],
+ "contribs": [
+ {
+ "given_name": "",
+ "surname": "",
+ "index": 0,
+ "raw_name": "ABC News",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.17912/micropub.biology.000143"
+ },
+ "extra": {
+ "datacite": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "container_name": "microPublication Biology"
+ },
+ "refs": [],
+ "release_stage": "published",
+ "release_year": 2019,
+ "publisher": "microPublication Biology",
+ "title": "Sample"
+}
diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json
new file mode 100644
index 00000000..4a52e22c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_34.json
@@ -0,0 +1,31 @@
+{
+ "abstracts": [
+ {
+ "content": "1234567890",
+ "mimetype": "text/plain"
+ }
+ ],
+ "contribs": [
+ {
+ "given_name": "",
+ "surname": "",
+ "index": 0,
+ "raw_name": "Paul Katz",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.17912/micropub.biology.000143"
+ },
+ "extra": {
+ "datacite": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "container_name": "microPublication Biology"
+ },
+ "refs": [],
+ "release_stage": "published",
+ "release_year": 2019,
+ "publisher": "microPublication Biology",
+ "title": "Sample"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 8fb2d079..b94b6bc5 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -10,32 +10,54 @@ import collections
import pytest
from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, index_form_to_display_name, lookup_license_slug
+from fatcat_tools.importers.datacite import (
+ find_original_language_title,
+ parse_datacite_titles,
+ parse_datacite_dates,
+ clean_doi,
+ index_form_to_display_name,
+ lookup_license_slug,
+ contributor_list_contains_contributor,
+)
from fatcat_tools.transforms import entity_to_dict
-from fixtures import *
+import fatcat_openapi_client
+from fixtures import api
+import json
@pytest.fixture(scope="function")
def datacite_importer(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
- bezerk_mode=True)
+ with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+ yield DataciteImporter(
+ api,
+ issn_file,
+ extid_map_file="tests/files/example_map.sqlite3",
+ bezerk_mode=True,
+ )
+
@pytest.fixture(scope="function")
def datacite_importer_existing(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
- bezerk_mode=False)
+ with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+ yield DataciteImporter(
+ api,
+ issn_file,
+ extid_map_file="tests/files/example_map.sqlite3",
+ bezerk_mode=False,
+ )
+
@pytest.mark.skip(reason="larger datacite import slows tests down")
def test_datacite_importer_huge(datacite_importer):
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+ with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f:
datacite_importer.bezerk_mode = True
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 998
- change = datacite_importer.api.get_changelog_entry(index=last_index+1)
- release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+ assert counts["insert"] == 998
+ change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
+ release = datacite_importer.api.get_release(
+ change.editgroup.edits.releases[0].ident
+ )
assert len(release.contribs) == 3
@@ -43,122 +65,161 @@ def test_find_original_language_title():
"""
Original language might be included, in various ways.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('defaults to None', {}, None),
- Case('ignore unknown keys', {'broken': 'kv'}, None),
- Case('just a title', {'title': 'Noise Reduction'}, None),
- Case('same title should be ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': 'Noise Reduction'
- }, None),
- Case('empty subdict is ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': {},
- }, None),
- Case('unknown subdict keys are ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': {'broken': 'kv'},
- }, None),
- Case('original string', {
- 'title': 'Noise Reduction',
- 'original_language_title': 'Подавление шума',
- }, 'Подавление шума'),
- Case('language tag is ignored, since its broken', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': 'Noise Reduction'
+ Case("defaults to None", {}, None),
+ Case("ignore unknown keys", {"broken": "kv"}, None),
+ Case("just a title", {"title": "Noise Reduction"}, None),
+ Case(
+ "same title should be ignored",
+ {"title": "Noise Reduction", "original_language_title": "Noise Reduction"},
+ None,
+ ),
+ Case(
+ "empty subdict is ignored",
+ {"title": "Noise Reduction", "original_language_title": {},},
+ None,
+ ),
+ Case(
+ "unknown subdict keys are ignored",
+ {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},},
+ None,
+ ),
+ Case(
+ "original string",
+ {"title": "Noise Reduction", "original_language_title": "Подавление шума",},
+ "Подавление шума",
+ ),
+ Case(
+ "language tag is ignored, since its broken",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "Noise Reduction",
+ },
},
- }, None),
- Case('do not care about language', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': 'Rauschunterdrückung',
+ None,
+ ),
+ Case(
+ "do not care about language",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "Rauschunterdrückung",
+ },
},
- }, 'Rauschunterdrückung'),
- Case('ignore excessive questionmarks', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': '???? However',
+ "Rauschunterdrückung",
+ ),
+ Case(
+ "ignore excessive questionmarks",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "???? However",
+ },
},
- }, None),
+ None,
+ ),
]
for case in cases:
result = find_original_language_title(case.input)
assert result == case.result
+
def test_parse_datacite_titles():
"""
Given a list of titles, find title, original_language_title and subtitle.
Result is a 3-tuple of title, original_language_title, subtitle.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('handle None', None, (None, None, None)),
- Case('empty list', [], (None, None, None)),
- Case('empty item', [{}], (None, None, None)),
- Case('broken keys', [{'broken': 'kv'}], (None, None, None)),
- Case('title only', [{'title': 'Total carbon dioxide'}],
- ('Total carbon dioxide', None, None),
- ),
- Case('title and subtitle', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, subtitle order does not matter', [
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- {'title': 'Total carbon dioxide'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('multiple titles, first wins', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- ],
- ('Total carbon dioxide', None, None),
- ),
- Case('multiple titles, plus sub', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('multiple titles, multiple subs', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- {'title': 'Some other subtitle', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, original, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
- ),
- Case('title, original same as title, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': {
- '__content__': 'Total carbon dioxide',
- }},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, original dict, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': {
- '__content__': 'Всего углекислого газа',
- }},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
+ Case("handle None", None, (None, None, None)),
+ Case("empty list", [], (None, None, None)),
+ Case("empty item", [{}], (None, None, None)),
+ Case("broken keys", [{"broken": "kv"}], (None, None, None)),
+ Case(
+ "title only",
+ [{"title": "Total carbon dioxide"}],
+ ("Total carbon dioxide", None, None),
+ ),
+ Case(
+ "title and subtitle",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, subtitle order does not matter",
+ [
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ {"title": "Total carbon dioxide"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "multiple titles, first wins",
+ [{"title": "Total carbon dioxide"}, {"title": "Meeting Heterogeneity"},],
+ ("Total carbon dioxide", None, None),
+ ),
+ Case(
+ "multiple titles, plus sub",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Meeting Heterogeneity"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "multiple titles, multiple subs",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Meeting Heterogeneity"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ {"title": "Some other subtitle", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": "Всего углекислого газа",
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original same as title, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": {"__content__": "Total carbon dioxide",},
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original dict, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": {
+ "__content__": "Всего углекислого газа",
+ },
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
),
]
@@ -166,91 +227,128 @@ def test_parse_datacite_titles():
result = parse_datacite_titles(case.input)
assert result == case.result, case.about
+
def test_parse_datacite_dates():
"""
Test datacite date parsing.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('None is None', None, (None, None, None)),
- Case('empty list is None', [], (None, None, None)),
- Case('empty item is None', [{}], (None, None, None)),
- Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
- Case('int year', [{'date': 2019}], (None, None, 2019)),
- Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
- Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
- Case('first with type', [
- {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
- ], (None, None, 2019)),
- Case('full date', [
- {'date': '2019-12-01', 'dateType': 'Valid'},
- ], (datetime.date(2019, 12, 1), 12, 2019)),
- Case('date type prio', [
- {'date': '2000-12-01', 'dateType': 'Valid'},
- {'date': '2010-01-01', 'dateType': 'Updated'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('date type prio, Available > Updated', [
- {'date': '2010-01-01', 'dateType': 'Updated'},
- {'date': '2000-12-01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow different date formats, Available > Updated', [
- {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
- {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow different date formats, Available > Updated', [
- {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
- {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow fuzzy date formats, Available > Updated', [
- {'date': '2010', 'dateType': 'Updated'},
- {'date': '2000 Dec 01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('fuzzy year only', [
- {'date': 'Year 2010', 'dateType': 'Issued'},
- ], (None, None, 2010)),
- Case('fuzzy year and month', [
- {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
- ], (None, 2, 2010)),
- Case('fuzzy year, month, day', [
- {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
- ], (datetime.date(2010, 2, 24), 2, 2010)),
- Case('ignore broken date', [
- {'date': 'Febrrr 45', 'dateType': 'Updated'},
- ], (None, None, None)),
+ Case("None is None", None, (None, None, None)),
+ Case("empty list is None", [], (None, None, None)),
+ Case("empty item is None", [{}], (None, None, None)),
+ Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)),
+ Case("int year", [{"date": 2019}], (None, None, 2019)),
+ Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)),
+ Case(
+ "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020)
+ ),
+ Case(
+ "first with type",
+ [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}],
+ (None, None, 2019),
+ ),
+ Case(
+ "full date",
+ [{"date": "2019-12-01", "dateType": "Valid"},],
+ (datetime.date(2019, 12, 1), 12, 2019),
+ ),
+ Case(
+ "date type prio",
+ [
+ {"date": "2000-12-01", "dateType": "Valid"},
+ {"date": "2010-01-01", "dateType": "Updated"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "date type prio, Available > Updated",
+ [
+ {"date": "2010-01-01", "dateType": "Updated"},
+ {"date": "2000-12-01", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow different date formats, Available > Updated",
+ [
+ {"date": "2010-01-01T10:00:00", "dateType": "Updated"},
+ {"date": "2000-12-01T10:00:00", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow different date formats, Available > Updated",
+ [
+ {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"},
+ {"date": "2000-12-01T10:00:00Z", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow fuzzy date formats, Available > Updated",
+ [
+ {"date": "2010", "dateType": "Updated"},
+ {"date": "2000 Dec 01", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "fuzzy year only",
+ [{"date": "Year 2010", "dateType": "Issued"},],
+ (None, None, 2010),
+ ),
+ Case(
+ "fuzzy year and month",
+ [{"date": "Year 2010 Feb", "dateType": "Issued"},],
+ (None, 2, 2010),
+ ),
+ Case(
+ "fuzzy year, month, day",
+ [{"date": "Year 2010 Feb 24", "dateType": "Issued"},],
+ (datetime.date(2010, 2, 24), 2, 2010),
+ ),
+ Case(
+ "ignore broken date",
+ [{"date": "Febrrr 45", "dateType": "Updated"},],
+ (None, None, None),
+ ),
]
for case in cases:
result = parse_datacite_dates(case.input)
assert result == case.result, case.about
+
def test_datacite_importer(datacite_importer):
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
datacite_importer.bezerk_mode = True
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 1
- assert counts['exists'] == 0
- assert counts['skip'] == 0
+ assert counts["insert"] == 1
+ assert counts["exists"] == 0
+ assert counts["skip"] == 0
# fetch most recent editgroup
- change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+ change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
eg = change.editgroup
assert eg.description
assert "datacite" in eg.description.lower()
- assert eg.extra['git_rev']
- assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.DataciteImporter" in eg.extra["agent"]
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
datacite_importer.bezerk_mode = False
datacite_importer.reset()
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 0
- assert counts['exists'] == 1
- assert counts['skip'] == 0
+ assert counts["insert"] == 0
+ assert counts["exists"] == 1
+ assert counts["skip"] == 0
assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
+
def test_datacite_dict_parse(datacite_importer):
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
raw = json.load(f)
r = datacite_importer.parse_record(raw)
# ensure the API server is ok with format
@@ -258,7 +356,9 @@ def test_datacite_dict_parse(datacite_importer):
print(r.extra)
assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
- assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+ assert (
+ r.publisher == "International Centre for Agricultural Research in Dry Areas"
+ )
assert r.release_type == "article"
assert r.release_stage == "published"
assert r.license_slug == None
@@ -269,13 +369,15 @@ def test_datacite_dict_parse(datacite_importer):
assert r.subtitle == None
assert r.release_date == None
assert r.release_year == 1986
- assert 'subtitle' not in r.extra
- assert 'subtitle' not in r.extra['datacite']
- assert 'funder' not in r.extra
- assert 'funder' not in r.extra['datacite']
+ assert "subtitle" not in r.extra
+ assert "subtitle" not in r.extra["datacite"]
+ assert "funder" not in r.extra
+ assert "funder" not in r.extra["datacite"]
# matched by ISSN, so shouldn't be in there
- #assert extra['container_name'] == "International Journal of Quantum Chemistry"
- assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+ # assert extra['container_name'] == "International Journal of Quantum Chemistry"
+ assert r.extra["datacite"]["subjects"] == [
+ {"subject": "Plant Genetic Resource for Food and Agriculture"}
+ ]
assert len(r.abstracts) == 1
assert len(r.abstracts[0].content) == 421
assert len(r.contribs) == 2
@@ -284,34 +386,41 @@ def test_datacite_dict_parse(datacite_importer):
assert r.contribs[0].surname == None
assert len(r.refs) == 0
+
def test_datacite_conversions(datacite_importer):
"""
Datacite JSON to release entity JSON representation. The count is hardcoded
for now.
"""
datacite_importer.debug = True
- for i in range(33):
- src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
- dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
- with open(src, 'r') as f:
+ for i in range(35):
+ src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
+ dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
+ with open(src, "r") as f:
re = datacite_importer.parse_record(json.load(f))
result = entity_to_dict(re)
- with open(dst, 'r') as f:
+ with open(dst, "r") as f:
expected = json.loads(f.read())
- assert result == expected, 'output mismatch in {}'.format(dst)
+ assert result == expected, "output mismatch in {}".format(dst)
+
def test_index_form_to_display_name():
- Case = collections.namedtuple('Case', 'input output')
+ Case = collections.namedtuple("Case", "input output")
cases = [
- Case('', ''),
- Case('ABC', 'ABC'),
- Case('International Space Station', 'International Space Station'),
- Case('Jin, Shan', 'Shan Jin'),
- Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'),
- Case('Solomon, P. M.', 'P. M. Solomon'),
- Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'),
- Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'),
+ Case("", ""),
+ Case("ABC", "ABC"),
+ Case("International Space Station", "International Space Station"),
+ Case("Jin, Shan", "Shan Jin"),
+ Case(
+ "Volkshochschule Der Bundesstadt Bonn",
+ "Volkshochschule Der Bundesstadt Bonn",
+ ),
+ Case("Solomon, P. M.", "P. M. Solomon"),
+ Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"),
+ Case(
+ "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler"
+ ),
]
for c in cases:
@@ -319,45 +428,69 @@ def test_index_form_to_display_name():
def test_lookup_license_slug():
- Case = collections.namedtuple('Case', 'input output')
+ Case = collections.namedtuple("Case", "input output")
cases = [
- Case('https://opensource.org/licenses/MIT', 'MIT'),
- Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'),
- Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'),
- Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'),
- Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'),
- Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'),
- Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'),
- Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'),
- Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'),
- Case('http://www.springer.com/tdm', 'SPRINGER-TDM'),
- Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'),
- Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'),
- Case('https://creativecommons.org/public-domain/cc0', 'CC-0'),
- Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'),
- Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'),
- Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'),
- Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'),
- Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'),
- Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'),
- Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'),
- Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
- Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'),
- Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'),
- Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'),
- Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'),
- Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'),
- Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'),
- Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'),
- Case('http://spdx.org/licenses/MIT.json', 'MIT'),
- Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'),
+ Case("https://opensource.org/licenses/MIT", "MIT"),
+ Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"),
+ Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"),
+ Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"),
+ Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"),
+ Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"),
+ Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"),
+ Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"),
+ Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"),
+ Case("http://www.springer.com/tdm", "SPRINGER-TDM"),
+ Case(
+ "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml",
+ "ADS-UK",
+ ),
+ Case(
+ "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK"
+ ),
+ Case("https://creativecommons.org/public-domain/cc0", "CC-0"),
+ Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"),
+ Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"),
+ Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"),
+ Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"),
+ Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"),
+ Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"),
+ Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"),
+ Case(
+ "http://journals.sagepub.com/page/policies/text-and-data-mining-license",
+ "SAGE-TDM",
+ ),
+ Case(
+ "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+ "CC-PUBLICDOMAIN",
+ ),
+ Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+ Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+ Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"),
+ Case(
+ "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+ "CC-PUBLICDOMAIN",
+ ),
+ Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"),
+ Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"),
+ Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"),
+ Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"),
+ Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"),
+ Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"),
+ Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"),
+ Case("http://spdx.org/licenses/MIT.json", "MIT"),
+ Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"),
]
for c in cases:
got = lookup_license_slug(c.input)
- assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output)
+ assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output)
+
+
+def test_contributor_list_contains_contributor():
+ Case = collections.namedtuple("Case", "contrib_list contrib want")
+ cases = [
+ Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False),
+ ]
+ for c in cases:
+ got = contributor_list_contains_contributor(c.contrib_list, c.contrib)
+ assert got == c.want