author    Martin Czygan <martin.czygan@gmail.com>  2020-07-10 00:50:42 +0200
committer Martin Czygan <martin.czygan@gmail.com>  2020-07-10 00:50:42 +0200
commit    df8dcde8d5eaf530e35f1467951271bff7475e64 (patch)
tree      e9fc117d824b8997c83e416cd0e021bb1f3dce74 /python
parent    40f77b78aa331ca67b510dfece77e6a6000f8c2f (diff)
wip: contrib, GH59
Diffstat (limited to 'python')
-rw-r--r--  python/fatcat_tools/importers/datacite.py |  38
-rw-r--r--  python/tests/import_datacite.py           | 590
2 files changed, 383 insertions, 245 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 66ec2023..7797812f 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -292,14 +292,17 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping non-ascii doi for now'.format(doi))
return None
-
creators = attributes.get('creators', []) or []
contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
- contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
+ contribs = self.parse_datacite_creators(creators, doi=doi)
+ contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
- # Address duplicated author names; use raw_name string comparison; refs #59.
- contribs = unique_contributors(contribs)
+ # Unfortunately, creators and contributors might overlap, refs GH59.
+ for cc in contribs_extra_contributors:
+ if contributor_list_contains_contributor(contribs, cc):
+ continue
+ contribs.append(cc)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -800,8 +803,7 @@ class DataciteImporter(EntityImporter):
if contributorType:
extra = {'type': contributorType}
- contribs.append(
- fatcat_openapi_client.ReleaseContrib(
+ rc = fatcat_openapi_client.ReleaseContrib(
creator_id=creator_id,
index=i,
raw_name=name,
@@ -810,7 +812,9 @@ class DataciteImporter(EntityImporter):
role=role,
raw_affiliation=raw_affiliation,
extra=extra,
- ))
+ )
+ if not contributor_list_contains_contributor(contribs, rc):
+ contribs.append(rc)
elif nameType == 'Organizational':
name = c.get('name', '') or ''
if name in UNKNOWN_MARKERS:
@@ -826,18 +830,20 @@ class DataciteImporter(EntityImporter):
return contribs
-def unique_contributors(contribs):
+def contributor_list_contains_contributor(contributor_list, contributor):
"""
- Given a list of ReleaseContrib items, return a list of unique
- ReleaseContribs, refs GH #59.
+ Given a list of contributors, determine whether the given contributor is in that list.
"""
- unique_names, unique_contribs = set(), []
- for rc in contribs:
- if rc.raw_name and rc.raw_name in unique_names:
+ for cc in contributor_list:
+ if cc.raw_name != contributor.raw_name:
+ continue
+ cc_role = cc.role or 'author'
+ contributor_role = contributor.role or 'author'
+ if cc_role != contributor_role:
continue
- unique_names.add(rc.raw_name)
- unique_contribs.append(rc)
- return unique_contribs
+ return True
+ return False
+
def lookup_license_slug(raw):
"""
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 1472b8ea..b01a11e6 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -7,33 +7,54 @@ import datetime
import pytest
import gzip
from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name, lookup_license_slug
+from fatcat_tools.importers.datacite import (
+ find_original_language_title,
+ parse_datacite_titles,
+ parse_datacite_dates,
+ clean_doi,
+ index_form_to_display_name,
+ lookup_license_slug,
+ contributor_list_contains_contributor,
+)
from fatcat_tools.transforms import entity_to_dict
+import fatcat_openapi_client
from fixtures import api
import json
@pytest.fixture(scope="function")
def datacite_importer(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
- bezerk_mode=True)
+ with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+ yield DataciteImporter(
+ api,
+ issn_file,
+ extid_map_file="tests/files/example_map.sqlite3",
+ bezerk_mode=True,
+ )
+
@pytest.fixture(scope="function")
def datacite_importer_existing(api):
- with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
- bezerk_mode=False)
+ with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+ yield DataciteImporter(
+ api,
+ issn_file,
+ extid_map_file="tests/files/example_map.sqlite3",
+ bezerk_mode=False,
+ )
+
@pytest.mark.skip(reason="larger datacite import slows tests down")
def test_datacite_importer_huge(datacite_importer):
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+ with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f:
datacite_importer.bezerk_mode = True
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 998
- change = datacite_importer.api.get_changelog_entry(index=last_index+1)
- release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+ assert counts["insert"] == 998
+ change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
+ release = datacite_importer.api.get_release(
+ change.editgroup.edits.releases[0].ident
+ )
assert len(release.contribs) == 3
@@ -41,122 +62,161 @@ def test_find_original_language_title():
"""
Original language might be included, in various ways.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('defaults to None', {}, None),
- Case('ignore unknown keys', {'broken': 'kv'}, None),
- Case('just a title', {'title': 'Noise Reduction'}, None),
- Case('same title should be ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': 'Noise Reduction'
- }, None),
- Case('empty subdict is ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': {},
- }, None),
- Case('unknown subdict keys are ignored', {
- 'title': 'Noise Reduction',
- 'original_language_title': {'broken': 'kv'},
- }, None),
- Case('original string', {
- 'title': 'Noise Reduction',
- 'original_language_title': 'Подавление шума',
- }, 'Подавление шума'),
- Case('language tag is ignored, since its broken', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': 'Noise Reduction'
+ Case("defaults to None", {}, None),
+ Case("ignore unknown keys", {"broken": "kv"}, None),
+ Case("just a title", {"title": "Noise Reduction"}, None),
+ Case(
+ "same title should be ignored",
+ {"title": "Noise Reduction", "original_language_title": "Noise Reduction"},
+ None,
+ ),
+ Case(
+ "empty subdict is ignored",
+ {"title": "Noise Reduction", "original_language_title": {},},
+ None,
+ ),
+ Case(
+ "unknown subdict keys are ignored",
+ {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},},
+ None,
+ ),
+ Case(
+ "original string",
+ {"title": "Noise Reduction", "original_language_title": "Подавление шума",},
+ "Подавление шума",
+ ),
+ Case(
+ "language tag is ignored, since its broken",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "Noise Reduction",
+ },
},
- }, None),
- Case('do not care about language', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': 'Rauschunterdrückung',
+ None,
+ ),
+ Case(
+ "do not care about language",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "Rauschunterdrückung",
+ },
},
- }, 'Rauschunterdrückung'),
- Case('ignore excessive questionmarks', {
- 'title': 'Noise Reduction',
- 'original_language_title': {
- 'language': 'ja',
- '__content__': '???? However',
+ "Rauschunterdrückung",
+ ),
+ Case(
+ "ignore excessive questionmarks",
+ {
+ "title": "Noise Reduction",
+ "original_language_title": {
+ "language": "ja",
+ "__content__": "???? However",
+ },
},
- }, None),
+ None,
+ ),
]
for case in cases:
result = find_original_language_title(case.input)
assert result == case.result
+
def test_parse_datacite_titles():
"""
Given a list of titles, find title, original_language_title and subtitle.
Result is a 3-tuple of title, original_language_title, subtitle.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('handle None', None, (None, None, None)),
- Case('empty list', [], (None, None, None)),
- Case('empty item', [{}], (None, None, None)),
- Case('broken keys', [{'broken': 'kv'}], (None, None, None)),
- Case('title only', [{'title': 'Total carbon dioxide'}],
- ('Total carbon dioxide', None, None),
- ),
- Case('title and subtitle', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, subtitle order does not matter', [
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- {'title': 'Total carbon dioxide'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('multiple titles, first wins', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- ],
- ('Total carbon dioxide', None, None),
- ),
- Case('multiple titles, plus sub', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('multiple titles, multiple subs', [
- {'title': 'Total carbon dioxide'},
- {'title': 'Meeting Heterogeneity'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- {'title': 'Some other subtitle', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, original, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
- ),
- Case('title, original same as title, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': {
- '__content__': 'Total carbon dioxide',
- }},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', None, 'Station TT043_7-9'),
- ),
- Case('title, original dict, sub', [
- {'title': 'Total carbon dioxide', 'original_language_title': {
- '__content__': 'Всего углекислого газа',
- }},
- {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'},
- ],
- ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'),
+ Case("handle None", None, (None, None, None)),
+ Case("empty list", [], (None, None, None)),
+ Case("empty item", [{}], (None, None, None)),
+ Case("broken keys", [{"broken": "kv"}], (None, None, None)),
+ Case(
+ "title only",
+ [{"title": "Total carbon dioxide"}],
+ ("Total carbon dioxide", None, None),
+ ),
+ Case(
+ "title and subtitle",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, subtitle order does not matter",
+ [
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ {"title": "Total carbon dioxide"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "multiple titles, first wins",
+ [{"title": "Total carbon dioxide"}, {"title": "Meeting Heterogeneity"},],
+ ("Total carbon dioxide", None, None),
+ ),
+ Case(
+ "multiple titles, plus sub",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Meeting Heterogeneity"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "multiple titles, multiple subs",
+ [
+ {"title": "Total carbon dioxide"},
+ {"title": "Meeting Heterogeneity"},
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ {"title": "Some other subtitle", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": "Всего углекислого газа",
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original same as title, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": {"__content__": "Total carbon dioxide",},
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", None, "Station TT043_7-9"),
+ ),
+ Case(
+ "title, original dict, sub",
+ [
+ {
+ "title": "Total carbon dioxide",
+ "original_language_title": {
+ "__content__": "Всего углекислого газа",
+ },
+ },
+ {"title": "Station TT043_7-9", "titleType": "Subtitle"},
+ ],
+ ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"),
),
]
@@ -164,91 +224,128 @@ def test_parse_datacite_titles():
result = parse_datacite_titles(case.input)
assert result == case.result, case.about
+
def test_parse_datacite_dates():
"""
Test datacite date parsing.
"""
- Case = collections.namedtuple('Case', 'about input result')
+ Case = collections.namedtuple("Case", "about input result")
cases = [
- Case('None is None', None, (None, None, None)),
- Case('empty list is None', [], (None, None, None)),
- Case('empty item is None', [{}], (None, None, None)),
- Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
- Case('int year', [{'date': 2019}], (None, None, 2019)),
- Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
- Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
- Case('first with type', [
- {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
- ], (None, None, 2019)),
- Case('full date', [
- {'date': '2019-12-01', 'dateType': 'Valid'},
- ], (datetime.date(2019, 12, 1), 12, 2019)),
- Case('date type prio', [
- {'date': '2000-12-01', 'dateType': 'Valid'},
- {'date': '2010-01-01', 'dateType': 'Updated'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('date type prio, Available > Updated', [
- {'date': '2010-01-01', 'dateType': 'Updated'},
- {'date': '2000-12-01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow different date formats, Available > Updated', [
- {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
- {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow different date formats, Available > Updated', [
- {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
- {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('allow fuzzy date formats, Available > Updated', [
- {'date': '2010', 'dateType': 'Updated'},
- {'date': '2000 Dec 01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 12, 2000)),
- Case('fuzzy year only', [
- {'date': 'Year 2010', 'dateType': 'Issued'},
- ], (None, None, 2010)),
- Case('fuzzy year and month', [
- {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
- ], (None, 2, 2010)),
- Case('fuzzy year, month, day', [
- {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
- ], (datetime.date(2010, 2, 24), 2, 2010)),
- Case('ignore broken date', [
- {'date': 'Febrrr 45', 'dateType': 'Updated'},
- ], (None, None, None)),
+ Case("None is None", None, (None, None, None)),
+ Case("empty list is None", [], (None, None, None)),
+ Case("empty item is None", [{}], (None, None, None)),
+ Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)),
+ Case("int year", [{"date": 2019}], (None, None, 2019)),
+ Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)),
+ Case(
+ "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020)
+ ),
+ Case(
+ "first with type",
+ [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}],
+ (None, None, 2019),
+ ),
+ Case(
+ "full date",
+ [{"date": "2019-12-01", "dateType": "Valid"},],
+ (datetime.date(2019, 12, 1), 12, 2019),
+ ),
+ Case(
+ "date type prio",
+ [
+ {"date": "2000-12-01", "dateType": "Valid"},
+ {"date": "2010-01-01", "dateType": "Updated"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "date type prio, Available > Updated",
+ [
+ {"date": "2010-01-01", "dateType": "Updated"},
+ {"date": "2000-12-01", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow different date formats, Available > Updated",
+ [
+ {"date": "2010-01-01T10:00:00", "dateType": "Updated"},
+ {"date": "2000-12-01T10:00:00", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow different date formats, Available > Updated",
+ [
+ {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"},
+ {"date": "2000-12-01T10:00:00Z", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "allow fuzzy date formats, Available > Updated",
+ [
+ {"date": "2010", "dateType": "Updated"},
+ {"date": "2000 Dec 01", "dateType": "Available"},
+ ],
+ (datetime.date(2000, 12, 1), 12, 2000),
+ ),
+ Case(
+ "fuzzy year only",
+ [{"date": "Year 2010", "dateType": "Issued"},],
+ (None, None, 2010),
+ ),
+ Case(
+ "fuzzy year and month",
+ [{"date": "Year 2010 Feb", "dateType": "Issued"},],
+ (None, 2, 2010),
+ ),
+ Case(
+ "fuzzy year, month, day",
+ [{"date": "Year 2010 Feb 24", "dateType": "Issued"},],
+ (datetime.date(2010, 2, 24), 2, 2010),
+ ),
+ Case(
+ "ignore broken date",
+ [{"date": "Febrrr 45", "dateType": "Updated"},],
+ (None, None, None),
+ ),
]
for case in cases:
result = parse_datacite_dates(case.input)
assert result == case.result, case.about
+
def test_datacite_importer(datacite_importer):
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
datacite_importer.bezerk_mode = True
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 1
- assert counts['exists'] == 0
- assert counts['skip'] == 0
+ assert counts["insert"] == 1
+ assert counts["exists"] == 0
+ assert counts["skip"] == 0
# fetch most recent editgroup
- change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+ change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
eg = change.editgroup
assert eg.description
assert "datacite" in eg.description.lower()
- assert eg.extra['git_rev']
- assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.DataciteImporter" in eg.extra["agent"]
last_index = datacite_importer.api.get_changelog(limit=1)[0].index
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
datacite_importer.bezerk_mode = False
datacite_importer.reset()
counts = JsonLinePusher(datacite_importer, f).run()
- assert counts['insert'] == 0
- assert counts['exists'] == 1
- assert counts['skip'] == 0
+ assert counts["insert"] == 0
+ assert counts["exists"] == 1
+ assert counts["skip"] == 0
assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
+
def test_datacite_dict_parse(datacite_importer):
- with open('tests/files/datacite_sample.jsonl', 'r') as f:
+ with open("tests/files/datacite_sample.jsonl", "r") as f:
raw = json.load(f)
r = datacite_importer.parse_record(raw)
# ensure the API server is ok with format
@@ -256,7 +353,9 @@ def test_datacite_dict_parse(datacite_importer):
print(r.extra)
assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
- assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+ assert (
+ r.publisher == "International Centre for Agricultural Research in Dry Areas"
+ )
assert r.release_type == "article"
assert r.release_stage == "published"
assert r.license_slug == None
@@ -267,13 +366,15 @@ def test_datacite_dict_parse(datacite_importer):
assert r.subtitle == None
assert r.release_date == None
assert r.release_year == 1986
- assert 'subtitle' not in r.extra
- assert 'subtitle' not in r.extra['datacite']
- assert 'funder' not in r.extra
- assert 'funder' not in r.extra['datacite']
+ assert "subtitle" not in r.extra
+ assert "subtitle" not in r.extra["datacite"]
+ assert "funder" not in r.extra
+ assert "funder" not in r.extra["datacite"]
# matched by ISSN, so shouldn't be in there
- #assert extra['container_name'] == "International Journal of Quantum Chemistry"
- assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+ # assert extra['container_name'] == "International Journal of Quantum Chemistry"
+ assert r.extra["datacite"]["subjects"] == [
+ {"subject": "Plant Genetic Resource for Food and Agriculture"}
+ ]
assert len(r.abstracts) == 1
assert len(r.abstracts[0].content) == 421
assert len(r.contribs) == 2
@@ -282,34 +383,41 @@ def test_datacite_dict_parse(datacite_importer):
assert r.contribs[0].surname == None
assert len(r.refs) == 0
+
def test_datacite_conversions(datacite_importer):
"""
Datacite JSON to release entity JSON representation. The count is hardcoded
for now.
"""
datacite_importer.debug = True
- for i in range(34):
- src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
- dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
- with open(src, 'r') as f:
+ for i in range(35):
+ src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
+ dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
+ with open(src, "r") as f:
re = datacite_importer.parse_record(json.load(f))
result = entity_to_dict(re)
- with open(dst, 'r') as f:
- expected = json.loads(f.read())
+ with open(dst, "r") as f:
+ expected = json.loads(f.read())
+
+ assert result == expected, "output mismatch in {}".format(dst)
- assert result == expected, 'output mismatch in {}'.format(dst)
def test_index_form_to_display_name():
- Case = collections.namedtuple('Case', 'input output')
+ Case = collections.namedtuple("Case", "input output")
cases = [
- Case('', ''),
- Case('ABC', 'ABC'),
- Case('International Space Station', 'International Space Station'),
- Case('Jin, Shan', 'Shan Jin'),
- Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'),
- Case('Solomon, P. M.', 'P. M. Solomon'),
- Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'),
- Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'),
+ Case("", ""),
+ Case("ABC", "ABC"),
+ Case("International Space Station", "International Space Station"),
+ Case("Jin, Shan", "Shan Jin"),
+ Case(
+ "Volkshochschule Der Bundesstadt Bonn",
+ "Volkshochschule Der Bundesstadt Bonn",
+ ),
+ Case("Solomon, P. M.", "P. M. Solomon"),
+ Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"),
+ Case(
+ "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler"
+ ),
]
for c in cases:
@@ -317,45 +425,69 @@ def test_index_form_to_display_name():
def test_lookup_license_slug():
- Case = collections.namedtuple('Case', 'input output')
+ Case = collections.namedtuple("Case", "input output")
cases = [
- Case('https://opensource.org/licenses/MIT', 'MIT'),
- Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'),
- Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'),
- Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'),
- Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'),
- Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'),
- Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'),
- Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'),
- Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'),
- Case('http://www.springer.com/tdm', 'SPRINGER-TDM'),
- Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'),
- Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'),
- Case('https://creativecommons.org/public-domain/cc0', 'CC-0'),
- Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'),
- Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'),
- Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'),
- Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'),
- Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'),
- Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'),
- Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'),
- Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
- Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
- Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'),
- Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'),
- Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'),
- Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'),
- Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'),
- Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'),
- Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'),
- Case('http://spdx.org/licenses/MIT.json', 'MIT'),
- Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'),
+ Case("https://opensource.org/licenses/MIT", "MIT"),
+ Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"),
+ Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"),
+ Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"),
+ Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"),
+ Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"),
+ Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"),
+ Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"),
+ Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"),
+ Case("http://www.springer.com/tdm", "SPRINGER-TDM"),
+ Case(
+ "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml",
+ "ADS-UK",
+ ),
+ Case(
+ "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK"
+ ),
+ Case("https://creativecommons.org/public-domain/cc0", "CC-0"),
+ Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"),
+ Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"),
+ Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"),
+ Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"),
+ Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"),
+ Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"),
+ Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"),
+ Case(
+ "http://journals.sagepub.com/page/policies/text-and-data-mining-license",
+ "SAGE-TDM",
+ ),
+ Case(
+ "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+ "CC-PUBLICDOMAIN",
+ ),
+ Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+ Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"),
+ Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"),
+ Case(
+ "https://creativecommons.org/publicdomain/mark/1.0/deed.de",
+ "CC-PUBLICDOMAIN",
+ ),
+ Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"),
+ Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"),
+ Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"),
+ Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"),
+ Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"),
+ Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"),
+ Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"),
+ Case("http://spdx.org/licenses/MIT.json", "MIT"),
+ Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"),
]
for c in cases:
got = lookup_license_slug(c.input)
- assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output)
+ assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output)
+
+
+def test_contributor_list_contains_contributor():
+ Case = collections.namedtuple("Case", "contrib_list contrib want")
+ cases = [
+ Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False),
+ ]
+ for c in cases:
+ got = contributor_list_contains_contributor(c.contrib_list, c.contrib)
+ assert got == c.want
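
The new test currently covers only the empty-list case. A hypothetical follow-up test (not part of this commit) that exercises the raw_name/role comparison introduced in datacite.py could look like this:

# Hypothetical follow-up test, not part of this commit; it assumes the
# raw_name/role matching logic added in datacite.py above.
import collections

import fatcat_openapi_client
from fatcat_tools.importers.datacite import contributor_list_contains_contributor


def test_contributor_list_contains_contributor_roles():
    Case = collections.namedtuple("Case", "contrib_list contrib want")
    cases = [
        # same raw_name, both roles default to 'author' -> duplicate
        Case(
            [fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz")],
            fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"),
            True,
        ),
        # same raw_name, differing role -> kept as a separate contribution
        Case(
            [fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz", role="author")],
            fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz", role="editor"),
            False,
        ),
    ]
    for c in cases:
        got = contributor_list_contains_contributor(c.contrib_list, c.contrib)
        assert got == c.want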