From be43049db0da2df4343bd5e1392d6c5201fc67d0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 2 Jan 2020 18:11:35 +0100 Subject: datacite: address raw_name index form comment > The convention for display_name and raw_name is to be how the name would normally be printed, not in index form (surname comma given_name). So we might need to un-encode names like "Tricart, Pierre". Use an additional `index_form_to_display_name` function to convert index form to display form, heuristically. --- python/fatcat_tools/importers/datacite.py | 43 +++++++ .../tests/files/datacite/datacite_result_00.json | 4 +- .../tests/files/datacite/datacite_result_01.json | 2 +- .../tests/files/datacite/datacite_result_02.json | 2 +- .../tests/files/datacite/datacite_result_04.json | 2 +- .../tests/files/datacite/datacite_result_05.json | 142 ++++++++++----------- .../tests/files/datacite/datacite_result_07.json | 6 +- .../tests/files/datacite/datacite_result_08.json | 4 +- .../tests/files/datacite/datacite_result_09.json | 2 +- .../tests/files/datacite/datacite_result_12.json | 8 +- .../tests/files/datacite/datacite_result_13.json | 2 +- .../tests/files/datacite/datacite_result_14.json | 16 +-- .../tests/files/datacite/datacite_result_15.json | 2 +- .../tests/files/datacite/datacite_result_16.json | 2 +- .../tests/files/datacite/datacite_result_18.json | 2 +- .../tests/files/datacite/datacite_result_19.json | 2 +- .../tests/files/datacite/datacite_result_20.json | 2 +- .../tests/files/datacite/datacite_result_21.json | 6 +- .../tests/files/datacite/datacite_result_22.json | 10 +- .../tests/files/datacite/datacite_result_23.json | 6 +- python/tests/import_datacite.py | 18 ++- 21 files changed, 171 insertions(+), 112 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a03587c0..bd135569 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -331,6 +331,10 @@ class 
DataciteImporter(EntityImporter): if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A Razis'. + if name: + name = index_form_to_display_name(name) + contribs.append( fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, @@ -859,3 +863,42 @@ def clean_doi(doi): doi = doi.replace(c, "-") return doi +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + stopwords = [ + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + ] + for stop in stopwords: + if stop.lower() in s.lower(): + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json index 085e23f3..a4b28076 100644 --- a/python/tests/files/datacite/datacite_result_00.json +++ b/python/tests/files/datacite/datacite_result_00.json @@ -32,14 +32,14 @@ "contribs": [ { "index": 0, - "raw_name": "Li, Qian-Jin", + "raw_name": "Qian-Jin Li", "given_name": "Qian-Jin", "surname": "Li", "role": "author" }, { "index": 1, - "raw_name": "Yang, Chun-Long", + "raw_name": "Chun-Long Yang", "given_name": "Chun-Long", "surname": "Yang", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_01.json b/python/tests/files/datacite/datacite_result_01.json index f8c6b930..46be2515 100644 --- a/python/tests/files/datacite/datacite_result_01.json +++ 
b/python/tests/files/datacite/datacite_result_01.json @@ -21,7 +21,7 @@ "contribs": [ { "index": 0, - "raw_name": "Dargenty, G.", + "raw_name": "G. Dargenty", "given_name": "G.", "surname": "Dargenty", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_02.json b/python/tests/files/datacite/datacite_result_02.json index f8b85f38..bdcb4951 100644 --- a/python/tests/files/datacite/datacite_result_02.json +++ b/python/tests/files/datacite/datacite_result_02.json @@ -25,7 +25,7 @@ "contribs": [ { "index": 0, - "raw_name": "Weyersberg, Albert", + "raw_name": "Albert Weyersberg", "given_name": "Albert", "surname": "Weyersberg", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json index 7ca70d6c..54b19ef9 100644 --- a/python/tests/files/datacite/datacite_result_04.json +++ b/python/tests/files/datacite/datacite_result_04.json @@ -12,7 +12,7 @@ "contribs": [ { "index": 0, - "raw_name": "Nicollerat, Marc Andre", + "raw_name": "Marc Andre Nicollerat", "given_name": "Marc Andre", "surname": "Nicollerat", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index e61769de..a790c26e 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -24,497 +24,497 @@ "contribs": [ { "index": 0, - "raw_name": "K\u00f5ljalg, Urmas", + "raw_name": "Urmas K\u00f5ljalg", "given_name": "Urmas", "surname": "K\u00f5ljalg", "role": "author" }, { "index": 1, - "raw_name": "Abarenkov, Kessy", + "raw_name": "Kessy Abarenkov", "given_name": "Kessy", "surname": "Abarenkov", "role": "author" }, { "index": 2, - "raw_name": "Nilsson, R. Henrik", + "raw_name": "R. Henrik Nilsson", "given_name": "R. 
Henrik", "surname": "Nilsson", "role": "author" }, { "index": 3, - "raw_name": "Larsson, Karl-Henrik", + "raw_name": "Karl-Henrik Larsson", "given_name": "Karl-Henrik", "surname": "Larsson", "role": "author" }, { "index": 4, - "raw_name": "Aas, Anders Bj\u00f8rnsgard", + "raw_name": "Anders Bj\u00f8rnsgard Aas", "given_name": "Anders Bj\u00f8rnsgard", "surname": "Aas", "role": "author" }, { "index": 5, - "raw_name": "Adams, Rachel", + "raw_name": "Rachel Adams", "given_name": "Rachel", "surname": "Adams", "role": "author" }, { "index": 6, - "raw_name": "Alves, Artur", + "raw_name": "Artur Alves", "given_name": "Artur", "surname": "Alves", "role": "author" }, { "index": 7, - "raw_name": "Ammirati, Joseph F.", + "raw_name": "Joseph F. Ammirati", "given_name": "Joseph F.", "surname": "Ammirati", "role": "author" }, { "index": 8, - "raw_name": "Arnold, A. Elizabeth", + "raw_name": "A. Elizabeth Arnold", "given_name": "A. Elizabeth", "surname": "Arnold", "role": "author" }, { "index": 9, - "raw_name": "Bahram, Mohammad", + "raw_name": "Mohammad Bahram", "given_name": "Mohammad", "surname": "Bahram", "role": "author" }, { "index": 10, - "raw_name": "Bengtsson-Palme, Johan", + "raw_name": "Johan Bengtsson-Palme", "given_name": "Johan", "surname": "Bengtsson-Palme", "role": "author" }, { "index": 11, - "raw_name": "Berlin, Anna", + "raw_name": "Anna Berlin", "given_name": "Anna", "surname": "Berlin", "role": "author" }, { "index": 12, - "raw_name": "Botnen, Synn\u00f8ve", + "raw_name": "Synn\u00f8ve Botnen", "given_name": "Synn\u00f8ve", "surname": "Botnen", "role": "author" }, { "index": 13, - "raw_name": "Bourlat, Sarah", + "raw_name": "Sarah Bourlat", "given_name": "Sarah", "surname": "Bourlat", "role": "author" }, { "index": 14, - "raw_name": "Cheeke, Tanya", + "raw_name": "Tanya Cheeke", "given_name": "Tanya", "surname": "Cheeke", "role": "author" }, { "index": 15, - "raw_name": "Dima, B\u00e1lint", + "raw_name": "B\u00e1lint Dima", "given_name": "B\u00e1lint", 
"surname": "Dima", "role": "author" }, { "index": 16, - "raw_name": "Drenkhan, Rein", + "raw_name": "Rein Drenkhan", "given_name": "Rein", "surname": "Drenkhan", "role": "author" }, { "index": 17, - "raw_name": "Duarte, Camila", + "raw_name": "Camila Duarte", "given_name": "Camila", "surname": "Duarte", "role": "author" }, { "index": 18, - "raw_name": "Due\u00f1as, Margarita", + "raw_name": "Margarita Due\u00f1as", "given_name": "Margarita", "surname": "Due\u00f1as", "role": "author" }, { "index": 19, - "raw_name": "Eberhardt, Ursula", + "raw_name": "Ursula Eberhardt", "given_name": "Ursula", "surname": "Eberhardt", "role": "author" }, { "index": 20, - "raw_name": "Friberg, Hanna", + "raw_name": "Hanna Friberg", "given_name": "Hanna", "surname": "Friberg", "role": "author" }, { "index": 21, - "raw_name": "Fr\u00f8slev, Tobias G.", + "raw_name": "Tobias G. Fr\u00f8slev", "given_name": "Tobias G.", "surname": "Fr\u00f8slev", "role": "author" }, { "index": 22, - "raw_name": "Garnica, Sigisfredo", + "raw_name": "Sigisfredo Garnica", "given_name": "Sigisfredo", "surname": "Garnica", "role": "author" }, { "index": 23, - "raw_name": "Geml, J\u00f3zsef", + "raw_name": "J\u00f3zsef Geml", "given_name": "J\u00f3zsef", "surname": "Geml", "role": "author" }, { "index": 24, - "raw_name": "Ghobad-Nejhad, Masoomeh", + "raw_name": "Masoomeh Ghobad-Nejhad", "given_name": "Masoomeh", "surname": "Ghobad-Nejhad", "role": "author" }, { "index": 25, - "raw_name": "Grebenc, Tine", + "raw_name": "Tine Grebenc", "given_name": "Tine", "surname": "Grebenc", "role": "author" }, { "index": 26, - "raw_name": "Griffith, Gareth W.", + "raw_name": "Gareth W. 
Griffith", "given_name": "Gareth W.", "surname": "Griffith", "role": "author" }, { "index": 27, - "raw_name": "Hampe, Felix", + "raw_name": "Felix Hampe", "given_name": "Felix", "surname": "Hampe", "role": "author" }, { "index": 28, - "raw_name": "Kennedy, Peter", + "raw_name": "Peter Kennedy", "given_name": "Peter", "surname": "Kennedy", "role": "author" }, { "index": 29, - "raw_name": "Khomich, Maryia", + "raw_name": "Maryia Khomich", "given_name": "Maryia", "surname": "Khomich", "role": "author" }, { "index": 30, - "raw_name": "Kohout, Petr", + "raw_name": "Petr Kohout", "given_name": "Petr", "surname": "Kohout", "role": "author" }, { "index": 31, - "raw_name": "Kollom, Anu", + "raw_name": "Anu Kollom", "given_name": "Anu", "surname": "Kollom", "role": "author" }, { "index": 32, - "raw_name": "Larsson, Ellen", + "raw_name": "Ellen Larsson", "given_name": "Ellen", "surname": "Larsson", "role": "author" }, { "index": 33, - "raw_name": "Laszlo, Irinyi", + "raw_name": "Irinyi Laszlo", "given_name": "Irinyi", "surname": "Laszlo", "role": "author" }, { "index": 34, - "raw_name": "Leavitt, Steven", + "raw_name": "Steven Leavitt", "given_name": "Steven", "surname": "Leavitt", "role": "author" }, { "index": 35, - "raw_name": "Liimatainen, Kare", + "raw_name": "Kare Liimatainen", "given_name": "Kare", "surname": "Liimatainen", "role": "author" }, { "index": 36, - "raw_name": "Lindahl, Bj\u00f6rn", + "raw_name": "Bj\u00f6rn Lindahl", "given_name": "Bj\u00f6rn", "surname": "Lindahl", "role": "author" }, { "index": 37, - "raw_name": "Lodge, Deborah J.", + "raw_name": "Deborah J. 
Lodge", "given_name": "Deborah J.", "surname": "Lodge", "role": "author" }, { "index": 38, - "raw_name": "Lumbsch, Helge Thorsten", + "raw_name": "Helge Thorsten Lumbsch", "given_name": "Helge Thorsten", "surname": "Lumbsch", "role": "author" }, { "index": 39, - "raw_name": "Mart\u00edn Esteban, Mar\u00eda Paz", + "raw_name": "Mar\u00eda Paz Mart\u00edn Esteban", "given_name": "Mar\u00eda Paz", "surname": "Mart\u00edn Esteban", "role": "author" }, { "index": 40, - "raw_name": "Meyer, Wieland", + "raw_name": "Wieland Meyer", "given_name": "Wieland", "surname": "Meyer", "role": "author" }, { "index": 41, - "raw_name": "Miettinen, Otto", + "raw_name": "Otto Miettinen", "given_name": "Otto", "surname": "Miettinen", "role": "author" }, { "index": 42, - "raw_name": "Nguyen, Nhu", + "raw_name": "Nhu Nguyen", "given_name": "Nhu", "surname": "Nguyen", "role": "author" }, { "index": 43, - "raw_name": "Niskanen, Tuula", + "raw_name": "Tuula Niskanen", "given_name": "Tuula", "surname": "Niskanen", "role": "author" }, { "index": 44, - "raw_name": "Oono, Ryoko", + "raw_name": "Ryoko Oono", "given_name": "Ryoko", "surname": "Oono", "role": "author" }, { "index": 45, - "raw_name": "\u00d6pik, Maarja", + "raw_name": "Maarja \u00d6pik", "given_name": "Maarja", "surname": "\u00d6pik", "role": "author" }, { "index": 46, - "raw_name": "Ordynets, Alexander", + "raw_name": "Alexander Ordynets", "given_name": "Alexander", "surname": "Ordynets", "role": "author" }, { "index": 47, - "raw_name": "Paw\u0142owska, Julia", + "raw_name": "Julia Paw\u0142owska", "given_name": "Julia", "surname": "Paw\u0142owska", "role": "author" }, { "index": 48, - "raw_name": "Peintner, Ursula", + "raw_name": "Ursula Peintner", "given_name": "Ursula", "surname": "Peintner", "role": "author" }, { "index": 49, - "raw_name": "Pereira, Olinto Liparini", + "raw_name": "Olinto Liparini Pereira", "given_name": "Olinto Liparini", "surname": "Pereira", "role": "author" }, { "index": 50, - "raw_name": "Pinho, Danilo 
Batista", + "raw_name": "Danilo Batista Pinho", "given_name": "Danilo Batista", "surname": "Pinho", "role": "author" }, { "index": 51, - "raw_name": "P\u00f5ldmaa, Kadri", + "raw_name": "Kadri P\u00f5ldmaa", "given_name": "Kadri", "surname": "P\u00f5ldmaa", "role": "author" }, { "index": 52, - "raw_name": "Runnel, Kadri", + "raw_name": "Kadri Runnel", "given_name": "Kadri", "surname": "Runnel", "role": "author" }, { "index": 53, - "raw_name": "Ryberg, Martin", + "raw_name": "Martin Ryberg", "given_name": "Martin", "surname": "Ryberg", "role": "author" }, { "index": 54, - "raw_name": "Saar, Irja", + "raw_name": "Irja Saar", "given_name": "Irja", "surname": "Saar", "role": "author" }, { "index": 55, - "raw_name": "Sanli, Kemal", + "raw_name": "Kemal Sanli", "given_name": "Kemal", "surname": "Sanli", "role": "author" }, { "index": 56, - "raw_name": "Scott, James", + "raw_name": "James Scott", "given_name": "James", "surname": "Scott", "role": "author" }, { "index": 57, - "raw_name": "Spirin, Viacheslav", + "raw_name": "Viacheslav Spirin", "given_name": "Viacheslav", "surname": "Spirin", "role": "author" }, { "index": 58, - "raw_name": "Suija, Ave", + "raw_name": "Ave Suija", "given_name": "Ave", "surname": "Suija", "role": "author" }, { "index": 59, - "raw_name": "Svantesson, Sten", + "raw_name": "Sten Svantesson", "given_name": "Sten", "surname": "Svantesson", "role": "author" }, { "index": 60, - "raw_name": "Tadych, Mariusz", + "raw_name": "Mariusz Tadych", "given_name": "Mariusz", "surname": "Tadych", "role": "author" }, { "index": 61, - "raw_name": "Takamatsu, Susumu", + "raw_name": "Susumu Takamatsu", "given_name": "Susumu", "surname": "Takamatsu", "role": "author" }, { "index": 62, - "raw_name": "Tamm, Heidi", + "raw_name": "Heidi Tamm", "given_name": "Heidi", "surname": "Tamm", "role": "author" }, { "index": 63, - "raw_name": "Taylor, AFS.", + "raw_name": "AFS. 
Taylor", "given_name": "AFS.", "surname": "Taylor", "role": "author" }, { "index": 64, - "raw_name": "Tedersoo, Leho", + "raw_name": "Leho Tedersoo", "given_name": "Leho", "surname": "Tedersoo", "role": "author" }, { "index": 65, - "raw_name": "Telleria, M.T.", + "raw_name": "M.T. Telleria", "given_name": "M.T.", "surname": "Telleria", "role": "author" }, { "index": 66, - "raw_name": "Udayanga, Dhanushka", + "raw_name": "Dhanushka Udayanga", "given_name": "Dhanushka", "surname": "Udayanga", "role": "author" }, { "index": 67, - "raw_name": "Unterseher, Martin", + "raw_name": "Martin Unterseher", "given_name": "Martin", "surname": "Unterseher", "role": "author" }, { "index": 68, - "raw_name": "Volobuev, Sergey", + "raw_name": "Sergey Volobuev", "given_name": "Sergey", "surname": "Volobuev", "role": "author" }, { "index": 69, - "raw_name": "Weiss, Michael", + "raw_name": "Michael Weiss", "given_name": "Michael", "surname": "Weiss", "role": "author" }, { "index": 70, - "raw_name": "Wurzbacher, Christian", + "raw_name": "Christian Wurzbacher", "given_name": "Christian", "surname": "Wurzbacher", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json index 324bb663..f572263c 100644 --- a/python/tests/files/datacite/datacite_result_07.json +++ b/python/tests/files/datacite/datacite_result_07.json @@ -38,21 +38,21 @@ "contribs": [ { "index": 0, - "raw_name": "ROTHUIZEN, E.", + "raw_name": "E. ROTHUIZEN", "given_name": "E.", "surname": "ROTHUIZEN", "role": "author" }, { "index": 1, - "raw_name": "ELMEGAARD, B.", + "raw_name": "B. ELMEGAARD", "given_name": "B.", "surname": "ELMEGAARD", "role": "author" }, { "index": 2, - "raw_name": "MARKUSSEN W., B.", + "raw_name": "B. 
MARKUSSEN W.", "given_name": "B.", "surname": "MARKUSSEN W.", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 281c3679..581ca1eb 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -30,14 +30,14 @@ "contribs": [ { "index": 0, - "raw_name": "Kajisa, Kei", + "raw_name": "Kei Kajisa", "given_name": "Kei", "surname": "Kajisa", "role": "author" }, { "index": 1, - "raw_name": "Kajisa, Kei", + "raw_name": "Kei Kajisa", "given_name": "Kei", "surname": "Kajisa", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 01f92f85..db103d2b 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -24,7 +24,7 @@ "contribs": [ { "index": 0, - "raw_name": "Kirstaedter, Nils", + "raw_name": "Nils Kirstaedter", "given_name": "Nils", "surname": "Kirstaedter", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_12.json b/python/tests/files/datacite/datacite_result_12.json index 6b6cad4a..192062e3 100644 --- a/python/tests/files/datacite/datacite_result_12.json +++ b/python/tests/files/datacite/datacite_result_12.json @@ -12,28 +12,28 @@ "contribs": [ { "index": 0, - "raw_name": "Spanias, Charalampos", + "raw_name": "Charalampos Spanias", "given_name": "Charalampos", "surname": "Spanias", "role": "author" }, { "index": 1, - "raw_name": "Nikolaidis, Pantelis T", + "raw_name": "Pantelis T Nikolaidis", "given_name": "Pantelis T", "surname": "Nikolaidis", "role": "author" }, { "index": 2, - "raw_name": "Rosemann, Thomas", + "raw_name": "Thomas Rosemann", "given_name": "Thomas", "surname": "Rosemann", "role": "author" }, { "index": 3, - "raw_name": "Knechtle, Beat", + "raw_name": "Beat Knechtle", "given_name": "Beat", "surname": "Knechtle", "role": 
"author" diff --git a/python/tests/files/datacite/datacite_result_13.json b/python/tests/files/datacite/datacite_result_13.json index 3da3816d..c8971667 100644 --- a/python/tests/files/datacite/datacite_result_13.json +++ b/python/tests/files/datacite/datacite_result_13.json @@ -17,7 +17,7 @@ }, { "index": 1, - "raw_name": "Hiltbrunner, Hermann", + "raw_name": "Hermann Hiltbrunner", "given_name": "Hermann", "surname": "Hiltbrunner", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json index 94c00472..94ad000a 100644 --- a/python/tests/files/datacite/datacite_result_14.json +++ b/python/tests/files/datacite/datacite_result_14.json @@ -45,56 +45,56 @@ "contribs": [ { "index": 0, - "raw_name": "Stulz, E.", + "raw_name": "E. Stulz", "given_name": "E.", "surname": "Stulz", "role": "author" }, { "index": 1, - "raw_name": "Scott, S.M.", + "raw_name": "S.M. Scott", "given_name": "S.M.", "surname": "Scott", "role": "author" }, { "index": 2, - "raw_name": "Ng, Yiu-Fai", + "raw_name": "Yiu-Fai Ng", "given_name": "Yiu-Fai", "surname": "Ng", "role": "author" }, { "index": 3, - "raw_name": "Bond, A.D.", + "raw_name": "A.D. Bond", "given_name": "A.D.", "surname": "Bond", "role": "author" }, { "index": 4, - "raw_name": "Teat, S.J.", + "raw_name": "S.J. Teat", "given_name": "S.J.", "surname": "Teat", "role": "author" }, { "index": 5, - "raw_name": "Darling, S.L.", + "raw_name": "S.L. Darling", "given_name": "S.L.", "surname": "Darling", "role": "author" }, { "index": 6, - "raw_name": "Feeder, N.", + "raw_name": "N. Feeder", "given_name": "N.", "surname": "Feeder", "role": "author" }, { "index": 7, - "raw_name": "Sanders, J.K.M.", + "raw_name": "J.K.M. 
Sanders", "given_name": "J.K.M.", "surname": "Sanders", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_15.json b/python/tests/files/datacite/datacite_result_15.json index 0614f6ba..bdeb8426 100644 --- a/python/tests/files/datacite/datacite_result_15.json +++ b/python/tests/files/datacite/datacite_result_15.json @@ -11,7 +11,7 @@ "contribs": [ { "index": 0, - "raw_name": "Richardson, David", + "raw_name": "David Richardson", "given_name": "David", "surname": "Richardson", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json index 1d861cf6..ea8c2e59 100644 --- a/python/tests/files/datacite/datacite_result_16.json +++ b/python/tests/files/datacite/datacite_result_16.json @@ -20,7 +20,7 @@ "contribs": [ { "index": 0, - "raw_name": "Sochi, Taha", + "raw_name": "Taha Sochi", "given_name": "Taha", "surname": "Sochi", "role": "author" diff --git a/python/tests/files/datacite/datacite_result_18.json b/python/tests/files/datacite/datacite_result_18.json index 12ab39fe..274858c3 100644 --- a/python/tests/files/datacite/datacite_result_18.json +++ b/python/tests/files/datacite/datacite_result_18.json @@ -12,4 +12,4 @@ "contribs": [], "refs": [], "abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_19.json b/python/tests/files/datacite/datacite_result_19.json index 1505db92..8d797268 100644 --- a/python/tests/files/datacite/datacite_result_19.json +++ b/python/tests/files/datacite/datacite_result_19.json @@ -12,4 +12,4 @@ "contribs": [], "refs": [], "abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_20.json b/python/tests/files/datacite/datacite_result_20.json index 1868eede..97d7ae75 100644 --- a/python/tests/files/datacite/datacite_result_20.json +++ b/python/tests/files/datacite/datacite_result_20.json @@ -11,4 +11,4 @@ "contribs": [], "refs": [], 
"abstracts": [] -} +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_21.json b/python/tests/files/datacite/datacite_result_21.json index 9214065a..0a05a7cd 100644 --- a/python/tests/files/datacite/datacite_result_21.json +++ b/python/tests/files/datacite/datacite_result_21.json @@ -8,8 +8,8 @@ "ext_ids": { "doi": "10.7916/d86x0cg1" }, + "language": "de", "contribs": [], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_22.json b/python/tests/files/datacite/datacite_result_22.json index e9939e09..9e4225b5 100644 --- a/python/tests/files/datacite/datacite_result_22.json +++ b/python/tests/files/datacite/datacite_result_22.json @@ -8,15 +8,15 @@ "ext_ids": { "doi": "10.7916/d86x0cg1" }, + "language": "de", "contribs": [ { - "raw_affiliation": "Department of pataphysics", "index": 0, "raw_name": "Anton Welch", - "role": "author" + "role": "author", + "raw_affiliation": "Department of pataphysics" } ], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/files/datacite/datacite_result_23.json b/python/tests/files/datacite/datacite_result_23.json index 2bf66eae..46f60492 100644 --- a/python/tests/files/datacite/datacite_result_23.json +++ b/python/tests/files/datacite/datacite_result_23.json @@ -8,6 +8,7 @@ "ext_ids": { "doi": "10.7916/d86x0cg1-xxx" }, + "language": "de", "contribs": [ { "index": 0, @@ -17,6 +18,5 @@ } ], "refs": [], - "abstracts": [], - "language": "de" -} + "abstracts": [] +} \ No newline at end of file diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index cdc165d7..3e47fce8 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -7,7 +7,7 @@ import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from 
fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi +from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name from fatcat_tools.transforms import entity_to_dict from fixtures import api import json @@ -294,3 +294,19 @@ def test_datacite_conversions(datacite_importer): assert result == expected +def test_index_form_to_display_name(): + Case = collections.namedtuple('Case', 'input output') + cases = [ + Case('', ''), + Case('ABC', 'ABC'), + Case('International Space Station', 'International Space Station'), + Case('Jin, Shan', 'Shan Jin'), + Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), + Case('Solomon, P. M.', 'P. M. Solomon'), + Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), + Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + ] + + for c in cases: + assert c.output == index_form_to_display_name(c.input) + -- cgit v1.2.3