def index_form_to_display_name(s):
    """
    Try to convert an index form name, like 'Razis, Panos A' into display_name,
    e.g. 'Panos A Razis'.

    The conversion is heuristic. The input is returned unchanged when:

    * it is empty or None, or contains no comma,
    * it contains any of the characters '(', ')' or '*',
    * it contains more than one comma (looks like a list of names),
    * it contains one of the stopwords below (case-insensitive substring
      match), which suggests an institution rather than a person.

    Otherwise the part after the comma is moved in front of the part before
    it, both stripped of surrounding whitespace. If one side of the comma is
    empty (e.g. 'Smith,'), only the non-empty side is returned, so the result
    never carries leading or trailing whitespace.
    """
    if not s or ',' not in s:
        return s
    # Names containing these characters are left untouched.
    if any(char in s for char in ('(', ')', '*')):
        return s
    if s.count(',') > 1:
        # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
        return s
    stopwords = (
        'Archive',
        'Collection',
        'Coordinator',
        'Department',
        'Germany',
        'International',
        'National',
        'Netherlands',
        'Office',
        'Organisation',
        'Organization',
        'Service',
        'Services',
        'United States',
        'University',
        'Verein',
        'Volkshochschule',
    )
    # Lowercase the input once instead of once per stopword.
    lowered = s.lower()
    if any(stop.lower() in lowered for stop in stopwords):
        return s

    # Exactly one comma is present at this point.
    surname, _, given = s.partition(',')
    parts = [part for part in (given.strip(), surname.strip()) if part]
    if not parts:
        # Nothing but a comma and whitespace; there is no name to rearrange.
        return s
    return ' '.join(parts)