diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a03587c0..bd135569 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -331,6 +331,10 @@ class DataciteImporter(EntityImporter): if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. + if name: + name = index_form_to_display_name(name) + contribs.append( fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, @@ -859,3 +863,42 @@ def clean_doi(doi): doi = doi.replace(c, "-") return doi +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + stopwords = [ + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + ] + for stop in stopwords: + if stop.lower() in s.lower(): + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) |