diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-02 18:11:35 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-02 18:11:35 +0100 |
commit | be43049db0da2df4343bd5e1392d6c5201fc67d0 (patch) | |
tree | 219fa25011f424da745eece11226438cf741f345 /python/fatcat_tools/importers | |
parent | cb223fccb64500a8e134b9ec721c8a08b1a60f19 (diff) | |
download | fatcat-be43049db0da2df4343bd5e1392d6c5201fc67d0.tar.gz fatcat-be43049db0da2df4343bd5e1392d6c5201fc67d0.zip |
datacite: address raw_name index form comment
> The convention for display_name and raw_name is to be how the name
would normally be printed, not in index form (surname comma given_name).
So we might need to un-encode names like "Tricart, Pierre".
Use an additional `index_form_to_display_name` function to convert index
form to display form, heuristically.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a03587c0..bd135569 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -331,6 +331,10 @@ class DataciteImporter(EntityImporter): if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. + if name: + name = index_form_to_display_name(name) + contribs.append( fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, @@ -859,3 +863,42 @@ def clean_doi(doi): doi = doi.replace(c, "-") return doi +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + stopwords = [ + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + ] + for stop in stopwords: + if stop.lower() in s.lower(): + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) |