summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-02 18:11:35 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-02 18:11:35 +0100
commitbe43049db0da2df4343bd5e1392d6c5201fc67d0 (patch)
tree219fa25011f424da745eece11226438cf741f345 /python/fatcat_tools/importers
parentcb223fccb64500a8e134b9ec721c8a08b1a60f19 (diff)
downloadfatcat-be43049db0da2df4343bd5e1392d6c5201fc67d0.tar.gz
fatcat-be43049db0da2df4343bd5e1392d6c5201fc67d0.zip
datacite: address raw_name index form comment
> The convention for display_name and raw_name is to be how the name would normally be printed, not in index form (surname comma given_name). So we might need to un-encode names like "Tricart, Pierre". Use an additional `index_form_to_display_name` function to heuristically convert index form to display form.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/datacite.py43
1 files changed, 43 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a03587c0..bd135569 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -331,6 +331,10 @@ class DataciteImporter(EntityImporter):
if name in ('(:Unav)', 'NA', 'NN', '(:Null)'):
continue
+ # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into display form ('Panos A Razis').
+ if name:
+ name = index_form_to_display_name(name)
+
contribs.append(
fatcat_openapi_client.ReleaseContrib(
creator_id=creator_id,
@@ -859,3 +863,42 @@ def clean_doi(doi):
doi = doi.replace(c, "-")
return doi
def index_form_to_display_name(s: str) -> str:
    """
    Try to convert an index form name, like 'Razis, Panos A', into display
    form, e.g. 'Panos A Razis'.

    The conversion is heuristic. The input is returned unchanged when it:

    * contains no comma, or more than one comma (likely a list of names,
      e.g. "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"),
    * contains any of the characters '(', ')' or '*',
    * contains (case-insensitively, as a substring) one of the stopwords
      below, which hint at an institution or place rather than a person.

    Otherwise the string is split at its single comma and the two stripped
    halves are swapped. If one half is empty (e.g. 'Razis,'), only the
    non-empty half is returned, so the result never carries leading or
    trailing whitespace. A bare ',' yields the empty string.

    :param s: name string, possibly in "surname, given name" index form
    :returns: the name in display form, or ``s`` unchanged
    """
    if ',' not in s:
        return s

    # Parentheses and asterisks usually mark annotations, not plain names.
    if any(char in s for char in ('(', ')', '*')):
        return s

    if s.count(',') > 1:
        # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
        return s

    # Not person names, but words that show up in author fields. Kept in
    # lowercase so the input only has to be lowercased once.
    stopwords = (
        'archive',
        'collection',
        'coordinator',
        'department',
        'germany',
        'international',
        'national',
        'netherlands',
        'office',
        'organisation',
        'organization',
        'service',
        'services',
        'united states',
        'university',
        'verein',
        'volkshochschule',
    )
    lowered = s.lower()
    if any(stop in lowered for stop in stopwords):
        return s

    # Exactly one comma is left at this point, so partition splits cleanly.
    family, _, given = s.partition(',')
    # Drop empty halves so a dangling comma does not produce stray spaces.
    parts = [part for part in (given.strip(), family.strip()) if part]
    return ' '.join(parts)