From 5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 31 Dec 2019 01:27:13 +0100
Subject: datacite: clean doi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

address issue with EN DASH DOI.

> "external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.25513/1812-3996.2017.1.34–42"
---
 python/fatcat_tools/importers/datacite.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7f0482b4..5b3065aa 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -268,7 +268,7 @@ class DataciteImporter(EntityImporter):
             return None
 
         attributes = obj['attributes']
-        doi = attributes.get('doi', '').lower()
+        doi = clean_doi(attributes.get('doi', '').lower())
 
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
@@ -832,3 +832,15 @@ def parse_datacite_dates(dates):
             break
 
     return release_date, release_year
+
+def clean_doi(doi):
+    """
+    10.25513/1812-3996.2017.1.34–42 // 8211, Hex 2013, Octal 20023
+    See also: https://github.com/miku/throwaway-check-doi
+
+    Replace unicode HYPHEN..HORIZONTAL BAR with HYPHEN-MINUS.
+    """
+    for c in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015'):
+        doi = doi.replace(c, "-")
+    return doi
+
-- 
cgit v1.2.3