aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2019-12-31 01:27:13 +0100
committerMartin Czygan <martin.czygan@gmail.com>2019-12-31 01:27:13 +0100
commit5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4 (patch)
tree9fcb1ee262703763cd13b3117b4965702c426b10 /python/fatcat_tools/importers
parent1f27a42ac56d7b986905097fba662c2b18d5e8f8 (diff)
downloadfatcat-5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4.tar.gz
fatcat-5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4.zip
datacite: clean doi
Address an issue with DOIs that contain an EN DASH (U+2013) instead of an ASCII hyphen; such DOIs were rejected with: > "external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.25513/1812-3996.2017.1.34–42"
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/datacite.py14
1 files changed, 13 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7f0482b4..5b3065aa 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -268,7 +268,7 @@ class DataciteImporter(EntityImporter):
return None
attributes = obj['attributes']
- doi = attributes.get('doi', '').lower()
+ doi = clean_doi(attributes.get('doi', '').lower())
# Contributors. Many nameIdentifierSchemes, we do not use (yet):
# "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
@@ -832,3 +832,15 @@ def parse_datacite_dates(dates):
break
return release_date, release_year
+
# Translation table mapping every code point from U+2010 (HYPHEN) through
# U+2015 (HORIZONTAL BAR) to the ASCII HYPHEN-MINUS. Built once at import
# time so that clean_doi() is a single C-level pass over the string.
_DOI_DASH_TABLE = dict.fromkeys(range(0x2010, 0x2016), "-")


def clean_doi(doi):
    """
    Return `doi` with Unicode dash characters replaced by HYPHEN-MINUS ("-").

    The replaced code points are the contiguous range HYPHEN..HORIZONTAL BAR:

        U+2010 HYPHEN
        U+2011 NON-BREAKING HYPHEN
        U+2012 FIGURE DASH
        U+2013 EN DASH
        U+2014 EM DASH
        U+2015 HORIZONTAL BAR

    All other characters are left untouched (including other dash-like
    characters outside that range, e.g. U+2212 MINUS SIGN).

    Example of an affected DOI, containing an EN DASH (U+2013, decimal 8211):

        10.25513/1812-3996.2017.1.34–42  ->  10.25513/1812-3996.2017.1.34-42

    See also: https://github.com/miku/throwaway-check-doi

    Args:
        doi: the DOI as a string; an empty string is returned unchanged.

    Returns:
        A new string of the same length with the dashes normalized.
    """
    return doi.translate(_DOI_DASH_TABLE)
+