diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-31 01:27:13 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-31 01:27:13 +0100 |
commit | 5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4 (patch) | |
tree | 9fcb1ee262703763cd13b3117b4965702c426b10 /python | |
parent | 1f27a42ac56d7b986905097fba662c2b18d5e8f8 (diff) | |
download | fatcat-5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4.tar.gz fatcat-5fcd26823207ae5ea0cdb5f1a6c8ddf7851ab6f4.zip |
datacite: clean doi
Address an issue with DOIs that contain an EN DASH (U+2013) instead of a HYPHEN-MINUS.
> "external identifier doesn't match required pattern for a DOI (expected,
eg, '10.1234/aksjdfh'): 10.25513/1812-3996.2017.1.34–42"
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 14 |
1 file changed, 13 insertions, 1 deletion
```diff
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7f0482b4..5b3065aa 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -268,7 +268,7 @@ class DataciteImporter(EntityImporter):
             return None
 
         attributes = obj['attributes']
-        doi = attributes.get('doi', '').lower()
+        doi = clean_doi(attributes.get('doi', '').lower())
 
         # Contributors. Many nameIdentifierSchemes, we do not use (yet):
         # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
@@ -832,3 +832,15 @@ def parse_datacite_dates(dates):
                 break
 
     return release_date, release_year
+
+def clean_doi(doi):
+    """
+    10.25513/1812-3996.2017.1.34–42 // 8211, Hex 2013, Octal 20023
+    See also: https://github.com/miku/throwaway-check-doi
+
+    Replace unicode HYPHEN..HORIZONTAL BAR with HYPHEN-MINUS.
+    """
+    for c in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014', '\u2015'):
+        doi = doi.replace(c, "-")
+    return doi
+
```