From 9c0e5d714ea9fedeca64f1ceb5b47f67438629af Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 23 Jun 2020 18:38:33 -0700
Subject: strip control characters from titles (issn_meta)

---
 chocula/util.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/chocula/util.py b/chocula/util.py
index 894af98..c2466cd 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -279,6 +279,10 @@ def clean_str(s: Optional[str]) -> Optional[str]:
     if not s:
         return None
     s = unquote(ftfy.fix_text(s))
+    # these unicode characters are used by, eg, ISSN portal to mark prefixes as
+    # non-sorting; str.replace() returns a new string, so the result is re-bound
+    s = s.replace("\u02dc", "")
+    s = s.replace("\u0153", "")
     return s or None
 
 
-- 
cgit v1.2.3