From f9035c7ca9637668911afa7e9345138563aad33e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 3 Jun 2020 19:30:15 -0700
Subject: improve text scrubbing

Was going to use textpipe, but dependency was too large and failed to install
with halfway modern GCC (due to CLD2 issue):

https://github.com/GregBowyer/cld2-cffi/issues/12

So instead basically pulled out the clean_text function, which is quite short.
---
 tests/test_scrub.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 tests/test_scrub.py

(limited to 'tests')

diff --git a/tests/test_scrub.py b/tests/test_scrub.py
new file mode 100644
index 0000000..6c357ae
--- /dev/null
+++ b/tests/test_scrub.py
@@ -0,0 +1,15 @@
+
+import pytest
+
+from fatcat_scholar.schema import *
+
+
+def test_scrub():
+    vectors = [
+        ('“Please clean this piece… of text„', '"Please clean this piece... of text"'),
+        ("blah", "blah"),
+    ]
+
+    for raw, fixed in vectors:
+        assert fixed == scrub_text(raw)
+
-- 
cgit v1.2.3