From dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 Mon Sep 17 00:00:00 2001
From: Ellen Spertus
Date: Sat, 28 Jul 2018 20:05:17 -0700
Subject: Added accent removal to titleToSlug().

---
 .../scala/sandcrawler/HBaseCrossrefScoreTest.scala | 25 +++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'scalding/src/test/scala')

diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index e4cab95..655dda1 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -162,7 +162,30 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 
   it should "return None if given a malformed json string" in {
     val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
-    slug shouldBe None
+    slug shouldBe None
+  }
+
+  "removeAccents()" should "handle the empty string" in {
+    HBaseCrossrefScore.removeAccents("") shouldBe ""
+  }
+
+  it should "not change a string with unaccented characters" in {
+    HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123"
+  }
+
+  it should "remove accents from Ls" in {
+    HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen"
+  }
+
+  it should "remove accents from Es without changing case" in {
+    val result = HBaseCrossrefScore.removeAccents("\u00e9")
+    result should have length 1
+    result shouldBe "e"
+  }
+
+  it should "convert the ø in Soren" in {
+    HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren"
+    HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"
   }
 
   // Pipeline tests
-- 
cgit v1.2.3