diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-28 20:05:17 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-28 20:05:17 -0700 |
commit | dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (patch) | |
tree | 951ab504ce4e00ddfe79221c4ffdf1f9768f3368 | |
parent | 304196e01e69826047e5e14af949d5efc80d1ece (diff) | |
download | sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.tar.gz sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.zip |
Added accent removal to titleToSlug().
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 28 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 25 |
2 files changed, 51 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 7923e09..2a569a1 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -1,7 +1,9 @@ package sandcrawler +import java.text.Normalizer import java.util.Arrays import java.util.Properties +import java.util.regex.Pattern import scala.math import scala.util.parsing.json.JSON @@ -124,7 +126,7 @@ object HBaseCrossrefScore { } def titleToSlug(title : String) : Option[String] = { - val slug = title.split(":")(0).toLowerCase() + val slug = removeAccents(title).split(":")(0).toLowerCase() if (slug.isEmpty) { None } else { @@ -172,4 +174,28 @@ object HBaseCrossrefScore { } } } + + // scalastyle:off + // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934 + // scalastyle:on + def removeAccents(s : String) : String = { + val replacements = Map( + '\u0141' -> 'L', + '\u0142' -> 'l', // Letter ell + '\u00d8' -> 'O', + '\u00f8' -> 'o' + ) + val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD)) + for (i <- 0 to sb.length - 1) { + for (key <- replacements.keys) { + if (sb(i) == key) { + sb.deleteCharAt(i); + sb.insert(i, replacements(key)) + } + } + } + val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+") + pattern.matcher(sb).replaceAll("").toString + } } + diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index e4cab95..655dda1 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -162,7 +162,30 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { it should "return None if given a malformed json string" in { val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString) - slug shouldBe None + slug shouldBe None + } + + "removeAccents()" should "handle the empty string" in { + HBaseCrossrefScore.removeAccents("") shouldBe "" + } + + it should "not change a string with unaccented characters" in { + HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123" + } + + it should "remove accents from Ls" in { + HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen" + } + + it should "remove accents from Es without changing case" in { + val result = HBaseCrossrefScore.removeAccents("\u00e9") + result should have length 1 + result shouldBe "e" + } + + it should "convert the ø in Soren" in { + HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren" + HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN" } // Pipeline tests |