Added accent removal to titleToSlug().

author: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-28 20:05:17 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-28 20:05:17 -0700
commit: dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (patch)
tree: 951ab504ce4e00ddfe79221c4ffdf1f9768f3368 /scalding
parent: 304196e01e69826047e5e14af949d5efc80d1ece (diff)
download: sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.tar.gz
sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.zip
2 files changed, 51 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7923e09..2a569a1 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -1,7 +1,9 @@
 package sandcrawler
 
+import java.text.Normalizer
 import java.util.Arrays
 import java.util.Properties
+import java.util.regex.Pattern
 
 import scala.math
 import scala.util.parsing.json.JSON
@@ -124,7 +126,7 @@ object HBaseCrossrefScore {
   }
 
   def titleToSlug(title : String) : Option[String] = {
-    val slug = title.split(":")(0).toLowerCase()
+    val slug = removeAccents(title).split(":")(0).toLowerCase()
     if (slug.isEmpty) {
       None
     } else {
@@ -172,4 +174,28 @@ object HBaseCrossrefScore {
       }
     }
   }
+
+  // scalastyle:off
+  // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
+  // scalastyle:on
+  def removeAccents(s : String) : String = {  // Returns s with accents removed.
+    val replacements = Map(  // Stroked letters given an explicit one-for-one mapping.
+      '\u0141' -> 'L',  // Capital L with stroke
+      '\u0142' -> 'l',  // Letter ell (small l with stroke)
+      '\u00d8' -> 'O',  // Capital O with stroke
+      '\u00f8' -> 'o'  // Small o with stroke
+    )
+    val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD))
+    for (i <- 0 to sb.length - 1) {  // Swap mapped chars in place; length is unchanged.
+      for (key <- replacements.keys) {
+        if (sb(i) == key) {
+          sb.deleteCharAt(i);
+          sb.insert(i, replacements(key))  // Put the plain letter at the same index.
+        }
+      }
+    }
+    val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")  // Mark runs.
+    pattern.matcher(sb).replaceAll("").toString  // Drop every matched run.
+  }
 }
+
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index e4cab95..655dda1 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -162,7 +162,30 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 
   it should "return None if given a malformed json string" in {
     val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
-     slug shouldBe None
+    slug shouldBe None
+  }
+
+  "removeAccents()" should "handle the empty string" in {
+    HBaseCrossrefScore.removeAccents("") shouldBe ""
+  }
+
+  it should "not change a string with unaccented characters" in {
+    HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123"
+  }
+
+  it should "remove accents from Ls" in {
+    HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen"  // Stroked L, l.
+  }
+
+  it should "remove accents from Es without changing case" in {
+    val result = HBaseCrossrefScore.removeAccents("\u00e9")  // Small e with acute accent.
+    result should have length 1  // No leftover combining mark.
+    result shouldBe "e"
+  }
+
+  it should "convert the ø in Soren" in {
+    HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren"  // Lower-case stroked o.
+    HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"  // Upper-case stroked O.
   }
 
   //  Pipeline tests
author	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-28 20:05:17 -0700
committer	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-28 20:05:17 -0700
commit	dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (patch)
tree	951ab504ce4e00ddfe79221c4ffdf1f9768f3368 /scalding
parent	304196e01e69826047e5e14af949d5efc80d1ece (diff)
download	sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.tar.gz sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.zip