about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-07-28 20:05:17 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-07-28 20:05:17 -0700
commit dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (patch)
tree 951ab504ce4e00ddfe79221c4ffdf1f9768f3368 /scalding
parent 304196e01e69826047e5e14af949d5efc80d1ece (diff)
download sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.tar.gz
download sandcrawler-dd0df0fe3574352011d6a0fe3c12e59b0a4b8259.zip
Added accent removal to titleToSlug().
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 28
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala 25
2 files changed, 51 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7923e09..2a569a1 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -1,7 +1,9 @@
package sandcrawler
+import java.text.Normalizer
import java.util.Arrays
import java.util.Properties
+import java.util.regex.Pattern
import scala.math
import scala.util.parsing.json.JSON
@@ -124,7 +126,7 @@ object HBaseCrossrefScore {
}
def titleToSlug(title : String) : Option[String] = {
- val slug = title.split(":")(0).toLowerCase()
+ val slug = removeAccents(title).split(":")(0).toLowerCase()
if (slug.isEmpty) {
None
} else {
@@ -172,4 +174,28 @@ object HBaseCrossrefScore {
}
}
}
+
+ // scalastyle:off
+ // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
+ // scalastyle:on
+ /**
+  * Returns `s` with accents and other combining diacritical marks removed.
+  *
+  * The string is first decomposed with Unicode NFD normalization, which
+  * splits an accented character into its base letter followed by combining
+  * marks. Letters listed in `AccentReplacements` are then swapped for their
+  * plain ASCII counterparts, and finally every run of characters in the
+  * Combining Diacritical Marks block is deleted. Case is preserved.
+  *
+  * @param s the string to process; must not be null
+  * @return the string without accents
+  */
+ def removeAccents(s : String) : String = {
+   val decomposed = Normalizer.normalize(s, Normalizer.Form.NFD)
+   // One lookup per character; characters without a replacement pass through.
+   val replaced = decomposed.map(c => AccentReplacements.getOrElse(c, c))
+   CombiningMarksPattern.matcher(replaced).replaceAll("")
+ }
+
+ // Letters that NFD normalization leaves intact (see the tests for
+ // '\u0141'/'\u0142' and '\u00f8'/'\u00d8'), so they are mapped by hand.
+ // Built once instead of on every call.
+ private val AccentReplacements : Map[Char, Char] = Map(
+   '\u0141' -> 'L',
+   '\u0142' -> 'l', // Letter ell
+   '\u00d8' -> 'O',
+   '\u00f8' -> 'o'
+ )
+
+ // Matches runs of combining marks left behind by NFD decomposition.
+ // Compiled once instead of on every call.
+ private val CombiningMarksPattern : Pattern =
+   Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
}
+
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index e4cab95..655dda1 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -162,7 +162,30 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
it should "return None if given a malformed json string" in {
val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
- slug shouldBe None
+ slug shouldBe None
+ }
+
+ "removeAccents()" should "handle the empty string" in {
+ HBaseCrossrefScore.removeAccents("") shouldBe ""
+ }
+
+ it should "not change a string with unaccented characters" in {
+ HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123"
+ }
+
+ it should "remove accents from Ls" in {
+ HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen"
+ }
+
+ it should "remove accents from Es without changing case" in {
+ val result = HBaseCrossrefScore.removeAccents("\u00e9")
+ result should have length 1
+ result shouldBe "e"
+ }
+
+ it should "convert the ø in Soren" in {
+ HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren"
+ HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"
}
// Pipeline tests