diff options
Diffstat (limited to 'scalding')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 28 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 25 | 
2 files changed, 51 insertions, 2 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 7923e09..2a569a1 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -1,7 +1,9 @@  package sandcrawler +import java.text.Normalizer  import java.util.Arrays  import java.util.Properties +import java.util.regex.Pattern  import scala.math  import scala.util.parsing.json.JSON @@ -124,7 +126,7 @@ object HBaseCrossrefScore {    }    def titleToSlug(title : String) : Option[String] = { -    val slug = title.split(":")(0).toLowerCase() +    val slug = removeAccents(title).split(":")(0).toLowerCase()      if (slug.isEmpty) {        None      } else { @@ -172,4 +174,28 @@ object HBaseCrossrefScore {        }      }    } + +  // scalastyle:off +  // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934 +  // scalastyle:on +  def removeAccents(s : String) : String = { +    val replacements = Map( +      '\u0141' -> 'L', +      '\u0142' -> 'l',  // Letter ell +      '\u00d8' -> 'O', +      '\u00f8' -> 'o' +    ) +    val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD)) +    for (i <- 0 to sb.length - 1) { +      for (key <- replacements.keys) { +        if (sb(i) == key) { +          sb.deleteCharAt(i); +          sb.insert(i, replacements(key)) +        } +      } +    } +    val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+") +    pattern.matcher(sb).replaceAll("").toString +  }  } + diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index e4cab95..655dda1 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -162,7 +162,30 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {    it should "return None if given a malformed json string" in {      val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString) -     slug shouldBe None +    slug shouldBe None +  } + +  "removeAccents()" should "handle the empty string" in { +    HBaseCrossrefScore.removeAccents("") shouldBe "" +  } + +  it should "not change a string with unaccented characters" in { +    HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123" +  } + +  it should "remove accents from Ls" in { +    HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen" +  } + +  it should "remove accents from Es without changing case" in { +    val result = HBaseCrossrefScore.removeAccents("\u00e9") +    result should have length 1 +    result shouldBe "e" +  } + +  it should "convert the ø in Soren" in { +    HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren" +    HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"    }    //  Pipeline tests | 
