diff options
Diffstat (limited to 'scalding')
-rw-r--r-- | scalding/scalastyle-config.xml | 2 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 57 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 49 |
3 files changed, 85 insertions, 23 deletions
diff --git a/scalding/scalastyle-config.xml b/scalding/scalastyle-config.xml index 86d8fca..47d0feb 100644 --- a/scalding/scalastyle-config.xml +++ b/scalding/scalastyle-config.xml @@ -35,7 +35,7 @@ <check level="warning" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check> <check level="warning" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check> <check level="warning" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check> - <check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="true"> + <check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="false"> <parameters> <parameter name="maxLineLength"><![CDATA[160]]></parameter> <parameter name="tabSize"><![CDATA[4]]></parameter> diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 2a569a1..01d852e 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -76,7 +76,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv ((slug0: String, sha1 : String, grobidJson : String), (slug1 : String, crossrefJson : String))) = entry HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)} - // Output: score, sha1, doi, grobid title, crossref title + // Output: score, sha1, doi, grobid title, crossref title .write(TypedTsv[(Int, String, String, String, String)](args("output"))) } @@ -134,22 +134,7 @@ object HBaseCrossrefScore { } } - val FullTitleMatch = 100 - val TitleLeftMatchBase = 50 - val MaxTitleLeftMatch = 80 - val SlugMatch = 25 - - def computeSimilarity(gTitle : String, cTitle : String) : Int = { - assert(titleToSlug(gTitle) == titleToSlug(cTitle)) - if (gTitle == cTitle) { - FullTitleMatch - } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) { - math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length), - MaxTitleLeftMatch) - } else { - SlugMatch - } - } + val MaxScore = 1000 def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) : // (score, sha1, doi, grobidTitle, crossrefTitle) @@ -164,7 +149,7 @@ object HBaseCrossrefScore { case Some(crossref) => { val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase() - (computeSimilarity(grobidTitle, crossrefTitle), + (similarity(removeAccents(grobidTitle), removeAccents(crossrefTitle)), sha1, crossref("DOI").asInstanceOf[String], "'" + grobidTitle + "'", @@ -175,9 +160,7 @@ object HBaseCrossrefScore { } } - // scalastyle:off // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934 - // scalastyle:on def removeAccents(s : String) : String = { val replacements = Map( '\u0141' -> 'L', @@ -195,7 +178,39 @@ object HBaseCrossrefScore { } } val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+") - pattern.matcher(sb).replaceAll("").toString + pattern.matcher(sb).replaceAll("") + } + + // Adapted from: https://stackoverflow.com/a/16018452/631051 + def similarity(s1 : String, s2 : String) : Int = { + val longer : String = if (s1.length > s2.length) s1 else s2 + val shorter : String = if (s1.length > s2.length) s2 else s1 + if (longer.length == 0) { + // Both strings are empty. + MaxScore + } else { + (longer.length - stringDistance(longer, shorter)) * MaxScore / longer.length + } + } + + // Source: // https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ + def stringDistance(s1: String, s2: String): Int = { + val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]() + def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c) + def sd(s1: List[Char], s2: List[Char]): Int = { + if (!memo.contains((s1, s2))) { + memo((s1,s2)) = (s1, s2) match { + case (_, Nil) => s1.length + case (Nil, _) => s2.length + case (c1::t1, c2::t2) => + min( sd(t1,s2) + 1, sd(s1,t2) + 1, + sd(t1,t2) + (if (c1==c2) 0 else 1) ) + } + } + memo((s1,s2)) + } + + sd( s1.toList, s2.toList ) } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index 655dda1..e6ff4a8 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -188,6 +188,53 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN" } + // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ + "stringDistance" should "work on empty strings" in { + HBaseCrossrefScore.stringDistance("", "") shouldBe 0 + HBaseCrossrefScore.stringDistance("a", "") shouldBe 1 + HBaseCrossrefScore.stringDistance("", "a") shouldBe 1 + HBaseCrossrefScore.stringDistance("abc", "") shouldBe 3 + HBaseCrossrefScore.stringDistance("", "abc") shouldBe 3 + } + + it should "work on equal strings" in { + HBaseCrossrefScore.stringDistance("", "") shouldBe 0 + HBaseCrossrefScore.stringDistance("a", "a") shouldBe 0 + HBaseCrossrefScore.stringDistance("abc", "abc") shouldBe 0 + } + + it should "work where only inserts are needed" in { + HBaseCrossrefScore.stringDistance("", "a") shouldBe 1 + HBaseCrossrefScore.stringDistance("a", "ab") shouldBe 1 + HBaseCrossrefScore.stringDistance("b", "ab") shouldBe 1 + HBaseCrossrefScore.stringDistance("ac", "abc") shouldBe 1 + HBaseCrossrefScore.stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6 + } + + it should "work where only deletes are needed" in { + HBaseCrossrefScore.stringDistance( "a", "") shouldBe 1 + HBaseCrossrefScore.stringDistance( "ab", "a") shouldBe 1 + HBaseCrossrefScore.stringDistance( "ab", "b") shouldBe 1 + HBaseCrossrefScore.stringDistance("abc", "ac") shouldBe 1 + HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6 + } + + it should "work where only substitutions are needed" in { + HBaseCrossrefScore.stringDistance( "a", "b") shouldBe 1 + HBaseCrossrefScore.stringDistance( "ab", "ac") shouldBe 1 + HBaseCrossrefScore.stringDistance( "ac", "bc") shouldBe 1 + HBaseCrossrefScore.stringDistance("abc", "axc") shouldBe 1 + HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6 + } + + it should "work where many operations are needed" in { + HBaseCrossrefScore.stringDistance("example", "samples") shouldBe 3 + HBaseCrossrefScore.stringDistance("sturgeon", "urgently") shouldBe 6 + HBaseCrossrefScore.stringDistance("levenshtein", "frankenstein") shouldBe 6 + HBaseCrossrefScore.stringDistance("distance", "difference") shouldBe 5 + HBaseCrossrefScore.stringDistance("java was neat", "scala is great") shouldBe 7 + } + // Pipeline tests val output = "/tmp/testOutput" @@ -227,7 +274,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" // Join should have 3 "Title 1" slugs and 1 "Title 2" slug outputBuffer => - it should "return a 4-element list" in { + "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 } |