Changed scoring, including adding code to compute string differences. Turned off line length checking.

New scores: ['(583,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0,'title 1','title 1: tng')'] ['(500,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0.5,'title 1','title 1: tng 2')'] ['(500,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0.75,'title 1','title 1: tng 3')'] ['(588,sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU,DOI-1,'title 2: tng','title 2: rebooted')']
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-30 11:55:19 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-30 11:55:19 -0700
commit: 81dbd0e05653682dccb8bc74b39067b4ee7ac1f2 (patch)
tree: 657118763cc81f1f1ae5538bed1b18c8d82f8f6f /scalding/src
parent: dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (diff)
download: sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.tar.gz
sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.zip
2 files changed, 84 insertions, 22 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 2a569a1..01d852e 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -76,7 +76,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
       ((slug0: String, sha1 : String, grobidJson : String),
         (slug1 : String, crossrefJson : String))) = entry
     HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
-  // Output: score, sha1, doi, grobid title, crossref title
+    // Output: score, sha1, doi, grobid title, crossref title
     .write(TypedTsv[(Int, String, String, String, String)](args("output")))
 
 }
@@ -134,22 +134,7 @@ object HBaseCrossrefScore {
     }
   }
 
-  val FullTitleMatch = 100
-  val TitleLeftMatchBase = 50
-  val MaxTitleLeftMatch = 80
-  val SlugMatch = 25
-
-  def computeSimilarity(gTitle : String, cTitle : String) : Int = {
-    assert(titleToSlug(gTitle) == titleToSlug(cTitle))
-    if (gTitle == cTitle) {
-      FullTitleMatch
-    } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) {
-      math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length),
-        MaxTitleLeftMatch)
-    } else {
-      SlugMatch
-    }
-  }
+  val MaxScore = 1000
 
   def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
     // (score, sha1, doi, grobidTitle, crossrefTitle)
@@ -164,7 +149,7 @@ object HBaseCrossrefScore {
           case Some(crossref) => {
             val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
 
-            (computeSimilarity(grobidTitle, crossrefTitle),
+            (similarity(removeAccents(grobidTitle), removeAccents(crossrefTitle)),
               sha1,
               crossref("DOI").asInstanceOf[String],
               "'" + grobidTitle + "'",
@@ -175,9 +160,7 @@ object HBaseCrossrefScore {
     }
   }
 
-  // scalastyle:off
   // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
-  // scalastyle:on
   def removeAccents(s : String) : String = {
     val replacements = Map(
       '\u0141' -> 'L',
@@ -195,7 +178,39 @@ object HBaseCrossrefScore {
       }
     }
     val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
-    pattern.matcher(sb).replaceAll("").toString
+    pattern.matcher(sb).replaceAll("")
+  }
+
+  // Adapted from: https://stackoverflow.com/a/16018452/631051
+  def similarity(s1 : String, s2 : String) : Int = {
+    val longer : String = if (s1.length > s2.length) s1 else s2
+    val shorter : String = if (s1.length > s2.length) s2 else s1
+    if (longer.length == 0) {
+      // Both strings are empty.
+      MaxScore
+    } else {
+      (longer.length - stringDistance(longer, shorter)) * MaxScore / longer.length
+    }
+  }
+
+  // Source: https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+  def stringDistance(s1: String, s2: String): Int = {
+    val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
+    def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c)
+    def sd(s1: List[Char], s2: List[Char]): Int = {
+      if (!memo.contains((s1, s2))) {
+        memo((s1,s2)) = (s1, s2) match {
+          case (_, Nil) => s1.length
+          case (Nil, _) => s2.length
+          case (c1::t1, c2::t2)  =>
+            min( sd(t1,s2) + 1, sd(s1,t2) + 1,
+              sd(t1,t2) + (if (c1==c2) 0 else 1) )
+        }
+      }
+      memo((s1,s2))
+    }
+
+    sd( s1.toList, s2.toList )
   }
 }
 
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 655dda1..e6ff4a8 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -188,6 +188,53 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
     HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"
   }
 
+  // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+  "stringDistance" should "work on empty strings" in {
+    HBaseCrossrefScore.stringDistance("", "") shouldBe 0
+    HBaseCrossrefScore.stringDistance("a", "") shouldBe 1
+    HBaseCrossrefScore.stringDistance("", "a") shouldBe 1
+    HBaseCrossrefScore.stringDistance("abc", "") shouldBe 3
+    HBaseCrossrefScore.stringDistance("", "abc") shouldBe 3
+  }
+
+  it should "work on equal strings" in {
+    HBaseCrossrefScore.stringDistance("", "") shouldBe 0
+    HBaseCrossrefScore.stringDistance("a", "a") shouldBe 0
+    HBaseCrossrefScore.stringDistance("abc", "abc") shouldBe 0
+  }
+
+  it should "work where only inserts are needed" in {
+    HBaseCrossrefScore.stringDistance("", "a") shouldBe 1
+    HBaseCrossrefScore.stringDistance("a", "ab") shouldBe 1
+    HBaseCrossrefScore.stringDistance("b", "ab") shouldBe 1
+    HBaseCrossrefScore.stringDistance("ac", "abc") shouldBe 1
+    HBaseCrossrefScore.stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6
+  }
+
+  it should "work where only deletes are needed" in {
+    HBaseCrossrefScore.stringDistance( "a", "") shouldBe 1
+    HBaseCrossrefScore.stringDistance( "ab", "a") shouldBe 1
+    HBaseCrossrefScore.stringDistance( "ab", "b") shouldBe 1
+    HBaseCrossrefScore.stringDistance("abc", "ac") shouldBe 1
+    HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6
+  }
+
+  it should "work where only substitutions are needed" in {
+    HBaseCrossrefScore.stringDistance(  "a",   "b") shouldBe 1
+    HBaseCrossrefScore.stringDistance( "ab",  "ac") shouldBe 1
+    HBaseCrossrefScore.stringDistance( "ac",  "bc") shouldBe 1
+    HBaseCrossrefScore.stringDistance("abc", "axc") shouldBe 1
+    HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6
+  }
+
+  it should "work where many operations are needed" in {
+    HBaseCrossrefScore.stringDistance("example", "samples") shouldBe 3
+    HBaseCrossrefScore.stringDistance("sturgeon", "urgently") shouldBe 6
+    HBaseCrossrefScore.stringDistance("levenshtein", "frankenstein") shouldBe 6
+    HBaseCrossrefScore.stringDistance("distance", "difference") shouldBe 5
+    HBaseCrossrefScore.stringDistance("java was neat", "scala is great") shouldBe 7
+  }
+
   //  Pipeline tests
 
   val output = "/tmp/testOutput"
@@ -227,7 +274,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
       //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
       // Join should have 3 "Title  1" slugs and 1 "Title 2" slug
       outputBuffer =>
-      it should "return a 4-element list" in {
+      "The pipeline" should "return a 4-element list" in {
         outputBuffer should have length 4
       }
author	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-30 11:55:19 -0700
committer	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-30 11:55:19 -0700
commit	81dbd0e05653682dccb8bc74b39067b4ee7ac1f2 (patch)
tree	657118763cc81f1f1ae5538bed1b18c8d82f8f6f /scalding/src
parent	dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (diff)
download	sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.tar.gz sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.zip