change slugification behavior to not split on colon

author: Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:43:33 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:43:33 -0700
commit: 96ea0ddd06ee4a7c11c7d5def976749ab3675878 (patch)
tree: 279382cc39355475c8a93f5ca3efcfb05b26fa57
parent: 2277c2f793a007fa3a347af23fca35f4a3eafeef (diff)
download: sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.tar.gz
sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.zip
3 files changed, 25 insertions, 25 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 696b2ef..8ed3369 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
       Scorable.NoSlug
     } else {
       val unaccented = StringUtilities.removeAccents(title)
-      // Remove punctuation after splitting on colon.
-      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+      // Remove punctuation
+      val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
       if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
     }
   }
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 0acf0b8..80d92aa 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -14,7 +14,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
   }
 
   "mapToSlug()" should "extract the parts of titles before a colon" in {
-    titleToSlug("HELLO:there") shouldBe "hello"
+    titleToSlug("HELLO:there") shouldBe "hellothere"
   }
 
   it should "extract an entire colon-less string" in {
@@ -30,8 +30,8 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
   }
 
   it should "strip punctuation" in {
-    titleToSlug("HELLO!:the:re") shouldBe "hello"
-    titleToSlug("a:b:c") shouldBe "a"
+    titleToSlug("HELLO!:the:re") shouldBe "hellothere"
+    titleToSlug("a:b:c") shouldBe "abc"
     titleToSlug(
       "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
     titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -44,14 +44,14 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
   }
 
   it should "strip special characters" in {
-    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug
-    // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug
+    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+    // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
   }
 
   it should "remove whitespace" in {
-    titleToSlug("foo bar : baz ::") shouldBe "foobar"
-    titleToSlug("\na\t:b:c") shouldBe "a"
+    titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
+    titleToSlug("\na\t:b:c") shouldBe "abc"
     titleToSlug("\n \t \r  ") shouldBe Scorable.NoSlug
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 54ae801..f92ba31 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -121,7 +121,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
   val CrossrefStrings = List(
-    CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+    CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
     CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
     CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
     CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
@@ -182,24 +182,24 @@ class ScoreJobTest extends FlatSpec with Matchers {
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
       // Grobid titles and slugs (in parentheses):
       //   Title 1                       (title1)
-      //   Title 2: TNG                  (title2)
-      //   Title 3: The Sequel           (title3)
+      //   Title 2: TNG                  (title2tng)
+      //   Title 3: The Sequel           (title3thesequel)
       // crossref titles and slugs (in parentheses):
-      //   Title 1: TNG                  (title1)
-      //   Title 1: TNG 2A               (title1)
-      //   Title 1: TNG 3                (title1)
-      //   Title 2: Rebooted             (title2)
-      // Join should have 3 "title1" slugs and 1 "title2" slug
+      //   Title 2: TNG                  (title2tng)
+      //   Title 1: TNG 2A               (title1tng2a)
+      //   Title 1: TNG 3                (title1tng3)
+      //   Title 2: Rebooted             (title2rebooted)
+      // XXX: Join used to have 3 "title1" slugs; without colon-splitting only 1 "title2tng" slug matches
       outputBuffer =>
-      "The pipeline" should "return a 4-element list" in {
-        outputBuffer should have length 4
+      "The pipeline" should "return a 1-element list" in {
+        outputBuffer should have length 1
       }
 
       it should "has right # of entries with each slug" in {
         val slugs = outputBuffer.map(_._1)
         val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
-        countMap("title1") shouldBe 3
-        countMap("title2") shouldBe 1
+        // XXX: countMap("title1") shouldBe 3
+        countMap("title2tng") shouldBe 1
       }
 
       def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
@@ -215,10 +215,10 @@ class ScoreJobTest extends FlatSpec with Matchers {
       }
 
       it should "have right output values" in {
-        outputBuffer.exists(_ == bundle("title1", 0, 0))
-        outputBuffer.exists(_ == bundle("title1", 0, 2))
-        outputBuffer.exists(_ == bundle("title1", 0, 1))
-        outputBuffer.exists(_ == bundle("title2", 1, 3))
+        //outputBuffer.exists(_ == bundle("title1", 0, 0))
+        //outputBuffer.exists(_ == bundle("title1", 0, 2))
+        //outputBuffer.exists(_ == bundle("title1", 0, 1))
+        outputBuffer.exists(_ == bundle("title2tng", 1, 3))
       }
     }
     .run
author	Bryan Newbold <bnewbold@archive.org>	2018-08-15 22:43:33 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2018-08-15 22:43:33 -0700
commit	96ea0ddd06ee4a7c11c7d5def976749ab3675878 (patch)
tree	279382cc39355475c8a93f5ca3efcfb05b26fa57
parent	2277c2f793a007fa3a347af23fca35f4a3eafeef (diff)
download	sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.tar.gz sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.zip