From 96ea0ddd06ee4a7c11c7d5def976749ab3675878 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Aug 2018 22:43:33 -0700 Subject: change slugification behavior to not split on colon --- .../main/scala/sandcrawler/ScorableFeatures.scala | 4 +-- .../scala/sandcrawler/ScorableFeaturesTest.scala | 14 +++++----- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 32 +++++++++++----------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 696b2ef..8ed3369 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S Scorable.NoSlug } else { val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + // Remove punctuation + val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 0acf0b8..80d92aa 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -14,7 +14,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hello" + titleToSlug("HELLO:there") shouldBe "hellothere" } it should "extract an entire colon-less string" in { @@ -30,8 +30,8 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should 
"strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hello" - titleToSlug("a:b:c") shouldBe "a" + titleToSlug("HELLO!:the:re") shouldBe "hellothere" + titleToSlug("a:b:c") shouldBe "abc" titleToSlug( "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" titleToSlug(":;\"\'") shouldBe Scorable.NoSlug @@ -44,14 +44,14 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug - // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug + // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug } it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobar" - titleToSlug("\na\t:b:c") shouldBe "a" + titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" + titleToSlug("\na\t:b:c") shouldBe "abc" titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 54ae801..f92ba31 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -121,7 +121,7 @@ class ScoreJobTest extends FlatSpec with Matchers { val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") val CrossrefStrings = List( - CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), CrossrefString.replace("<<TITLE>>", "Title 2: 
Rebooted").replace("<<DOI>>", "DOI-1")) @@ -182,24 +182,24 @@ class ScoreJobTest extends FlatSpec with Matchers { .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles and slugs (in parentheses): // Title 1 (title1) - // Title 2: TNG (title2) - // Title 3: The Sequel (title3) + // Title 2: TNG (title2tng) + // Title 3: The Sequel (title3thesequel) // crossref titles and slugs (in parentheses): - // Title 1: TNG (title1) - // Title 1: TNG 2A (title1) - // Title 1: TNG 3 (title1) - // Title 2: Rebooted (title2) - // Join should have 3 "title1" slugs and 1 "title2" slug + // Title 2: TNG (title2tng) + // Title 1: TNG 2A (title1tng2a) + // Title 1: TNG 3 (title1tng3) + // Title 2: Rebooted (title2rebooted) + // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug outputBuffer => - "The pipeline" should "return a 4-element list" in { - outputBuffer should have length 4 + "The pipeline" should "return a 1-element list" in { + outputBuffer should have length 1 } it should "has right # of entries with each slug" in { val slugs = outputBuffer.map(_._1) val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size) - countMap("title1") shouldBe 3 - countMap("title2") shouldBe 1 + // XXX: countMap("title1") shouldBe 3 + countMap("title2tng") shouldBe 1 } def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { @@ -215,10 +215,10 @@ class ScoreJobTest extends FlatSpec with Matchers { } it should "have right output values" in { - outputBuffer.exists(_ == bundle("title1", 0, 0)) - outputBuffer.exists(_ == bundle("title1", 0, 2)) - outputBuffer.exists(_ == bundle("title1", 0, 1)) - outputBuffer.exists(_ == bundle("title2", 1, 3)) + //outputBuffer.exists(_ == bundle("title1", 0, 0)) + //outputBuffer.exists(_ == bundle("title1", 0, 2)) + //outputBuffer.exists(_ == bundle("title1", 0, 1)) + outputBuffer.exists(_ == bundle("title2tng", 1, 3)) } } .run -- cgit v1.2.3