aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-08-15 22:43:33 -0700
committerBryan Newbold <bnewbold@archive.org>2018-08-15 22:43:33 -0700
commit96ea0ddd06ee4a7c11c7d5def976749ab3675878 (patch)
tree279382cc39355475c8a93f5ca3efcfb05b26fa57
parent2277c2f793a007fa3a347af23fca35f4a3eafeef (diff)
downloadsandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.tar.gz
sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.zip
change slugification behavior to not split on colon
-rw-r--r--scalding/src/main/scala/sandcrawler/ScorableFeatures.scala4
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala14
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala32
3 files changed, 25 insertions, 25 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 696b2ef..8ed3369 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
Scorable.NoSlug
} else {
val unaccented = StringUtilities.removeAccents(title)
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+ // Remove punctuation
+ val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 0acf0b8..80d92aa 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -14,7 +14,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
"mapToSlug()" should "extract the parts of titles before a colon" in {
- titleToSlug("HELLO:there") shouldBe "hello"
+ titleToSlug("HELLO:there") shouldBe "hellothere"
}
it should "extract an entire colon-less string" in {
@@ -30,8 +30,8 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip punctuation" in {
- titleToSlug("HELLO!:the:re") shouldBe "hello"
- titleToSlug("a:b:c") shouldBe "a"
+ titleToSlug("HELLO!:the:re") shouldBe "hellothere"
+ titleToSlug("a:b:c") shouldBe "abc"
titleToSlug(
"If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -44,14 +44,14 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug
- // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+ // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
}
it should "remove whitespace" in {
- titleToSlug("foo bar : baz ::") shouldBe "foobar"
- titleToSlug("\na\t:b:c") shouldBe "a"
+ titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
+ titleToSlug("\na\t:b:c") shouldBe "abc"
titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 54ae801..f92ba31 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -121,7 +121,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
val CrossrefStrings = List(
- CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+ CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
@@ -182,24 +182,24 @@ class ScoreJobTest extends FlatSpec with Matchers {
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
// Grobid titles and slugs (in parentheses):
// Title 1 (title1)
- // Title 2: TNG (title2)
- // Title 3: The Sequel (title3)
+ // Title 2: TNG (title2tng)
+ // Title 3: The Sequel (title3thesequel)
// crossref titles and slugs (in parentheses):
- // Title 1: TNG (title1)
- // Title 1: TNG 2A (title1)
- // Title 1: TNG 3 (title1)
- // Title 2: Rebooted (title2)
- // Join should have 3 "title1" slugs and 1 "title2" slug
+ // Title 2: TNG (title2tng)
+ // Title 1: TNG 2A (title1tng2a)
+ // Title 1: TNG 3 (title1tng3)
+ // Title 2: Rebooted (title2rebooted)
+ // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
outputBuffer =>
- "The pipeline" should "return a 4-element list" in {
- outputBuffer should have length 4
+ "The pipeline" should "return a 1-element list" in {
+ outputBuffer should have length 1
}
it should "has right # of entries with each slug" in {
val slugs = outputBuffer.map(_._1)
val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
- countMap("title1") shouldBe 3
- countMap("title2") shouldBe 1
+ // XXX: countMap("title1") shouldBe 3
+ countMap("title2tng") shouldBe 1
}
def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
@@ -215,10 +215,10 @@ class ScoreJobTest extends FlatSpec with Matchers {
}
it should "have right output values" in {
- outputBuffer.exists(_ == bundle("title1", 0, 0))
- outputBuffer.exists(_ == bundle("title1", 0, 2))
- outputBuffer.exists(_ == bundle("title1", 0, 1))
- outputBuffer.exists(_ == bundle("title2", 1, 3))
+ //outputBuffer.exists(_ == bundle("title1", 0, 0))
+ //outputBuffer.exists(_ == bundle("title1", 0, 2))
+ //outputBuffer.exists(_ == bundle("title1", 0, 1))
+ outputBuffer.exists(_ == bundle("title2tng", 1, 3))
}
}
.run