about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-23 17:50:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-23 17:50:23 -0700
commit 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (patch)
tree 3fd9332695067458368581aca6254a305ae1e080 /scalding
parent 2ab704a09db06ab776bd4cf59974e5f65f5e7c38 (diff)
download sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.tar.gz
sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.zip
set a minimum slug size (8 chars)
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 6
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala 6
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 9
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 25
4 files changed, 31 insertions, 15 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 0b9868a..9eb03f7 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -10,6 +10,7 @@ object ScorableFeatures {
val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
fileStream.close
+ val MinSlugLength = 8
// Static factory method
def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
@@ -38,7 +39,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
index 12e13dc..bf9343b 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -78,14 +78,14 @@ class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
"sha1:024937534094897039547e9824382943") // bad status
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2")
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
)
// bnewbold: status codes aren't strings, they are uint64
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index d742384..474f69a 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -36,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
it should "strip punctuation" in {
titleToSlug("HELLO!:the:re") shouldBe "hellothere"
- titleToSlug("a:b:c") shouldBe "abc"
+ titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
titleToSlug(
"If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -56,7 +56,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
it should "remove whitespace" in {
titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
- titleToSlug("\na\t:b:c") shouldBe "abc"
+ titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
}
+
+ it should "skip very short slugs" in {
+ titleToSlug("short") shouldBe Scorable.NoSlug
+ titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 85d141a..32fb16c 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -118,6 +118,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
"""
// scalastyle:on
val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string
+ val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -126,7 +127,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
- CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"))
+ CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
// Pipeline tests
val output = "/tmp/testOutput"
@@ -140,25 +142,27 @@ class ScoreJobTest extends FlatSpec with Matchers {
"sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
"sha1:93187A85273589347598473894839443",
"sha1:024937534094897039547e9824382943",
- "sha1:93229759932857982837892347893892")
+ "sha1:93229759932857982837892347893892",
+ "sha1:83229759932857982837892347893892")
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2"),
- // This is in both sources but too long.
- JsonString.replace("<<TITLE>>", TooLongOfTitle)
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+ // These are in both sources but have bad titles
+ JsonString.replace("<<TITLE>>", TooLongOfTitle),
+ JsonString.replace("<<TITLE>>", TooShortOfTitle)
)
// bnewbold: status codes aren't strings, they are uint64
val Ok : Long = 200
val Bad : Long = 400
- val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok)
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
.zipped
@@ -187,7 +191,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
1 -> CrossrefStrings(1),
2 -> CrossrefStrings(2),
3 -> CrossrefStrings(3),
- 4 -> CrossrefStrings(4)))
+ 4 -> CrossrefStrings(4),
+ 4 -> CrossrefStrings(5)))
.sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
// Grobid titles and slugs (in parentheses):
@@ -195,12 +200,14 @@ class ScoreJobTest extends FlatSpec with Matchers {
// Title 2: TNG (title2tng)
// Title 3: The Sequel (title3thesequel)
// <too long of a title>
+ // <too short of a title>
// crossref titles and slugs (in parentheses):
// Title 2: TNG (title2tng)
// Title 1: TNG 2A (title1tng2a)
// Title 1: TNG 3 (title1tng3)
// Title 2: Rebooted (title2rebooted)
// <too long of a title>
+ // <too short of a title>
// XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
outputBuffer =>
"The pipeline" should "return a 1-element list" in {