diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 17:50:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 17:50:23 -0700 |
commit | 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (patch) | |
tree | 3fd9332695067458368581aca6254a305ae1e080 | |
parent | 2ab704a09db06ab776bd4cf59974e5f65f5e7c38 (diff) | |
download | sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.tar.gz sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.zip |
set a minimum slug size (8 chars)
4 files changed, 31 insertions, 15 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 0b9868a..9eb03f7 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -10,6 +10,7 @@ object ScorableFeatures { val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet fileStream.close + val MinSlugLength = 8 // Static factory method def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { @@ -38,7 +39,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty + || slug == null + || (ScorableFeatures.SlugBlacklist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala index 12e13dc..bf9343b 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala @@ -78,14 +78,14 @@ class GrobidScorableDumpJobTest extends FlatSpec with Matchers { "sha1:024937534094897039547e9824382943") // bad status val JsonStrings : List[String] = List( - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Classic"), JsonString.replace("<<TITLE>>", "Title 2: TNG"), JsonString.replace("<<TITLE>>", "Title 3: The Sequel"), // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Classic"), MalformedJsonString, // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 2") + JsonString.replace("<<TITLE>>", "Title 2: Not TNG") ) // bnewbold: status codes aren't strings, they are uint64 diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index d742384..474f69a 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -36,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { it should "strip punctuation" in { titleToSlug("HELLO!:the:re") shouldBe "hellothere" - titleToSlug("a:b:c") shouldBe "abc" + titleToSlug("a:b:cdefgh") shouldBe "abcdefgh" titleToSlug( "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" titleToSlug(":;\"\'") shouldBe Scorable.NoSlug @@ -56,7 +56,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { it should "remove whitespace" in { titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" - titleToSlug("\na\t:b:c") shouldBe "abc" + titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi" titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug } + + it should "skip very short slugs" in { + titleToSlug("short") shouldBe Scorable.NoSlug + titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle" + } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 85d141a..32fb16c 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -118,6 +118,7 @@ class ScoreJobTest extends FlatSpec with Matchers { """ // scalastyle:on val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string + val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1) val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -126,7 +127,8 @@ class ScoreJobTest extends FlatSpec with Matchers { CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"), - CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1")) + CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"), + CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1")) // Pipeline tests val output = "/tmp/testOutput" @@ -140,25 +142,27 @@ class ScoreJobTest extends FlatSpec with Matchers { "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "sha1:93187A85273589347598473894839443", "sha1:024937534094897039547e9824382943", - "sha1:93229759932857982837892347893892") + "sha1:93229759932857982837892347893892", + "sha1:83229759932857982837892347893892") val JsonStrings : List[String] = List( - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Original"), JsonString.replace("<<TITLE>>", "Title 2: TNG"), JsonString.replace("<<TITLE>>", "Title 3: The Sequel"), // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Original"), MalformedJsonString, // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 2"), - // This is in both sources but too long. - JsonString.replace("<<TITLE>>", TooLongOfTitle) + JsonString.replace("<<TITLE>>", "Title 2: Not TNG"), + // These are in both sources but have bad titles + JsonString.replace("<<TITLE>>", TooLongOfTitle), + JsonString.replace("<<TITLE>>", TooShortOfTitle) ) // bnewbold: status codes aren't strings, they are uint64 val Ok : Long = 200 val Bad : Long = 400 - val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok) + val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok) val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes) .zipped @@ -187,7 +191,8 @@ class ScoreJobTest extends FlatSpec with Matchers { 1 -> CrossrefStrings(1), 2 -> CrossrefStrings(2), 3 -> CrossrefStrings(3), - 4 -> CrossrefStrings(4))) + 4 -> CrossrefStrings(4), + 4 -> CrossrefStrings(5))) .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () } .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles and slugs (in parentheses): @@ -195,12 +200,14 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 2: TNG (title2tng) // Title 3: The Sequel (title3thesequel) // <too long of a title> + // <too short of a title> // crossref titles and slugs (in parentheses): // Title 2: TNG (title2tng) // Title 1: TNG 2A (title1tng2a) // Title 1: TNG 3 (title1tng3) // Title 2: Rebooted (title2rebooted) // <too long of a title> + // <too short of a title> // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug outputBuffer => "The pipeline" should "return a 1-element list" in { |