From b4f1acce5eccbb56291f82906d9c01534c7f1506 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Mon, 13 Aug 2018 10:27:48 -0700 Subject: Factored out ScorableFeatures. --- .../scala/sandcrawler/ScorableFeaturesTest.scala | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala new file mode 100644 index 0000000..7ec0c4d --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -0,0 +1,37 @@ +package sandcrawler + +import org.scalatest._ + +class ScorableFeaturesTest extends FlatSpec with Matchers { + private def titleToSlug(s : String) : String = { + new ScorableFeatures(title = s).toSlug + } + + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hello" + } + + it should "extract an entire colon-less string" in { + titleToSlug("hello THERE") shouldBe "hellothere" + } + + it should "return Scorable.NoSlug if given empty string" in { + titleToSlug("") shouldBe Scorable.NoSlug + } + + it should "return Scorable.NoSlug if given null" in { + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hello" + titleToSlug("a:b:c") shouldBe "a" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" + } + + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobar" + titleToSlug("\na\t:b:c") shouldBe "a" + } +} -- cgit v1.2.3 From 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 14 Aug 2018 20:38:29 -0700 Subject: Fixed style problems (or disabled warning when appropriate) for tests. 
--- scalding/build.sbt | 7 ++ .../scala/sandcrawler/CrossrefScorableTest.scala | 87 ++++++++++--------- .../scala/sandcrawler/GrobidScorableTest.scala | 7 +- .../test/scala/sandcrawler/HBaseBuilderTest.scala | 1 + .../scala/sandcrawler/HBaseMimeCountTest.scala | 9 +- .../test/scala/sandcrawler/HBaseRowCountTest.scala | 11 +-- .../scala/sandcrawler/HBaseStatusCountTest.scala | 10 ++- .../scala/sandcrawler/ScorableFeaturesTest.scala | 1 + .../src/test/scala/sandcrawler/ScorableTest.scala | 5 +- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 97 ++++++++++++---------- 10 files changed, 135 insertions(+), 100 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/build.sbt b/scalding/build.sbt index 2addd60..d477399 100644 --- a/scalding/build.sbt +++ b/scalding/build.sbt @@ -20,6 +20,13 @@ lazy val root = (project in file(".")). scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) }, + (scalastyleSources in Test) := { + // all .scala files in "src/test/scala" + val scalaSourceFiles = ((scalaSource in Test).value ** "*.scala").get + val dirNameToExclude = "/example/" + scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) + }, + name := "sandcrawler", resolvers += "conjars.org" at "http://conjars.org/repo", diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 75be03e..e171dba 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -2,72 +2,77 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import 
org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class CrossrefScorableTest extends FlatSpec with Matchers { + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : 
"1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", + "content-type" : "text/xml", "content-version" : "vor", - "intended-application" : "text-mining" }, + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ 
"Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) + val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 4b958b9..661824b 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ @@ -62,7 +65,7 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) + val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala index 603a4c7..c61cb22 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala @@ -22,6 +22,7 @@ class 
HBaseBuilderTest extends FlatSpec with Matchers { fields should have length 0 } + //scalastyle:off no.whitespace.before.left.bracket it should "throw IllegalArgumentException on malformed input" in { a [IllegalArgumentException] should be thrownBy { HBaseBuilder.parseColSpecs(List("file_size")) diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala index fde2290..d6d283f 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala index 3424a36..c4ca5aa 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import 
org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ /** @@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions { outputBuffer => it("should return the test data provided.") { - println("outputBuffer.size => " + outputBuffer.size) assert(outputBuffer.size === 1) } it("should return the correct count") { - println("raw output => " + outputBuffer) assert(outputBuffer(0).getObject(0) === 8) } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala index 8a71f31..fe3ff21 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala @@ -1,15 +1,19 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 
index 7ec0c4d..f9c30a2 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -2,6 +2,7 @@ package sandcrawler import org.scalatest._ +// scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { private def titleToSlug(s : String) : String = { new ScorableFeatures(title = s).toSlug diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index fd44f57..f63bef8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 1c6ae83..34081a5 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -2,13 +2,17 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScoreJobTest extends FlatSpec with Matchers { + //scalastyle:off val 
JsonString = """ { "title": "<<TITLE>>", @@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers { "annex": null } """ + // scalastyle:on val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File") val JsonStringWithoutTitle = JsonString.replace("title", "nottitle") val MalformedJsonString = JsonString.replace("}", "") + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + 
"created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", - "content-version" : "vor", - "intended-application" : "text-mining" }, + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : 
"http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers { 2 -> CrossrefStrings(2), 3 -> CrossrefStrings(3))) .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { - // Grobid titles and slugs (in parentheses): + // Grobid titles and slugs (in parentheses): // Title 1 (title1) // Title 2: TNG (title2) // Title 3: The Sequel (title3) @@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 1: TNG 3 (title1) // Title 2: Rebooted (title2) // Join should have 3 "title1" slugs and 1 "title2" slug - outputBuffer => + outputBuffer => "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 } @@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers { countMap("title2") shouldBe 1 } - def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = { + def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( - Sha1Strings(grobidIndex), + Sha1Strings(grobidIndex), JsonStrings(grobidIndex)) val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( CrossrefStrings(crossrefIndex)) -- cgit v1.2.3 From 419ca3dc053682d688653e9a64eaaf46018fd330 Mon Sep 17 00:00:00 2001 From: Bryan Newbold <bnewbold@archive.org> Date: Wed, 15 Aug 2018 20:22:04 -0700 Subject: scorable: test for null strings --- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 1 + scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 5 ++++- 
scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 4897b1c..ff8201a 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -34,6 +34,7 @@ object CrossrefScorable { if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { new MapFeatures(Scorable.NoSlug, json) } else { + // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 5d6dea0..966fb93 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -6,7 +6,10 @@ import scala.util.parsing.json.JSONObject // with a second ScorableFeatures). 
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { def toMap() : Map[String, Any] = { - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> (if (title == null) "" else title), + "year" -> year, + "doi" -> (if (doi == null) "" else doi), + "sha1" -> (if (sha1 == null) "" else sha1)) } override def toString() : String = { diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index f9c30a2..5ffc305 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -8,6 +8,11 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { new ScorableFeatures(title = s).toSlug } + "toMapFeatures()" should "work with gnarly inputs" in { + new ScorableFeatures(title = null).toMapFeatures + new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures + } + "mapToSlug()" should "extract the parts of titles before a colon" in { titleToSlug("HELLO:there") shouldBe "hello" } -- cgit v1.2.3 From 4ca3d5088520d219eccbc5921928c5b67d8e998a Mon Sep 17 00:00:00 2001 From: Bryan Newbold <bnewbold@archive.org> Date: Wed, 15 Aug 2018 20:23:12 -0700 Subject: scorable: test for more punctuation removal --- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 5ffc305..fd01c91 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -34,10 +34,18 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { titleToSlug("a:b:c") shouldBe "a" titleToSlug( 
"If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" + titleToSlug(":;\"\'") shouldBe Scorable.NoSlug + } + + it should "strip special characters" in { + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug + // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug + // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug } it should "remove whitespace" in { titleToSlug("foo bar : baz ::") shouldBe "foobar" titleToSlug("\na\t:b:c") shouldBe "a" + titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug } } -- cgit v1.2.3 From 3f668933d71b82555e89a3bfefe83039ff7ddbfb Mon Sep 17 00:00:00 2001 From: Bryan Newbold <bnewbold@archive.org> Date: Wed, 15 Aug 2018 22:33:09 -0700 Subject: add a stub title blacklist --- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 13 ++++++++++++- .../src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 966fb93..696b2ef 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,9 +2,20 @@ package sandcrawler import scala.util.parsing.json.JSONObject + // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). 
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + + val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", + "article", "authorreply", "authorsreply", "bookreview", "bookreviews", + "casereport", "commentary", "commentaryon", "commenton", "commentto", + "contents", "correspondence", "dedication", "editorialadvisoryboard", + "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", + "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", + "references", "results", "review", "reviewarticle", "summary", "title", + "name") + def toMap() : Map[String, Any] = { Map("title" -> (if (title == null) "" else title), "year" -> year, @@ -23,7 +34,7 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation after splitting on colon. val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index fd01c91..0acf0b8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { titleToSlug(":;\"\'") shouldBe Scorable.NoSlug } + it should "filter stub titles" in { + titleToSlug("abstract") shouldBe Scorable.NoSlug + titleToSlug("title!") shouldBe Scorable.NoSlug + titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + } + it should "strip special characters" in { titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe 
Scorable.NoSlug // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug -- cgit v1.2.3 From 96ea0ddd06ee4a7c11c7d5def976749ab3675878 Mon Sep 17 00:00:00 2001 From: Bryan Newbold <bnewbold@archive.org> Date: Wed, 15 Aug 2018 22:43:33 -0700 Subject: change slugification behavior to not split on colon --- .../main/scala/sandcrawler/ScorableFeatures.scala | 4 +-- .../scala/sandcrawler/ScorableFeaturesTest.scala | 14 +++++----- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 32 +++++++++++----------- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 696b2ef..8ed3369 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S Scorable.NoSlug } else { val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. 
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + // Remove punctuation + val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 0acf0b8..80d92aa 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -14,7 +14,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hello" + titleToSlug("HELLO:there") shouldBe "hellothere" } it should "extract an entire colon-less string" in { @@ -30,8 +30,8 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should "strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hello" - titleToSlug("a:b:c") shouldBe "a" + titleToSlug("HELLO!:the:re") shouldBe "hellothere" + titleToSlug("a:b:c") shouldBe "abc" titleToSlug( "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" titleToSlug(":;\"\'") shouldBe Scorable.NoSlug @@ -44,14 +44,14 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug - // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug + // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug } it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobar" - titleToSlug("\na\t:b:c") shouldBe "a" + titleToSlug("foo bar : 
baz ::") shouldBe "foobarbaz" + titleToSlug("\na\t:b:c") shouldBe "abc" titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 54ae801..f92ba31 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -121,7 +121,7 @@ class ScoreJobTest extends FlatSpec with Matchers { val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") val CrossrefStrings = List( - CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")) @@ -182,24 +182,24 @@ class ScoreJobTest extends FlatSpec with Matchers { .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles and slugs (in parentheses): // Title 1 (title1) - // Title 2: TNG (title2) - // Title 3: The Sequel (title3) + // Title 2: TNG (title2tng) + // Title 3: The Sequel (title3thesequel) // crossref titles and slugs (in parentheses): - // Title 1: TNG (title1) - // Title 1: TNG 2A (title1) - // Title 1: TNG 3 (title1) - // Title 2: Rebooted (title2) - // Join should have 3 "title1" slugs and 1 "title2" slug + // Title 2: TNG (title2tng) + // Title 1: TNG 2A (title1tng2a) + // Title 1: TNG 3 (title1tng3) + // Title 2: Rebooted (title2rebooted) + // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug outputBuffer => - "The pipeline" should "return a 4-element list" in { - outputBuffer should have length 4 + "The pipeline" should "return a 
1-element list" in { + outputBuffer should have length 1 } it should "has right # of entries with each slug" in { val slugs = outputBuffer.map(_._1) val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size) - countMap("title1") shouldBe 3 - countMap("title2") shouldBe 1 + // XXX: countMap("title1") shouldBe 3 + countMap("title2tng") shouldBe 1 } def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { @@ -215,10 +215,10 @@ class ScoreJobTest extends FlatSpec with Matchers { } it should "have right output values" in { - outputBuffer.exists(_ == bundle("title1", 0, 0)) - outputBuffer.exists(_ == bundle("title1", 0, 2)) - outputBuffer.exists(_ == bundle("title1", 0, 1)) - outputBuffer.exists(_ == bundle("title2", 1, 3)) + //outputBuffer.exists(_ == bundle("title1", 0, 0)) + //outputBuffer.exists(_ == bundle("title1", 0, 2)) + //outputBuffer.exists(_ == bundle("title1", 0, 1)) + outputBuffer.exists(_ == bundle("title2tng", 1, 3)) } } .run -- cgit v1.2.3