about summary refs log tree commit diff stats
path: root/scalding/src/test/scala/sandcrawler/ScorableTest.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 112
1 files changed, 28 insertions, 84 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 40801a0..2f80492 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
class ScorableTest extends FlatSpec with Matchers {
- val JsonString = """
+ val JsonString = """
{
"title": "<<TITLE>>",
"authors": [
@@ -55,96 +55,40 @@ class ScorableTest extends FlatSpec with Matchers {
}
"""
- performUnitTests()
- performPipelineTests()
-
- def performUnitTests() {
- "titleToSlug()" should "extract the parts of titles before a colon" in {
- Scorable.titleToSlug("HELLO:there") shouldBe "hello"
- }
-
- it should "extract an entire colon-less string" in {
- Scorable.titleToSlug("hello THERE") shouldBe "hello there"
- }
-
- it should "return Scorable.NoSlug if given empty string" in {
- Scorable.titleToSlug("") shouldBe Scorable.NoSlug
- }
-
- it should "return Scorable.NoSlug if given null" in {
- Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
- }
-
- "titleToSlug()" should "strip punctuation" in {
- Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
- Scorable.titleToSlug("a:b:c") shouldBe "a"
- Scorable.titleToSlug(
- "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
- }
+ "titleToSlug()" should "extract the parts of titles before a colon" in {
+ Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+ }
- "jsonToMap()" should "return a map, given a legal JSON string" in {
- Scorable.jsonToMap(JsonString) should not be (None)
- }
+ it should "extract an entire colon-less string" in {
+ Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+ }
- it should "return None, given illegal JSON" in {
- Scorable.jsonToMap("illegal{,json{{") should be (None)
- }
+ it should "return Scorable.NoSlug if given empty string" in {
+ Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+ }
- "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
- val score = Scorable.computeSimilarity(
- new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
- score shouldBe Scorable.MaxScore
- }
+ it should "return Scorable.NoSlug if given null" in {
+ Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
}
- def performPipelineTests() {
- /*
+ "titleToSlug()" should "strip punctuation" in {
+ Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
+ Scorable.titleToSlug("a:b:c") shouldBe "a"
+ Scorable.titleToSlug(
+ "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+ }
- val output = "/tmp/testOutput"
- val input = "/tmp/testInput"
- val (testTable, testHost) = ("test-table", "dummy-host:2181")
+ "jsonToMap()" should "return a map, given a legal JSON string" in {
+ Scorable.jsonToMap(JsonString) should not be (None)
+ }
- val grobidSampleData = List(
- List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
- List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
- List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
- Bytes.toBytes(MalformedGrobidString)))
+ it should "return None, given illegal JSON" in {
+ Scorable.jsonToMap("illegal{,json{{") should be (None)
+ }
- JobTest("sandcrawler.HBaseCrossrefScoreJob")
- .arg("test", "")
- .arg("app.conf.path", "app.conf")
- .arg("output", output)
- .arg("hbase-table", testTable)
- .arg("zookeeper-hosts", testHost)
- .arg("crossref-input", input)
- .arg("debug", "true")
- .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
- grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
- .source(TextLine(input), List(
- 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
- 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
- 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
- 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
- .sink[(Int, String, String, String, String)](TypedTsv[(Int,
- String, String, String, String)](output)) {
- // Grobid titles:
- // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
- // crossref slugs:
- // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
- // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
- outputBuffer =>
- "The pipeline" should "return a 4-element list" in {
- outputBuffer should have length 4
- }
- }
- .run
- .finish
-}
- */
+ "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+ score shouldBe Scorable.MaxScore
}
}
-