aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/ScorableTest.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableTest.scala111
1 files changed, 69 insertions, 42 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 9437fe6..8445073 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
class ScorableTest extends FlatSpec with Matchers {
- val JsonString = """
+ val JsonString = """
{
"title": "<<TITLE>>",
"authors": [
@@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
- val MalformedJsonString = JsonString.replace("}", "")
- "titleToSlug()" should "extract the parts of titles before a colon" in {
- Scorable.titleToSlug("HELLO:there") shouldBe "hello"
- }
+ performUnitTests()
+ performPipelineTests()
- it should "extract an entire colon-less string" in {
- Scorable.titleToSlug("hello THERE") shouldBe "hello there"
- }
+ def performUnitTests() {
+ "titleToSlug()" should "extract the parts of titles before a colon" in {
+ Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+ }
- it should "return Scorable.NoSlug if given empty string" in {
- Scorable.titleToSlug("") shouldBe Scorable.NoSlug
- }
+ it should "extract an entire colon-less string" in {
+ Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+ }
- "jsonToMap()" should "return a map, given a legal JSON string" in {
- Scorable.jsonToMap(JsonString) should not be (None)
- }
+ it should "return Scorable.NoSlug if given empty string" in {
+ Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+ }
- it should "return None, given illegal JSON" in {
- Scorable.jsonToMap("illegal{,json{{") should be (None)
- }
+ "jsonToMap()" should "return a map, given a legal JSON string" in {
+ Scorable.jsonToMap(JsonString) should not be (None)
+ }
- "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
- val score = Scorable.computeSimilarity(
- new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
- score shouldBe Scorable.MaxScore
- }
+ it should "return None, given illegal JSON" in {
+ Scorable.jsonToMap("illegal{,json{{") should be (None)
+ }
- /*
- it should "return None if given a malformed json string" in {
- val slug = Scorable.grobidToSlug(MalformedGrobidString)
- slug shouldBe None
+ "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+ score shouldBe Scorable.MaxScore
+ }
}
- it should "return None if given an empty json string" in {
- val slug = Scorable.grobidToSlug("")
- slug shouldBe None
- }
+ def performPipelineTests() {
+ /*
- "crossrefToSlug()" should "get the right slug for a crossref json string" in {
- val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
- slug should contain ("sometitle")
- }
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
- it should "return None if given json string without title" in {
- val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
- slug shouldBe None
- }
+ val grobidSampleData = List(
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+ List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
+ Bytes.toBytes(MalformedGrobidString)))
- it should "return None if given a malformed json string" in {
- val slug = Scorable.grobidToSlug(MalformedCrossrefString)
- slug shouldBe None
+ JobTest("sandcrawler.HBaseCrossrefScoreJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("crossref-input", input)
+ .arg("debug", "true")
+ .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
+ grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source(TextLine(input), List(
+ 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+ 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+ 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+ 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+ .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+ String, String, String, String)](output)) {
+ // Grobid titles:
+ // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+ // crossref slugs:
+ // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
+ // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+ outputBuffer =>
+ "The pipeline" should "return a 4-element list" in {
+ outputBuffer should have length 4
+ }
+ }
+ .run
+ .finish
+}
+ */
}
- */
}