diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 16:15:42 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 16:15:42 -0700 |
commit | a950d5d5c61fb77b2ba83703ef853ef951ac94af (patch) | |
tree | 38b34c904ad9d21c0d51ab6401d485e71ce4fcf3 /scalding/src/test | |
parent | 07edf1ccad9c3268324926471dd0c8a7433f0c08 (diff) | |
download | sandcrawler-a950d5d5c61fb77b2ba83703ef853ef951ac94af.tar.gz sandcrawler-a950d5d5c61fb77b2ba83703ef853ef951ac94af.zip |
WIP. I'm having problems converting between ImmutableBytesWritable and String.
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 49 |
1 files changed, 44 insertions, 5 deletions
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index a59b278..f52c5b4 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -1,13 +1,17 @@ package sandcrawler import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode -class HBaseCrossrefScoreTest extends FlatSpec with Matchers { +class HBaseCrossrefScoreTest extends FunSpec with TupleConversions { val GrobidString = """ { - "title": "Dummy Example File", + "title": "<<TITLE>>", "authors": [ {"name": "Brewster Kahle"}, {"name": "J Doe"} @@ -50,6 +54,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "annex": null } """ + val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") val MalformedGrobidString = GrobidString.replace("}", "") @@ -69,7 +74,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "delay-in-days" : 0, "content-version" : "tdm" }], "content-domain" : { "domain" : [], "crossmark-restriction" : false }, "published-print" : { "date-parts" : [ [ 1996 ] ] }, - "DOI" : "10.1016/0987-7983(96)87729-2", + "DOI" : "<<DOI>>", "type" : "journal-article", "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], "date-time" : "2002-07-25T15:09:41Z", @@ -77,7 +82,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "page" : "186-187", "source" : "Crossref", "is-referenced-by-count" : 0, - "title" : [ "les ferments lactiques: classification, propriétés, utilisations agroalimentaires" ], + "title" : [ "<<TITLE>>" ], "prefix" : "10.1016", "volume" : "9", "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], @@ -105,9 +110,10 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") - +/* "titleToSlug()" should "extract the parts of titles before a colon" in { val slug = HBaseCrossrefScore.titleToSlug("HELLO:there") slug should contain ("hello") @@ -147,4 +153,37 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString) slug shouldBe None } + */ + + val output = "/tmp/testOutput" + val input = "/tmp/testInput" + val (testTable, testHost) = ("test-table", "dummy-host:2181") + + val grobidSampleData = List( + List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))), + List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))), + List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))), + List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title4")))) + + JobTest("sandcrawler.HBaseCrossrefScoreJob") + .arg("test", "") + .arg("app.conf.path", "app.conf") + .arg("output", output) + .arg("hbase-table", testTable) + .arg("zookeeper-hosts", testHost) + .arg("crossref-input", input) + .arg("debug", "true") + .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost), + grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) + .source(TextLine(input), List(( + "0" -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + "1" -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))) + .sink[Tuple](TypedTsv[(String, String, String)](output)) { + outputBuffer => + it("should return a 2-element list.") { + assert(outputBuffer.size === 2) + } + } + .run + .finish } |