diff options
Diffstat (limited to 'scalding/src')
4 files changed, 179 insertions, 52 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 0849aff..cf5849c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable { .read .toTypedPipe[String](new Fields("line")) .map{ json : String => - HBaseCrossrefScore.crossrefToSlug(json) match { + CrossrefScorable.crossrefToSlug(json) match { case Some(slug) => new MapFeatures(slug, json) case None => new MapFeatures(Scorable.NoSlug, json) } } } } + +object CrossrefScorable { + def crossrefToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + // TODO: Don't ignore titles after the first. + val title = map("title").asInstanceOf[List[String]](0) + Some(Scorable.titleToSlug(title)) + } else { + None + } + } + } + } +} diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 8da7708..25e5985 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } } } -/* - def fromBytesWritableLocal(f: Fields): Pipe = { - asList(f) - .foldLeft(pipe) { (p, fld) => { - p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable => - Option(from).map(x => Bytes.toString(x.get)).getOrElse(null) - } - }} +} + +object GrobidScorable { + def grobidToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) + } else { + None + } + } + } } - */ } + diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala new file mode 100644 index 0000000..7777610 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -0,0 +1,77 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class GrobidScorableTest extends FlatSpec with Matchers { + val GrobidString = """ +{ + "title": "<<TITLE>>", + "authors": [ + {"name": "Brewster Kahle"}, + {"name": "J Doe"} + ], + "journal": { + "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + "eissn": null, + "issn": null, + "issue": null, + "publisher": null, + "volume": null + }, + "date": "2000", + "doi": null, + "citations": [ + { "authors": [{"name": "A Seaperson"}], + "date": "2001", + "id": "b0", + "index": 0, + "issue": null, + "journal": "Letters in the Alphabet", + "publisher": null, + "title": "Everything is Wonderful", + "url": null, + "volume": "20"}, + { "authors": [], + "date": "2011-03-28", + "id": "b1", + "index": 1, + "issue": null, + "journal": "The Dictionary", + "publisher": null, + "title": "All about Facts", + "url": null, + "volume": "14"} + ], + "abstract": "Everything you ever wanted to know about nothing", + "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + "acknowledgement": null, + "annex": null +} +""" + val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") + val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") + val MalformedGrobidString = GrobidString.replace("}", "") + + // Unit tests + + "grobidToSlug()" should "get the right slug for a grobid json string" in { + val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle) + slug should contain ("dummy example file") + } + + it should "return None if given json string without title" in { + val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle) + slug shouldBe None + } + + it should "return None if given a malformed json string" in { + val slug = GrobidScorable.grobidToSlug(MalformedGrobidString) + slug shouldBe None + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 9437fe6..8445073 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -9,7 +9,7 @@ import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScorableTest extends FlatSpec with Matchers { - val JsonString = """ + val JsonString = """ { "title": "<<TITLE>>", "authors": [ @@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ - val MalformedJsonString = JsonString.replace("}", "") - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" - } + performUnitTests() + performPipelineTests() - it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" - } + def performUnitTests() { + "titleToSlug()" should "extract the parts of titles before a colon" in { + Scorable.titleToSlug("HELLO:there") shouldBe "hello" + } - it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug - } + it should "extract an entire colon-less string" in { + Scorable.titleToSlug("hello THERE") shouldBe "hello there" + } - "jsonToMap()" should "return a map, given a legal JSON string" in { - Scorable.jsonToMap(JsonString) should not be (None) - } + it should "return Scorable.NoSlug if given empty string" in { + Scorable.titleToSlug("") shouldBe Scorable.NoSlug + } - it should "return None, given illegal JSON" in { - Scorable.jsonToMap("illegal{,json{{") should be (None) - } + "jsonToMap()" should "return a map, given a legal JSON string" in { + Scorable.jsonToMap(JsonString) should not be (None) + } - "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { - val score = Scorable.computeSimilarity( - new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) - score shouldBe Scorable.MaxScore - } + it should "return None, given illegal JSON" in { + Scorable.jsonToMap("illegal{,json{{") should be (None) + } - /* - it should "return None if given a malformed json string" in { - val slug = Scorable.grobidToSlug(MalformedGrobidString) - slug shouldBe None + "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { + val score = Scorable.computeSimilarity( + new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + score shouldBe Scorable.MaxScore + } } - it should "return None if given an empty json string" in { - val slug = Scorable.grobidToSlug("") - slug shouldBe None - } + def performPipelineTests() { + /* - "crossrefToSlug()" should "get the right slug for a crossref json string" in { - val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle) - slug should contain ("sometitle") - } + val output = "/tmp/testOutput" + val input = "/tmp/testInput" + val (testTable, testHost) = ("test-table", "dummy-host:2181") - it should "return None if given json string without title" in { - val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle) - slug shouldBe None - } + val grobidSampleData = List( + List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), + List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), + List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), + List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), + Bytes.toBytes(MalformedGrobidString))) - it should "return None if given a malformed json string" in { - val slug = Scorable.grobidToSlug(MalformedCrossrefString) - slug shouldBe None + JobTest("sandcrawler.HBaseCrossrefScoreJob") + .arg("test", "") + .arg("app.conf.path", "app.conf") + .arg("output", output) + .arg("hbase-table", testTable) + .arg("zookeeper-hosts", testHost) + .arg("crossref-input", input) + .arg("debug", "true") + .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost), + grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) + .source(TextLine(input), List( + 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), + 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), + 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) + .sink[(Int, String, String, String, String)](TypedTsv[(Int, + String, String, String, String)](output)) { + // Grobid titles: + // "Title 1", "Title 2: TNG", "Title 3: The Sequel" + // crossref slugs: + // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" + // Join should have 3 "Title 1" slugs and 1 "Title 2" slug + outputBuffer => + "The pipeline" should "return a 4-element list" in { + outputBuffer should have length 4 + } + } + .run + .finish +} + */ } - */ } |