aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
commit 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch)
tree f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src
parent 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff)
download sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz
sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 19
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 24
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 77
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 111
4 files changed, 179 insertions, 52 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 0849aff..cf5849c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable {
.read
.toTypedPipe[String](new Fields("line"))
.map{ json : String =>
- HBaseCrossrefScore.crossrefToSlug(json) match {
+ CrossrefScorable.crossrefToSlug(json) match {
case Some(slug) => new MapFeatures(slug, json)
case None => new MapFeatures(Scorable.NoSlug, json)
}
}
}
}
+
+/** Helpers for deriving match slugs from Crossref JSON records. */
+object CrossrefScorable {
+  /**
+   * Extracts the slug of a Crossref record's first title.
+   *
+   * @param json one Crossref record serialized as JSON
+   * @return the first title's slug, or None when jsonToMap yields no map
+   *         or "title" is absent, empty, or does not start with a string
+   */
+  def crossrefToSlug(json : String) : Option[String] = {
+    // TODO: Don't ignore titles after the first.
+    Scorable.jsonToMap(json).flatMap(_.get("title")).collect {
+      case (title : String) :: _ => Scorable.titleToSlug(title)
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
}
}
}
-/*
- def fromBytesWritableLocal(f: Fields): Pipe = {
- asList(f)
- .foldLeft(pipe) { (p, fld) => {
- p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
- Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
- }
- }}
+}
+
+/** Helpers for deriving match slugs from GROBID JSON output. */
+object GrobidScorable {
+  /**
+   * Extracts the slug of a GROBID record's title.
+   *
+   * @return the title's slug, or None when jsonToMap yields no map or the
+   *         record has no string-valued "title" entry
+   */
+  def grobidToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json).flatMap(_.get("title")).collect {
+      case title : String => Scorable.titleToSlug(title)
+    }
}
- */
}
+
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
new file mode 100644
index 0000000..7777610
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -0,0 +1,77 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+/** Unit tests for GrobidScorable.grobidToSlug(). */
+class GrobidScorableTest extends FlatSpec with Matchers {
+  // Template record; the <<TITLE>> placeholder is filled in below.
+  val GrobidString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
+
+  "grobidToSlug()" should "get the right slug for a grobid json string" in {
+    // "Dummy Example File" is expected to come back lower-cased.
+    GrobidScorable.grobidToSlug(GrobidStringWithTitle) should contain ("dummy example file")
+  }
+
+  it should "return None if given json string without title" in {
+    // Every "title" key was renamed above, so no title can be found.
+    GrobidScorable.grobidToSlug(GrobidStringWithoutTitle) shouldBe None
+  }
+
+  it should "return None if given a malformed json string" in {
+    // Stripping every "}" leaves unbalanced, malformed JSON.
+    GrobidScorable.grobidToSlug(MalformedGrobidString) shouldBe None
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 9437fe6..8445073 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
class ScorableTest extends FlatSpec with Matchers {
- val JsonString = """
+ val JsonString = """
{
"title": "<<TITLE>>",
"authors": [
@@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
- val MalformedJsonString = JsonString.replace("}", "")
- "titleToSlug()" should "extract the parts of titles before a colon" in {
- Scorable.titleToSlug("HELLO:there") shouldBe "hello"
- }
+ performUnitTests()
+ performPipelineTests()
- it should "extract an entire colon-less string" in {
- Scorable.titleToSlug("hello THERE") shouldBe "hello there"
- }
+ def performUnitTests() {
+ "titleToSlug()" should "extract the parts of titles before a colon" in {
+ Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+ }
- it should "return Scorable.NoSlug if given empty string" in {
- Scorable.titleToSlug("") shouldBe Scorable.NoSlug
- }
+ it should "extract an entire colon-less string" in {
+ Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+ }
- "jsonToMap()" should "return a map, given a legal JSON string" in {
- Scorable.jsonToMap(JsonString) should not be (None)
- }
+ it should "return Scorable.NoSlug if given empty string" in {
+ Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+ }
- it should "return None, given illegal JSON" in {
- Scorable.jsonToMap("illegal{,json{{") should be (None)
- }
+ "jsonToMap()" should "return a map, given a legal JSON string" in {
+ Scorable.jsonToMap(JsonString) should not be (None)
+ }
- "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
- val score = Scorable.computeSimilarity(
- new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
- score shouldBe Scorable.MaxScore
- }
+ it should "return None, given illegal JSON" in {
+ Scorable.jsonToMap("illegal{,json{{") should be (None)
+ }
- /*
- it should "return None if given a malformed json string" in {
- val slug = Scorable.grobidToSlug(MalformedGrobidString)
- slug shouldBe None
+ "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+ score shouldBe Scorable.MaxScore
+ }
}
- it should "return None if given an empty json string" in {
- val slug = Scorable.grobidToSlug("")
- slug shouldBe None
- }
+ def performPipelineTests() {
+ /*
- "crossrefToSlug()" should "get the right slug for a crossref json string" in {
- val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
- slug should contain ("sometitle")
- }
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
- it should "return None if given json string without title" in {
- val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
- slug shouldBe None
- }
+ val grobidSampleData = List(
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+ List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
+ Bytes.toBytes(MalformedGrobidString)))
- it should "return None if given a malformed json string" in {
- val slug = Scorable.grobidToSlug(MalformedCrossrefString)
- slug shouldBe None
+ JobTest("sandcrawler.HBaseCrossrefScoreJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("crossref-input", input)
+ .arg("debug", "true")
+ .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
+ grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source(TextLine(input), List(
+ 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+ 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+ 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+ 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+ .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+ String, String, String, String)](output)) {
+ // Grobid titles:
+ // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+ // Crossref titles:
+ // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
+ // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+ outputBuffer =>
+ "The pipeline" should "return a 4-element list" in {
+ outputBuffer should have length 4
+ }
+ }
+ .run
+ .finish
+}
+ */
}
- */
}