From cbd6433af7949df7c4433468bf99eefe9973e864 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 10:11:54 -0700 Subject: Removed commented-out code. --- .../src/test/scala/sandcrawler/ScorableTest.scala | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 scalding/src/test/scala/sandcrawler/ScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala new file mode 100644 index 0000000..0375b6a --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -0,0 +1,108 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class HBaseCrossrefScoreTest extends FlatSpec with Matchers { + val JsonString = """ +{ + "title": "<>", + "authors": [ + {"name": "Brewster Kahle"}, + {"name": "J Doe"} + ], + "journal": { + "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + "eissn": null, + "issn": null, + "issue": null, + "publisher": null, + "volume": null + }, + "date": "2000", + "doi": null, + "citations": [ + { "authors": [{"name": "A Seaperson"}], + "date": "2001", + "id": "b0", + "index": 0, + "issue": null, + "journal": "Letters in the Alphabet", + "publisher": null, + "title": "Everything is Wonderful", + "url": null, + "volume": "20"}, + { "authors": [], + "date": "2011-03-28", + "id": "b1", + "index": 1, + "issue": null, + "journal": "The Dictionary", + "publisher": null, + "title": "All about Facts", + "url": null, + "volume": "14"} + ], + "abstract": "Everything you ever wanted to know about nothing", + "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + "acknowledgement": null, + "annex": null +} +""" + val MalformedJsonString = JsonString.replace("}", "") + + "titleToSlug()" should "extract the parts of titles before a colon" in { + val slug = Scorable.titleToSlug("HELLO:there") + slug should contain ("hello") + } + + it should "extract an entire colon-less string" in { + val slug = Scorable.titleToSlug("hello THERE") + slug should contain ("hello there") + } + + it should "return None if given empty string" in { + Scorable.titleToSlug("") shouldBe None + } + + "jsonToMap()" should "return a map, given a legal JSON string" in { + Scorable.jsonToMap(jsonString) should be (Some(_)) + } + + it should "return None, given illegal JSON" in { + Scorable.jsonToMap("illegal{,json{{") should be (None)) + } + +/* + it should "return None if given a malformed json string" in { + val slug = Scorable.grobidToSlug(MalformedGrobidString) + slug shouldBe None + } + + it should "return None if given an empty json string" in { + val slug = Scorable.grobidToSlug("") + slug shouldBe None + } + + "crossrefToSlug()" should "get the right slug for a crossref json string" in { + val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle) + slug should contain ("sometitle") + } + + it should "return None if given json string without title" in { + val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle) + slug shouldBe None + } + + it should "return None if given a malformed json string" in { + val slug = Scorable.grobidToSlug(MalformedCrossrefString) + slug shouldBe None + } + */ +} + -- cgit v1.2.3 From 6cdea0ec0950c8f12c362b6521a1bbbabc3db379 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 7 Aug 2018 10:12:12 -0700 Subject: Added ScorableTest, which passes. --- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 0375b6a..78cd358 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -8,7 +8,7 @@ import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode -class HBaseCrossrefScoreTest extends FlatSpec with Matchers { +class ScorableTest extends FlatSpec with Matchers { val JsonString = """ { "title": "<<TITLE>>", @@ -58,24 +58,24 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "titleToSlug()" should "extract the parts of titles before a colon" in { val slug = Scorable.titleToSlug("HELLO:there") - slug should contain ("hello") + slug shouldBe "hello" } it should "extract an entire colon-less string" in { val slug = Scorable.titleToSlug("hello THERE") - slug should contain ("hello there") + slug shouldBe "hello there" } - it should "return None if given empty string" in { - Scorable.titleToSlug("") shouldBe None + it should "return Scorable.NoSlug if given empty string" in { + Scorable.titleToSlug("") shouldBe Scorable.NoSlug } "jsonToMap()" should "return a map, given a legal JSON string" in { - Scorable.jsonToMap(jsonString) should be (Some(_)) + Scorable.jsonToMap(JsonString) should not be (None) } it should "return None, given illegal JSON" in { - Scorable.jsonToMap("illegal{,json{{") should be (None)) + Scorable.jsonToMap("illegal{,json{{") should be (None) } /* -- cgit v1.2.3 From dddb7ed410bdd542ca12756d3e97aca6beea5532 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 7 Aug 2018 10:22:02 -0700 Subject: Added test, which passes. --- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 78cd358..535b8f6 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -57,13 +57,11 @@ class ScorableTest extends FlatSpec with Matchers { val MalformedJsonString = JsonString.replace("}", "") "titleToSlug()" should "extract the parts of titles before a colon" in { - val slug = Scorable.titleToSlug("HELLO:there") - slug shouldBe "hello" + Scorable.titleToSlug("HELLO:there") shouldBe "hello" } it should "extract an entire colon-less string" in { - val slug = Scorable.titleToSlug("hello THERE") - slug shouldBe "hello there" + Scorable.titleToSlug("hello THERE") shouldBe "hello there" } it should "return Scorable.NoSlug if given empty string" in { @@ -78,7 +76,12 @@ class ScorableTest extends FlatSpec with Matchers { Scorable.jsonToMap("illegal{,json{{") should be (None) } -/* + "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { + val output = Scorable.computeOutput(new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + output.score shouldBe Scorable.MaxScore + } + + /* it should "return None if given a malformed json string" in { val slug = Scorable.grobidToSlug(MalformedGrobidString) slug shouldBe None -- cgit v1.2.3 From 4981a98358aae098714d2266404f7b167993bf0c Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 7 Aug 2018 10:28:48 -0700 Subject: Minor refactoring. Added test. --- scalding/src/main/scala/sandcrawler/Scorable.scala | 15 ++++++--------- scalding/src/main/scala/sandcrawler/ScoreJob.scala | 4 +++- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 5 +++-- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 948002b..77bb7ae 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -69,19 +69,16 @@ object Scorable { val MaxScore = 1000 - def computeOutput(feature1 : ReduceFeatures, feature2 : ReduceFeatures) : - ReduceOutput = { - val json1 = jsonToMap(feature1.json) - val json2 = jsonToMap(feature2.json) + def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = { + val json1 = jsonToMap(features1.json) + val json2 = jsonToMap(features2.json) getStringOption(json1, "title") match { - case None => ReduceOutput(0, "No title", feature1.json) + case None => 0 case Some(title1) => { getStringOption(json2, "title") match { - case None => ReduceOutput(0, "No title", feature2.json) + case None => 0 case Some(title2) => - ReduceOutput( - (StringUtilities.similarity(title1, title2) * MaxScore).toInt, - feature1.json, feature2.json) + (StringUtilities.similarity(title1, title2) * MaxScore).toInt } } } diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 22cc9e9..e6a5dc1 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -17,7 +17,9 @@ class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : Fl pipe1.join(pipe2).map { entry => val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry - Scorable.computeOutput(features1, features2) + new ReduceOutput(Scorable.computeSimilarity(features1, features2), + features1.json, + features2.json) } .write(TypedTsv[ReduceOutput](args("output"))) } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 535b8f6..9437fe6 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -77,8 +77,9 @@ class ScorableTest extends FlatSpec with Matchers { } "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { - val output = Scorable.computeOutput(new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) - output.score shouldBe Scorable.MaxScore + val score = Scorable.computeSimilarity( + new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + score shouldBe Scorable.MaxScore } /* -- cgit v1.2.3 From 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 7 Aug 2018 11:05:23 -0700 Subject: Added GrobidScorableTest, minor improvements. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 19 +++- .../main/scala/sandcrawler/GrobidScorable.scala | 24 +++-- .../scala/sandcrawler/GrobidScorableTest.scala | 77 ++++++++++++++ .../src/test/scala/sandcrawler/ScorableTest.scala | 111 +++++++++++++-------- 4 files changed, 179 insertions(+), 52 deletions(-) create mode 100644 scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 0849aff..cf5849c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable { .read .toTypedPipe[String](new Fields("line")) .map{ json : String => - HBaseCrossrefScore.crossrefToSlug(json) match { + CrossrefScorable.crossrefToSlug(json) match { case Some(slug) => new MapFeatures(slug, json) case None => new MapFeatures(Scorable.NoSlug, json) } } } } + +object CrossrefScorable { + def crossrefToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + // TODO: Don't ignore titles after the first. + val title = map("title").asInstanceOf[List[String]](0) + Some(Scorable.titleToSlug(title)) + } else { + None + } + } + } + } +} diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 8da7708..25e5985 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } } } -/* - def fromBytesWritableLocal(f: Fields): Pipe = { - asList(f) - .foldLeft(pipe) { (p, fld) => { - p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable => - Option(from).map(x => Bytes.toString(x.get)).getOrElse(null) - } - }} +} + +object GrobidScorable { + def grobidToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) + } else { + None + } + } + } } - */ } + diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala new file mode 100644 index 0000000..7777610 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -0,0 +1,77 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class GrobidScorableTest extends FlatSpec with Matchers { + val GrobidString = """ +{ + "title": "<<TITLE>>", + "authors": [ + {"name": "Brewster Kahle"}, + {"name": "J Doe"} + ], + "journal": { + "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + "eissn": null, + "issn": null, + "issue": null, + "publisher": null, + "volume": null + }, + "date": "2000", + "doi": null, + "citations": [ + { "authors": [{"name": "A Seaperson"}], + "date": "2001", + "id": "b0", + "index": 0, + "issue": null, + "journal": "Letters in the Alphabet", + "publisher": null, + "title": "Everything is Wonderful", + "url": null, + "volume": "20"}, + { "authors": [], + "date": "2011-03-28", + "id": "b1", + "index": 1, + "issue": null, + "journal": "The Dictionary", + "publisher": null, + "title": "All about Facts", + "url": null, + "volume": "14"} + ], + "abstract": "Everything you ever wanted to know about nothing", + "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + "acknowledgement": null, + "annex": null +} +""" + val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") + val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") + val MalformedGrobidString = GrobidString.replace("}", "") + + // Unit tests + + "grobidToSlug()" should "get the right slug for a grobid json string" in { + val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle) + slug should contain ("dummy example file") + } + + it should "return None if given json string without title" in { + val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle) + slug shouldBe None + } + + it should "return None if given a malformed json string" in { + val slug = GrobidScorable.grobidToSlug(MalformedGrobidString) + slug shouldBe None + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 9437fe6..8445073 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -9,7 +9,7 @@ import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScorableTest extends FlatSpec with Matchers { - val JsonString = """ + val JsonString = """ { "title": "<<TITLE>>", "authors": [ @@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ - val MalformedJsonString = JsonString.replace("}", "") - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" - } + performUnitTests() + performPipelineTests() - it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" - } + def performUnitTests() { + "titleToSlug()" should "extract the parts of titles before a colon" in { + Scorable.titleToSlug("HELLO:there") shouldBe "hello" + } - it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug - } + it should "extract an entire colon-less string" in { + Scorable.titleToSlug("hello THERE") shouldBe "hello there" + } - "jsonToMap()" should "return a map, given a legal JSON string" in { - Scorable.jsonToMap(JsonString) should not be (None) - } + it should "return Scorable.NoSlug if given empty string" in { + Scorable.titleToSlug("") shouldBe Scorable.NoSlug + } - it should "return None, given illegal JSON" in { - Scorable.jsonToMap("illegal{,json{{") should be (None) - } + "jsonToMap()" should "return a map, given a legal JSON string" in { + Scorable.jsonToMap(JsonString) should not be (None) + } - "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { - val score = Scorable.computeSimilarity( - new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) - score shouldBe Scorable.MaxScore - } + it should "return None, given illegal JSON" in { + Scorable.jsonToMap("illegal{,json{{") should be (None) + } - /* - it should "return None if given a malformed json string" in { - val slug = Scorable.grobidToSlug(MalformedGrobidString) - slug shouldBe None + "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { + val score = Scorable.computeSimilarity( + new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + score shouldBe Scorable.MaxScore + } } - it should "return None if given an empty json string" in { - val slug = Scorable.grobidToSlug("") - slug shouldBe None - } + def performPipelineTests() { + /* - "crossrefToSlug()" should "get the right slug for a crossref json string" in { - val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle) - slug should contain ("sometitle") - } + val output = "/tmp/testOutput" + val input = "/tmp/testInput" + val (testTable, testHost) = ("test-table", "dummy-host:2181") - it should "return None if given json string without title" in { - val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle) - slug shouldBe None - } + val grobidSampleData = List( + List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), + List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), + List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), + List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), + Bytes.toBytes(MalformedGrobidString))) - it should "return None if given a malformed json string" in { - val slug = Scorable.grobidToSlug(MalformedCrossrefString) - slug shouldBe None + JobTest("sandcrawler.HBaseCrossrefScoreJob") + .arg("test", "") + .arg("app.conf.path", "app.conf") + .arg("output", output) + .arg("hbase-table", testTable) + .arg("zookeeper-hosts", testHost) + .arg("crossref-input", input) + .arg("debug", "true") + .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost), + grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) + .source(TextLine(input), List( + 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), + 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), + 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) + .sink[(Int, String, String, String, String)](TypedTsv[(Int, + String, String, String, String)](output)) { + // Grobid titles: + // "Title 1", "Title 2: TNG", "Title 3: The Sequel" + // crossref slugs: + // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" + // Join should have 3 "Title 1" slugs and 1 "Title 2" slug + outputBuffer => + "The pipeline" should "return a 4-element list" in { + outputBuffer should have length 4 + } + } + .run + .finish +} + */ } - */ } -- cgit v1.2.3 From 71b8d527da73f99ffb1b09ec1044031e772d1db6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 7 Aug 2018 11:24:06 -0700 Subject: Added punctuation removal to slug creation and similarity comparisons --- scalding/src/main/scala/sandcrawler/Scorable.scala | 3 ++- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 8 +++++++- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 7 +++++++ scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 10 ++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 77bb7ae..736c175 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -45,7 +45,8 @@ object Scorable { } def titleToSlug(title : String) : String = { - val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase() + val slug = StringUtilities.removePunctuation( + StringUtilities.removeAccents(title).split(":")(0).toLowerCase()) if (slug.isEmpty) { NoSlug } else { diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 1ae6db3..3058f15 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -25,9 +25,15 @@ object StringUtilities { pattern.matcher(sb).replaceAll("") } + // Source: https://stackoverflow.com/a/30076541/631051 + def removePunctuation(s: String) : String = { + s.replaceAll("""[\p{Punct}&&[^.]]""", "") + } + // Adapted from: https://stackoverflow.com/a/16018452/631051 def similarity(s1a : String, s2a : String) : Double = { - val (s1, s2) = (removeAccents(s1a), removeAccents(s2a)) + val (s1, s2) = (removeAccents(removePunctuation(s1a)), + removeAccents(removePunctuation(s2a))) val longer : String = if (s1.length > s2.length) s1 else s2 val shorter : String = if (s1.length > s2.length) s2 else s1 if (longer.length == 0) { diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 8445073..713a7e5 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers { Scorable.titleToSlug("") shouldBe Scorable.NoSlug } + "titleToSlug()" should "strip punctuation" in { + Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" + Scorable.titleToSlug("a:b:c") shouldBe "a" + Scorable.titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + } + "jsonToMap()" should "return a map, given a legal JSON string" in { Scorable.jsonToMap(JsonString) should not be (None) } diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala index 2df5a22..410819b 100644 --- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala +++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala @@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers { StringUtilities.removeAccents("SØREN") shouldBe "SOREN" } + "removePunctuation" should "work on the empty string" in { + StringUtilities.removePunctuation("") shouldBe "" + } + + it should "work on non-empty text strings" in { + StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world" + StringUtilities.removePunctuation(":-)") shouldBe "" + StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab" + } + // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ "stringDistance" should "work on empty strings" in { StringUtilities.stringDistance("", "") shouldBe 0 -- cgit v1.2.3 From 6d64c5d4e1527c7277527132efa858def2589486 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Thu, 9 Aug 2018 11:30:44 -0700 Subject: Added test for null argument to titleToSlug() --- scalding/src/main/scala/sandcrawler/Scorable.scala | 13 +++++++++---- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 736c175..ce4fdca 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -45,12 +45,17 @@ object Scorable { } def titleToSlug(title : String) : String = { - val slug = StringUtilities.removePunctuation( - StringUtilities.removeAccents(title).split(":")(0).toLowerCase()) - if (slug.isEmpty) { + if (title == null || title.isEmpty) { NoSlug } else { - slug + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) + if (slug.isEmpty || slug == null) { + NoSlug + } else { + slug + } } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 713a7e5..40801a0 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -71,6 +71,10 @@ class ScorableTest extends FlatSpec with Matchers { Scorable.titleToSlug("") shouldBe Scorable.NoSlug } + it should "return Scorable.NoSlug if given null" in { + Scorable.titleToSlug(null) shouldBe Scorable.NoSlug + } + "titleToSlug()" should "strip punctuation" in { Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" Scorable.titleToSlug("a:b:c") shouldBe "a" -- cgit v1.2.3 From 9d7adc94ad63e85ffb2b459d4a8c2ed0ed46d8c8 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Thu, 9 Aug 2018 19:03:01 -0700 Subject: WIP --- .../main/scala/sandcrawler/CrossrefScorable.scala | 1 + .../main/scala/sandcrawler/GrobidScorable.scala | 15 +- scalding/src/main/scala/sandcrawler/Scorable.scala | 2 +- scalding/src/main/scala/sandcrawler/ScoreJob.scala | 46 ++++-- .../src/test/scala/sandcrawler/ScorableTest.scala | 112 ++++--------- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 177 +++++++++++++++++++++ 6 files changed, 251 insertions(+), 102 deletions(-) create mode 100644 scalding/src/test/scala/sandcrawler/ScoreJobTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ee4cc54..d5da845 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -11,6 +11,7 @@ import parallelai.spyglass.hbase.HBaseSource class CrossrefScorable extends Scorable { def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = { + // TODO: Generalize args so there can be multiple Grobid pipes in one job. TextLine(args("crossref-input")) .read .toTypedPipe[String](new Fields("line")) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 95d6dae..4c67074 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -11,14 +11,9 @@ import parallelai.spyglass.hbase.HBaseSource class GrobidScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = { - // TODO: Clean up code after debugging. - val grobidSource = HBaseBuilder.build( - args("hbase-table"), - args("zookeeper-hosts"), - List("grobid0:tei_json"), - SourceMode.SCAN_ALL) - - grobidSource.read + // TODO: Generalize args so there can be multiple grobid pipes in one job. + GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts")) + .read .fromBytesWritable(new Fields("key", "tei_json")) // TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala) // didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json) @@ -34,6 +29,10 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } object GrobidScorable { + def getHBaseSource(table : String, host : String) : HBaseSource = { + HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL) + } + def grobidToSlug(json : String) : Option[String] = { Scorable.jsonToMap(json) match { case None => None diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 86336cb..cfdc192 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -9,7 +9,7 @@ import com.twitter.scalding.typed.TDsl._ case class MapFeatures(slug : String, json : String) case class ReduceFeatures(json : String) -case class ReduceOutput(val score : Int, json1 : String, json2 : String) +case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String) abstract class Scorable { def getInputPipe(args : Args, flowDef : FlowDef, mode : Mode) : TypedPipe[(String, ReduceFeatures)] = diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index e6a5dc1..aa20d0f 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -1,25 +1,53 @@ package sandcrawler -import java.text.Normalizer - -import scala.math -import scala.util.parsing.json.JSON - import cascading.flow.FlowDef import com.twitter.scalding._ import com.twitter.scalding.typed.TDsl._ import parallelai.spyglass.base.JobBase import parallelai.spyglass.hbase.HBasePipeConversions -class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : FlowDef, mode: Mode) extends JobBase(args) with HBasePipeConversions { - val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args, flowDef, mode) - val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args, flowDef, mode) +class ScoreJob(args: Args)(implicit flowDef : FlowDef, mode: Mode) extends JobBase(args) with + HBasePipeConversions { + /* + val pipe1 : TypedPipe[(String, ReduceFeatures)] = ScoreJob.getScorable1().getInputPipe(args, flowDef, mode) + val pipe2 : TypedPipe[(String, ReduceFeatures)] = ScoreJob.getScorable2().getInputPipe(args, flowDef, mode) pipe1.join(pipe2).map { entry => val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry - new ReduceOutput(Scorable.computeSimilarity(features1, features2), + new ReduceOutput( + slug, + Scorable.computeSimilarity(features1, features2), features1.json, features2.json) } .write(TypedTsv[ReduceOutput](args("output"))) + */ +} + +// Ugly hack to get non-String information into ScoreJob above. +object ScoreJob { + var scorable1 : Option[Scorable] = None + var scorable2 : Option[Scorable] = None + + def setScorable1(s : Scorable) { + scorable1 = Some(s) + } + + def getScorable1() : Scorable = { + scorable1 match { + case Some(s) => s + case None => null + } + } + + def setScorable2(s: Scorable) { + scorable2 = Some(s) + } + + def getScorable2() : Scorable = { + scorable2 match { + case Some(s) => s + case None => null + } + } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 40801a0..2f80492 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -9,7 +9,7 @@ import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScorableTest extends FlatSpec with Matchers { - val JsonString = """ + val JsonString = """ { "title": "<<TITLE>>", "authors": [ @@ -55,96 +55,40 @@ class ScorableTest extends FlatSpec with Matchers { } """ - performUnitTests() - performPipelineTests() - - def performUnitTests() { - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" - } - - it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" - } - - it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug - } - - it should "return Scorable.NoSlug if given null" in { - Scorable.titleToSlug(null) shouldBe Scorable.NoSlug - } - - "titleToSlug()" should "strip punctuation" in { - Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" - Scorable.titleToSlug("a:b:c") shouldBe "a" - Scorable.titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" - } + "titleToSlug()" should "extract the parts of titles before a colon" in { + Scorable.titleToSlug("HELLO:there") shouldBe "hello" + } - "jsonToMap()" should "return a map, given a legal JSON string" in { - Scorable.jsonToMap(JsonString) should not be (None) - } + it should "extract an entire colon-less string" in { + Scorable.titleToSlug("hello THERE") shouldBe "hello there" + } - it should "return None, given illegal JSON" in { - Scorable.jsonToMap("illegal{,json{{") should be (None) - } + it should "return Scorable.NoSlug if given empty string" in { + Scorable.titleToSlug("") shouldBe Scorable.NoSlug + } - "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { - val score = Scorable.computeSimilarity( - new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) - score shouldBe Scorable.MaxScore - } + it should "return Scorable.NoSlug if given null" in { + Scorable.titleToSlug(null) shouldBe Scorable.NoSlug } - def performPipelineTests() { - /* + "titleToSlug()" should "strip punctuation" in { + Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" + Scorable.titleToSlug("a:b:c") shouldBe "a" + Scorable.titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + } - val output = "/tmp/testOutput" - val input = "/tmp/testInput" - val (testTable, testHost) = ("test-table", "dummy-host:2181") + "jsonToMap()" should "return a map, given a legal JSON string" in { + Scorable.jsonToMap(JsonString) should not be (None) + } - val grobidSampleData = List( - List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), - List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), - List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), - List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), - Bytes.toBytes(MalformedGrobidString))) + it should "return None, given illegal JSON" in { + Scorable.jsonToMap("illegal{,json{{") should be (None) + } - JobTest("sandcrawler.HBaseCrossrefScoreJob") - .arg("test", "") - .arg("app.conf.path", "app.conf") - .arg("output", output) - .arg("hbase-table", testTable) - .arg("zookeeper-hosts", testHost) - .arg("crossref-input", input) - .arg("debug", "true") - .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost), - grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) - .source(TextLine(input), List( - 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), - 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), - 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), - 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) - .sink[(Int, String, String, String, String)](TypedTsv[(Int, - String, String, String, String)](output)) { - // Grobid titles: - // "Title 1", "Title 2: TNG", "Title 3: The Sequel" - // crossref slugs: - // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" - // Join should have 3 "Title 1" slugs and 1 "Title 2" slug - outputBuffer => - "The pipeline" should "return a 4-element list" in { - outputBuffer should have length 4 - } - } - .run - .finish -} - */ + "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { + val score = Scorable.computeSimilarity( + new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + score shouldBe Scorable.MaxScore } } - diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala new file mode 100644 index 0000000..22cbdb8 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -0,0 +1,177 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class ScoreJobTest extends FlatSpec with Matchers { + val GrobidString = """ +{ + "title": "<<TITLE>>", + "authors": [ + {"name": "Brewster Kahle"}, + {"name": "J Doe"} + ], + "journal": { + "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + "eissn": null, + "issn": null, + "issue": null, + "publisher": null, + "volume": null + }, + "date": "2000", + "doi": null, + "citations": [ + { "authors": [{"name": "A Seaperson"}], + "date": "2001", + "id": "b0", + "index": 0, + "issue": null, + "journal": "Letters in the Alphabet", + "publisher": null, + "title": "Everything is Wonderful", + "url": null, + "volume": "20"}, + { "authors": [], + "date": "2011-03-28", + "id": "b1", + "index": 1, + "issue": null, + "journal": "The Dictionary", + "publisher": null, + "title": "All about Facts", + "url": null, + "volume": "14"} + ], + "abstract": "Everything you ever wanted to know about nothing", + "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + "acknowledgement": null, + "annex": null +} +""" + val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") + val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") + val MalformedGrobidString = GrobidString.replace("}", "") + + val CrossrefString = +""" +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, + "delay-in-days" : 0, "content-version" : "tdm" }], + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "DOI" : "<<DOI>>", + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, + "title" : [ "<<TITLE>>" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, + { "URL" : + "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", + "content-type" : "text/plain", + "content-version" : "vor", + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "subject" : [ "Pediatrics, Perinatology, and Child Health" ] +} +""" + val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") + val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") + val MalformedCrossrefString = CrossrefString.replace("}", "") + + // Pipeline tests + val output = "/tmp/testOutput" + val input = "/tmp/testInput" + val (testTable, testHost) = ("test-table", "dummy-host:2181") + + val grobidSampleData = List( + List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), + List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), + List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), + List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), + Bytes.toBytes(MalformedGrobidString))) + + // TODO: Make less yucky. + ScoreJob.setScorable1(new CrossrefScorable()) + ScoreJob.setScorable2(new GrobidScorable()) + + JobTest("sandcrawler.ScoreJob") + .arg("test", "") + .arg("app.conf.path", "app.conf") + .arg("output", output) + .arg("hbase-table", testTable) + .arg("zookeeper-hosts", testHost) + .arg("crossref-input", input) + .arg("debug", "true") + .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), + grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) + .source(TextLine(input), List( + 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), + 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), + 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) + .sink[ReduceOutput](TypedTsv[ReduceOutput](output)) { + // Grobid titles: + // "Title 1", "Title 2: TNG", "Title 3: The Sequel" + // crossref slugs: + // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" + // Join should have 3 "Title 1" slugs and 1 "Title 2" slug + outputBuffer => + "The pipeline" should "return a 4-element list" in { + outputBuffer should have length 4 + } + + /* + it should "return the right first entry" in { + outputBuffer(0) shouldBe ReduceOutput("slug", 50, "", + "") + val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0) + slug shouldBe "title 1" + slug shouldBe slug0 + slug shouldBe slug1 + sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8") + grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8") + } + */ + } + .run + .finish +} -- cgit v1.2.3 From 31354b1a6062c5c56a30610f68fa48c82a7e83f0 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Sun, 12 Aug 2018 18:08:51 -0700 Subject: Tests pass. --- scalding/src/main/scala/sandcrawler/Scorable.scala | 11 +-- .../scala/sandcrawler/CrossrefScorableTest.scala | 89 ---------------------- .../scala/sandcrawler/GrobidScorableTest.scala | 20 +++-- .../src/test/scala/sandcrawler/ScorableTest.scala | 28 ++++--- 4 files changed, 39 insertions(+), 109 deletions(-) delete mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 9c8da69..929461b 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -66,13 +66,14 @@ object Scorable { // This guarantees it will have all of the fields needed to compute // the ultimate score, which are a superset of those needed for a slug. def mapToSlug(map : Map[String, Any]) : String = { - val unaccented = StringUtilities.removeAccents(getString(map, "title")) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) - if (slug.isEmpty || slug == null) { + val title = getString(map, "title") + if (title == null) { NoSlug } else { - slug + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty || slug == null) NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala deleted file mode 100644 index 1c35d66..0000000 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ /dev/null @@ -1,89 +0,0 @@ -package sandcrawler - -import cascading.tuple.Fields -import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.util.Bytes -import org.scalatest._ -import parallelai.spyglass.hbase.HBaseConstants.SourceMode - -class CrossrefScorableTest extends FlatSpec with Matchers { - val CrossrefString = -""" -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, - "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, - "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, - "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], - "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", - "content-version" : "vor", - "intended-application" : "text-mining" }, - { "URL" : - "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", - "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], - "subject" : [ "Pediatrics, Perinatology, and Child Health" ] -} -""" - val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") - val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") - val MalformedCrossrefString = CrossrefString.replace("}", "") - - // Unit tests - "simplifyJson()" should "return None for bad JSON" in { - CrossrefScorable.simplifyJson("") shouldBe None - CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None - } - - it should "return None for JSON lacking title" in { - CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None - } - - it should "return appropriate result for valid JSON" in { - CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { - case None => fail("None unexpectedly returned by simplifyJson") - case Some(map) => { - Scorable.isScorableMap(map) shouldBe true - map.size shouldBe 1 - map.keys should contain ("title") - map("title") shouldBe "SomeTitle" - } - } - } -} diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 5bb955a..3fcd856 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers { val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") val MalformedGrobidString = GrobidString.replace("}", "") + val Key = "Dummy Key" // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None + val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) result.slug shouldBe Scorable.NoSlug - result.json shouldBe MalformedGrobidString } - "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in { - val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None + it should "handle missing title" in { + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) result.slug shouldBe Scorable.NoSlug - result.json shouldBe GrobidStringWithoutTitle + } + + it should "handle valid input" in { + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle) + result.slug shouldBe "dummyexamplefile" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map("title").asInstanceOf[String] shouldBe "Dummy Example File" + } + } } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 2f80492..95faacc 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ + private def titleToSlug(s : String) : String = { + Scorable.mapToSlug(Scorable.toScorableMap(title = s)) + } - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hello" } it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" + titleToSlug("hello THERE") shouldBe "hellothere" } it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug + titleToSlug("") shouldBe Scorable.NoSlug } it should "return Scorable.NoSlug if given null" in { - Scorable.titleToSlug(null) shouldBe Scorable.NoSlug + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hello" + titleToSlug("a:b:c") shouldBe "a" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" } - "titleToSlug()" should "strip punctuation" in { - Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" - Scorable.titleToSlug("a:b:c") shouldBe "a" - Scorable.titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobar" + titleToSlug("\na\t:b:c") shouldBe "a" } "jsonToMap()" should "return a map, given a legal JSON string" in { -- cgit v1.2.3 From b4f1acce5eccbb56291f82906d9c01534c7f1506 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Mon, 13 Aug 2018 10:27:48 -0700 Subject: Factored out ScorableFeatures. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 7 ++-- .../main/scala/sandcrawler/GrobidScorable.scala | 6 +--- scalding/src/main/scala/sandcrawler/Scorable.scala | 30 ------------------ .../main/scala/sandcrawler/ScorableFeatures.scala | 30 ++++++++++++++++++ .../scala/sandcrawler/ScorableFeaturesTest.scala | 37 ++++++++++++++++++++++ .../src/test/scala/sandcrawler/ScorableTest.scala | 32 ------------------- 6 files changed, 70 insertions(+), 72 deletions(-) create mode 100644 scalding/src/main/scala/sandcrawler/ScorableFeatures.scala create mode 100644 scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 4558ee6..4897b1c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -34,11 +34,8 @@ object CrossrefScorable { if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { new MapFeatures(Scorable.NoSlug, json) } else { - val title = titles(0) - val map2 = Scorable.toScorableMap(title=title, doi=doi) - new MapFeatures( - Scorable.mapToSlug(map2), - JSONObject(map2).toString) + val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + new MapFeatures(sf.toSlug, sf.toString) } } else { new MapFeatures(Scorable.NoSlug, json) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 94b3494..5ba7d58 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -35,11 +35,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), - sha1=key) - new MapFeatures( - Scorable.mapToSlug(map2), - JSONObject(map2).toString) + new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 717b2d5..9b9c633 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -36,21 +36,6 @@ object Scorable { slug != NoSlug } - // NOTE: I could go all out and make ScorableMap a type. - // TODO: Require year. Other features will get added here. - def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) - } - - def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { - JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString - } - - // TODO: Score on more fields than "title". - def isScorableMap(map : Map[String, Any]) : Boolean = { - map.contains("title") - } - def jsonToMap(json : String) : Option[Map[String, Any]] = { // https://stackoverflow.com/a/32717262/631051 val jsonObject = JSON.parseFull(json) @@ -61,21 +46,6 @@ object Scorable { } } - // Map should have been produced by toScorableMap. - // This guarantees it will have all of the fields needed to compute - // the ultimate score, which are a superset of those needed for a slug. - def mapToSlug(map : Map[String, Any]) : String = { - val title = getString(map, "title") - if (title == null) { - NoSlug - } else { - val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null) NoSlug else slug - } - } - def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = { optionalMap match { case None => None diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala new file mode 100644 index 0000000..5d6dea0 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -0,0 +1,30 @@ +package sandcrawler + +import scala.util.parsing.json.JSONObject + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). +class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + def toMap() : Map[String, Any] = { + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + } + + override def toString() : String = { + JSONObject(toMap()).toString + } + + def toSlug() : String = { + if (title == null) { + Scorable.NoSlug + } else { + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty || slug == null) Scorable.NoSlug else slug + } + } + + def toMapFeatures = { + MapFeatures(toSlug, toString) + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala new file mode 100644 index 0000000..7ec0c4d --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -0,0 +1,37 @@ +package sandcrawler + +import org.scalatest._ + +class ScorableFeaturesTest extends FlatSpec with Matchers { + private def titleToSlug(s : String) : String = { + new ScorableFeatures(title = s).toSlug + } + + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hello" + } + + it should "extract an entire colon-less string" in { + titleToSlug("hello THERE") shouldBe "hellothere" + } + + it should "return Scorable.NoSlug if given empty string" in { + titleToSlug("") shouldBe Scorable.NoSlug + } + + it should "return Scorable.NoSlug if given null" in { + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hello" + titleToSlug("a:b:c") shouldBe "a" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" + } + + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobar" + titleToSlug("\na\t:b:c") shouldBe "a" + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 95faacc..fd44f57 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,38 +54,6 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ - private def titleToSlug(s : String) : String = { - Scorable.mapToSlug(Scorable.toScorableMap(title = s)) - } - - "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hello" - } - - it should "extract an entire colon-less string" in { - titleToSlug("hello THERE") shouldBe "hellothere" - } - - it should "return Scorable.NoSlug if given empty string" in { - titleToSlug("") shouldBe Scorable.NoSlug - } - - it should "return Scorable.NoSlug if given null" in { - titleToSlug(null) shouldBe Scorable.NoSlug - } - - it should "strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hello" - titleToSlug("a:b:c") shouldBe "a" - titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" - } - - it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobar" - titleToSlug("\na\t:b:c") shouldBe "a" - } - "jsonToMap()" should "return a map, given a legal JSON string" in { Scorable.jsonToMap(JsonString) should not be (None) } -- cgit v1.2.3 From 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 14 Aug 2018 20:38:29 -0700 Subject: Fixed style problems (or disabled warning when appropriate) for tests. --- scalding/build.sbt | 7 ++ .../scala/sandcrawler/CrossrefScorableTest.scala | 87 ++++++++++--------- .../scala/sandcrawler/GrobidScorableTest.scala | 7 +- .../test/scala/sandcrawler/HBaseBuilderTest.scala | 1 + .../scala/sandcrawler/HBaseMimeCountTest.scala | 9 +- .../test/scala/sandcrawler/HBaseRowCountTest.scala | 11 +-- .../scala/sandcrawler/HBaseStatusCountTest.scala | 10 ++- .../scala/sandcrawler/ScorableFeaturesTest.scala | 1 + .../src/test/scala/sandcrawler/ScorableTest.scala | 5 +- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 97 ++++++++++++---------- 10 files changed, 135 insertions(+), 100 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala') diff --git a/scalding/build.sbt b/scalding/build.sbt index 2addd60..d477399 100644 --- a/scalding/build.sbt +++ b/scalding/build.sbt @@ -20,6 +20,13 @@ lazy val root = (project in file(".")). scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) }, + (scalastyleSources in Test) := { + // all .scala files in "src/test/scala" + val scalaSourceFiles = ((scalaSource in Test).value ** "*.scala").get + val dirNameToExclude = "/example/" + scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) + }, + name := "sandcrawler", resolvers += "conjars.org" at "http://conjars.org/repo", diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 75be03e..e171dba 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -2,72 +2,77 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class CrossrefScorableTest extends FlatSpec with Matchers { + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", + "content-type" : "text/xml", "content-version" : "vor", - "intended-application" : "text-mining" }, + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) + val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 4b958b9..661824b 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ @@ -62,7 +65,7 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) + val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala index 603a4c7..c61cb22 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala @@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers { fields should have length 0 } + //scalastyle:off no.whitespace.before.left.bracket it should "throw IllegalArgumentException on malformed input" in { a [IllegalArgumentException] should be thrownBy { HBaseBuilder.parseColSpecs(List("file_size")) diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala index fde2290..d6d283f 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala index 3424a36..c4ca5aa 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ /** @@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions { outputBuffer => it("should return the test data provided.") { - println("outputBuffer.size => " + outputBuffer.size) assert(outputBuffer.size === 1) } it("should return the correct count") { - println("raw output => " + outputBuffer) assert(outputBuffer(0).getObject(0) === 8) } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala index 8a71f31..fe3ff21 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala @@ -1,15 +1,19 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 7ec0c4d..f9c30a2 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -2,6 +2,7 @@ package sandcrawler import org.scalatest._ +// scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { private def titleToSlug(s : String) : String = { new ScorableFeatures(title = s).toSlug diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index fd44f57..f63bef8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 1c6ae83..34081a5 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -2,13 +2,17 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScoreJobTest extends FlatSpec with Matchers { + //scalastyle:off val JsonString = """ { "title": "<<TITLE>>", @@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers { "annex": null } """ + // scalastyle:on val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File") val JsonStringWithoutTitle = JsonString.replace("title", "nottitle") val MalformedJsonString = JsonString.replace("}", "") + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", - "content-version" : "vor", - "intended-application" : "text-mining" }, + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers { 2 -> CrossrefStrings(2), 3 -> CrossrefStrings(3))) .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { - // Grobid titles and slugs (in parentheses): + // Grobid titles and slugs (in parentheses): // Title 1 (title1) // Title 2: TNG (title2) // Title 3: The Sequel (title3) @@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 1: TNG 3 (title1) // Title 2: Rebooted (title2) // Join should have 3 "title1" slugs and 1 "title2" slug - outputBuffer => + outputBuffer => "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 } @@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers { countMap("title2") shouldBe 1 } - def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = { + def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( - Sha1Strings(grobidIndex), + Sha1Strings(grobidIndex), JsonStrings(grobidIndex)) val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( CrossrefStrings(crossrefIndex)) -- cgit v1.2.3