From 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 10:57:10 -0700 Subject: Added CrossrefScorableTest, minor cleanups. --- .../scala/sandcrawler/CrossrefScorableTest.scala | 84 ++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala new file mode 100644 index 0000000..5973ce5 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -0,0 +1,84 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class CrossrefScorableTest extends FlatSpec with Matchers { + val CrossrefString = +""" +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, + "delay-in-days" : 0, "content-version" : "tdm" }], + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "DOI" : "<>", + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, + "title" : [ "<>" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, + { "URL" : + "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", + "content-type" : "text/plain", + "content-version" : "vor", + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "subject" : [ "Pediatrics, Perinatology, and Child Health" ] +} +""" + val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") + val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") + val MalformedCrossrefString = CrossrefString.replace("}", "") + + // Unit tests + + "crossrefToSlug()" should "get the right slug for a crossref json string" in { + val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle) + slug should contain ("sometitle") + } + + it should "return None if given json string without title" in { + val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle) + slug shouldBe None + } + + it should "return None if given a malformed json string" in { + val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString) + slug shouldBe None + } +} -- cgit v1.2.3 From 768e7ef0d127cf55119543be6e656751704ca5b2 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Fri, 10 Aug 2018 20:49:44 -0700 Subject: Tests pass. Still have changes to do but made huge progress. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 38 +++++++++++-------- scalding/src/main/scala/sandcrawler/ScoreJob.scala | 44 +++------------------- .../scala/sandcrawler/CrossrefScorableTest.scala | 3 +- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 2 +- 4 files changed, 30 insertions(+), 57 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 817bee5..b2f6537 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -9,6 +9,7 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource import TDsl._ +import scala.util.parsing.json.JSONObject import java.text.Normalizer import java.util.Arrays @@ -31,7 +32,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource class CrossrefScorable extends Scorable with HBasePipeConversions { - // TODO: Generalize args so there can be multiple Grobid pipes in one job. + // TODO: Generalize args so there can be multiple Crossref pipes in one job. def getSource(args : Args) : Source = { TextLine(args("crossref-input")) } @@ -39,26 +40,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) - .map{ json : String => - CrossrefScorable.crossrefToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) + .map{ json : String => + CrossrefScorable.simplifyJson(json) match { case None => new MapFeatures(Scorable.NoSlug, json) + case Some(map) => new MapFeatures( + Scorable.titleToSlug(map("title").asInstanceOf[String]), + JSONObject(map).toString) } } } -} -object CrossrefScorable { - def crossrefToSlug(json : String) : Option[String] = { - Scorable.jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - // TODO: Don't ignore titles after the first. - val title = map("title").asInstanceOf[List[String]](0) - Some(Scorable.titleToSlug(title)) - } else { - None + object CrossrefScorable { + def simplifyJson(json : String) : Option[Map[String, Any]] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty) { + None + } else { + Some(Map("title" -> titles(0))) + } + } else { + None + } } } } diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index bc5bf87..386b367 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -3,7 +3,7 @@ package sandcrawler import cascading.flow.FlowDef import cascading.tuple.Fields import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv} -//import com.twitter.scalding.typed.TDsl._ +//import com.twitter.scalding.source.TypedText import parallelai.spyglass.base.JobBase import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource @@ -13,7 +13,7 @@ import cascading.pipe.Pipe class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { // TODO: Instantiate any subclass of Scorable specified in args. val sc1 : Scorable = new GrobidScorable() - val sc2 : Scorable = new GrobidScorable() + val sc2 : Scorable = new CrossrefScorable() val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args) val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args) @@ -25,44 +25,10 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { features1.json, features2.json) } - .write(TypedTsv[ReduceOutput](args("output"))) - - /* - val grobidSource = HBaseCrossrefScore.getHBaseSource( - args("hbase-table"), - args("zookeeper-hosts")) - - val source0 : Source = TextLine("foo") - val pipe0 : cascading.pipe.Pipe = source0.read - // This compiles: - val pipe00 : TypedPipe[String] = getFeaturesPipe0(pipe0) - - // Calling a method within ScoreJob compiles fine. - def getFeaturesPipe0(pipe : cascading.pipe.Pipe) : TypedPipe[String] = { - pipe - // This compiles: - .toTypedPipe[String](new Fields("line")) - } - - // Calling a function in a ScoreJob object leads to a compiler error. - val source1 : Source = TextLine("foo") - val pipe1 : cascading.pipe.Pipe = source1.read - // This leads to a compile error: - val pipe11 : TypedPipe[String] = ScoreJob.getFeaturesPipe1(pipe0) - - val pipe : cascading.pipe.Pipe = grobidSource - .read - val grobidPipe : TypedPipe[(String, String)] = pipe - .fromBytesWritable(new Fields("key", "tei_json")) - // Here I CAN call Pipe.toTypedPipe() - .toTypedPipe[(String, String)]('key, 'tei_json) - .write(TypedTsv[(String, String)](args("output"))) - - // Let's try making a method call. -// ScoreJob.etFeaturesPipe(pipe) - - */ + //TypedTsv doesn't work over case classes. + .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } + .write(TypedTsv[(String, Int, String, String)](args("output"))) } // Ugly hack to get non-String information into ScoreJob above. diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 5973ce5..67a8bfe 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -66,7 +66,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests - +/* "crossrefToSlug()" should "get the right slug for a crossref json string" in { val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle) slug should contain ("sometitle") @@ -81,4 +81,5 @@ class CrossrefScorableTest extends FlatSpec with Matchers { val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString) slug shouldBe None } + */ } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 22cbdb8..8acb454 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -148,7 +148,7 @@ class ScoreJobTest extends FlatSpec with Matchers { 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) - .sink[ReduceOutput](TypedTsv[ReduceOutput](output)) { + .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles: // "Title 1", "Title 2: TNG", "Title 3: The Sequel" // crossref slugs: -- cgit v1.2.3 From 728e50a33cec921c9a624439f2e1c8561a6e12ce Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Sat, 11 Aug 2018 21:03:53 -0700 Subject: It compiles. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 54 ++++++++++++++-------- .../main/scala/sandcrawler/GrobidScorable.scala | 21 ++++----- scalding/src/main/scala/sandcrawler/Scorable.scala | 40 +++++++++++----- .../scala/sandcrawler/CrossrefScorableTest.scala | 26 ++++++----- .../scala/sandcrawler/GrobidScorableTest.scala | 19 ++++---- 5 files changed, 96 insertions(+), 64 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index b2f6537..5113b0c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -18,6 +18,7 @@ import java.util.regex.Pattern import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject import cascading.tuple.Fields import com.twitter.scalding._ @@ -40,33 +41,48 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) - .map{ json : String => - CrossrefScorable.simplifyJson(json) match { - case None => new MapFeatures(Scorable.NoSlug, json) - case Some(map) => new MapFeatures( - Scorable.titleToSlug(map("title").asInstanceOf[String]), - JSONObject(map).toString) + .map{ json : String => + Scorable.jsonToMap(json) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(map) => { + if ((map contains "title") && (map contains "DOI")) { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty) { + new MapFeatures(Scorable.NoSlug, json) + } else { + val title = titles(0) + val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String]) + new MapFeatures( + Scorable.mapToSlug(map2), + JSONObject(map2).toString) + } + } else { + new MapFeatures(Scorable.NoSlug, json) + } + } } } } +} - object CrossrefScorable { - def simplifyJson(json : String) : Option[Map[String, Any]] = { - Scorable.jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - val titles = map("title").asInstanceOf[List[String]] - if (titles.isEmpty) { - None - } else { - Some(Map("title" -> titles(0))) - } - } else { +/* +object CrossrefScorable { + def simplifyJson(json : String) : Option[Map[String, Any]] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty) { None + } else { + Some(Map("title" -> titles(0))) } + } else { + None } } } } } + */ diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 61055f2..de9f51a 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -1,5 +1,6 @@ package sandcrawler +import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields @@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions { .read .fromBytesWritable(new Fields("key", "tei_json")) .toTypedPipe[(String, String)](new Fields("key", "tei_json")) - .map { entry => - val (key : String, json : String) = (entry._1, entry._2) - GrobidScorable.grobidToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) - case None => new MapFeatures(Scorable.NoSlug, json) - } - } + .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } } } @@ -36,14 +31,18 @@ object GrobidScorable { HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL) } - def grobidToSlug(json : String) : Option[String] = { + def jsonToMapFeatures(key : String, json : String) : MapFeatures = { Scorable.jsonToMap(json) match { - case None => None + case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) + val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), + sha1=key) + new MapFeatures( + Scorable.mapToSlug(map2), + JSONObject(map2).toString) } else { - None + MapFeatures(Scorable.NoSlug, json) } } } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 0ec8e46..9c8da69 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -2,6 +2,7 @@ package sandcrawler import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef import com.twitter.scalding._ @@ -36,6 +37,21 @@ object Scorable { slug != NoSlug } + // NOTE: I could go all out and make ScorableMap a type. + // TODO: Require year. Other features will get added here. + def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + } + + def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { + JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString + } + + // TODO: Score on more fields than "title". + def isScorableMap(map : Map[String, Any]) : Boolean = { + map.contains("title") + } + def jsonToMap(json : String) : Option[Map[String, Any]] = { // https://stackoverflow.com/a/32717262/631051 val jsonObject = JSON.parseFull(json) @@ -46,18 +62,17 @@ object Scorable { } } - def titleToSlug(title : String) : String = { - if (title == null || title.isEmpty) { + // Map should have been produced by toScorableMap. + // This guarantees it will have all of the fields needed to compute + // the ultimate score, which are a superset of those needed for a slug. + def mapToSlug(map : Map[String, Any]) : String = { + val unaccented = StringUtilities.removeAccents(getString(map, "title")) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) + if (slug.isEmpty || slug == null) { NoSlug } else { - val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) - if (slug.isEmpty || slug == null) { - NoSlug - } else { - slug - } + slug } } @@ -68,8 +83,9 @@ object Scorable { } } - // Caller is responsible for ensuring that key is in map. - def getString(map : Map[String, String], key : String) : String = { + // Caller is responsible for ensuring that key is a String in map. + // TODO: Add and handle ClassCastException + def getString(map : Map[String, Any], key : String) : String = { assert(map contains key) map(key).asInstanceOf[String] } diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 67a8bfe..1c35d66 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -66,20 +66,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers { val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests -/* - "crossrefToSlug()" should "get the right slug for a crossref json string" in { - val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle) - slug should contain ("sometitle") + "simplifyJson()" should "return None for bad JSON" in { + CrossrefScorable.simplifyJson("") shouldBe None + CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None } - it should "return None if given json string without title" in { - val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle) - slug shouldBe None + it should "return None for JSON lacking title" in { + CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None } - it should "return None if given a malformed json string" in { - val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString) - slug shouldBe None + it should "return appropriate result for valid JSON" in { + CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { + case None => fail("None unexpectedly returned by simplifyJson") + case Some(map) => { + Scorable.isScorableMap(map) shouldBe true + map.size shouldBe 1 + map.keys should contain ("title") + map("title") shouldBe "SomeTitle" + } + } } - */ } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 7777610..5bb955a 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -60,18 +60,15 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests - "grobidToSlug()" should "get the right slug for a grobid json string" in { - val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle) - slug should contain ("dummy example file") + "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { + val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None + result.slug shouldBe Scorable.NoSlug + result.json shouldBe MalformedGrobidString } - it should "return None if given json string without title" in { - val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle) - slug shouldBe None - } - - it should "return None if given a malformed json string" in { - val slug = GrobidScorable.grobidToSlug(MalformedGrobidString) - slug shouldBe None + "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in { + val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None + result.slug shouldBe Scorable.NoSlug + result.json shouldBe GrobidStringWithoutTitle } } -- cgit v1.2.3 From 31354b1a6062c5c56a30610f68fa48c82a7e83f0 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Sun, 12 Aug 2018 18:08:51 -0700 Subject: Tests pass. --- scalding/src/main/scala/sandcrawler/Scorable.scala | 11 +-- .../scala/sandcrawler/CrossrefScorableTest.scala | 89 ---------------------- .../scala/sandcrawler/GrobidScorableTest.scala | 20 +++-- .../src/test/scala/sandcrawler/ScorableTest.scala | 28 ++++--- 4 files changed, 39 insertions(+), 109 deletions(-) delete mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 9c8da69..929461b 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -66,13 +66,14 @@ object Scorable { // This guarantees it will have all of the fields needed to compute // the ultimate score, which are a superset of those needed for a slug. def mapToSlug(map : Map[String, Any]) : String = { - val unaccented = StringUtilities.removeAccents(getString(map, "title")) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) - if (slug.isEmpty || slug == null) { + val title = getString(map, "title") + if (title == null) { NoSlug } else { - slug + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty || slug == null) NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala deleted file mode 100644 index 1c35d66..0000000 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ /dev/null @@ -1,89 +0,0 @@ -package sandcrawler - -import cascading.tuple.Fields -import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.util.Bytes -import org.scalatest._ -import parallelai.spyglass.hbase.HBaseConstants.SourceMode - -class CrossrefScorableTest extends FlatSpec with Matchers { - val CrossrefString = -""" -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, - "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, - "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, - "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], - "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", - "content-version" : "vor", - "intended-application" : "text-mining" }, - { "URL" : - "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", - "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], - "subject" : [ "Pediatrics, Perinatology, and Child Health" ] -} -""" - val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") - val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") - val MalformedCrossrefString = CrossrefString.replace("}", "") - - // Unit tests - "simplifyJson()" should "return None for bad JSON" in { - CrossrefScorable.simplifyJson("") shouldBe None - CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None - } - - it should "return None for JSON lacking title" in { - CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None - } - - it should "return appropriate result for valid JSON" in { - CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { - case None => fail("None unexpectedly returned by simplifyJson") - case Some(map) => { - Scorable.isScorableMap(map) shouldBe true - map.size shouldBe 1 - map.keys should contain ("title") - map("title") shouldBe "SomeTitle" - } - } - } -} diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 5bb955a..3fcd856 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers { val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") val MalformedGrobidString = GrobidString.replace("}", "") + val Key = "Dummy Key" // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None + val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) result.slug shouldBe Scorable.NoSlug - result.json shouldBe MalformedGrobidString } - "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in { - val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None + it should "handle missing title" in { + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) result.slug shouldBe Scorable.NoSlug - result.json shouldBe GrobidStringWithoutTitle + } + + it should "handle valid input" in { + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle) + result.slug shouldBe "dummyexamplefile" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map("title").asInstanceOf[String] shouldBe "Dummy Example File" + } + } } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 2f80492..95faacc 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ + private def titleToSlug(s : String) : String = { + Scorable.mapToSlug(Scorable.toScorableMap(title = s)) + } - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hello" } it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" + titleToSlug("hello THERE") shouldBe "hellothere" } it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug + titleToSlug("") shouldBe Scorable.NoSlug } it should "return Scorable.NoSlug if given null" in { - Scorable.titleToSlug(null) shouldBe Scorable.NoSlug + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hello" + titleToSlug("a:b:c") shouldBe "a" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" } - "titleToSlug()" should "strip punctuation" in { - Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" - Scorable.titleToSlug("a:b:c") shouldBe "a" - Scorable.titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobar" + titleToSlug("\na\t:b:c") shouldBe "a" } "jsonToMap()" should "return a map, given a legal JSON string" in { -- cgit v1.2.3 From 05c0213547f29842bbae6faaf77e983a364d4a2e Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Sun, 12 Aug 2018 18:41:27 -0700 Subject: Added back file I shouldn't have deleted. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 22 ------ .../scala/sandcrawler/CrossrefScorableTest.scala | 89 ++++++++++++++++++++++ 2 files changed, 89 insertions(+), 22 deletions(-) create mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 5113b0c..667a5cc 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -64,25 +64,3 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { } } } - -/* -object CrossrefScorable { - def simplifyJson(json : String) : Option[Map[String, Any]] = { - Scorable.jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - val titles = map("title").asInstanceOf[List[String]] - if (titles.isEmpty) { - None - } else { - Some(Map("title" -> titles(0))) - } - } else { - None - } - } - } - } -} - */ diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala new file mode 100644 index 0000000..1c35d66 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -0,0 +1,89 @@ +package sandcrawler + +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import org.apache.hadoop.hbase.io.ImmutableBytesWritable +import org.apache.hadoop.hbase.util.Bytes +import org.scalatest._ +import parallelai.spyglass.hbase.HBaseConstants.SourceMode + +class CrossrefScorableTest extends FlatSpec with Matchers { + val CrossrefString = +""" +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, + "delay-in-days" : 0, "content-version" : "tdm" }], + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "DOI" : "<<DOI>>", + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, + "title" : [ "<<TITLE>>" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, + { "URL" : + "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", + "content-type" : "text/plain", + "content-version" : "vor", + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "subject" : [ "Pediatrics, Perinatology, and Child Health" ] +} +""" + val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") + val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") + val MalformedCrossrefString = CrossrefString.replace("}", "") + + // Unit tests + "simplifyJson()" should "return None for bad JSON" in { + CrossrefScorable.simplifyJson("") shouldBe None + CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None + } + + it should "return None for JSON lacking title" in { + CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None + } + + it should "return appropriate result for valid JSON" in { + CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { + case None => fail("None unexpectedly returned by simplifyJson") + case Some(map) => { + Scorable.isScorableMap(map) shouldBe true + map.size shouldBe 1 + map.keys should contain ("title") + map("title") shouldBe "SomeTitle" + } + } + } +} -- cgit v1.2.3 From 5615428921a45ba6a2fb005b255a28dcbb83b13f Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Sun, 12 Aug 2018 19:12:32 -0700 Subject: Snapshot before changing Scorable to find bug. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 41 ++++++++++++---------- scalding/src/main/scala/sandcrawler/Scorable.scala | 1 - .../scala/sandcrawler/CrossrefScorableTest.scala | 24 ++++++------- .../scala/sandcrawler/GrobidScorableTest.scala | 1 + .../src/test/scala/sandcrawler/ScoreJobTest.scala | 15 +++++--- 5 files changed, 46 insertions(+), 36 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 667a5cc..e257152 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -41,26 +41,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) - .map{ json : String => - Scorable.jsonToMap(json) match { - case None => MapFeatures(Scorable.NoSlug, json) - case Some(map) => { - if ((map contains "title") && (map contains "DOI")) { - val titles = map("title").asInstanceOf[List[String]] - if (titles.isEmpty) { - new MapFeatures(Scorable.NoSlug, json) - } else { - val title = titles(0) - val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String]) - new MapFeatures( - Scorable.mapToSlug(map2), - JSONObject(map2).toString) - } - } else { - new MapFeatures(Scorable.NoSlug, json) - } + .map { CrossrefScorable.jsonToMapFeatures(_) } + } +} + +object CrossrefScorable { + def jsonToMapFeatures(json : String) : MapFeatures = { + Scorable.jsonToMap(json) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(map) => { + if ((map contains "titles") && (map contains "DOI")) { + val titles = map("titles").asInstanceOf[List[String]] + val doi = Scorable.getString(map, "DOI") + if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { + new MapFeatures(Scorable.NoSlug, json) + } else { + val title = titles(0) + val map2 = Scorable.toScorableMap(title=title, doi=doi) + new MapFeatures( + Scorable.mapToSlug(map2), + JSONObject(map2).toString) } + } else { + new MapFeatures(Scorable.NoSlug, json) } } + } } } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 929461b..a256fa4 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -7,7 +7,6 @@ import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef import com.twitter.scalding._ import com.twitter.scalding.typed.TDsl._ -//import TDsl._ case class MapFeatures(slug : String, json : String) case class ReduceFeatures(json : String) diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 1c35d66..dc6f347 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -66,23 +66,23 @@ class CrossrefScorableTest extends FlatSpec with Matchers { val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests - "simplifyJson()" should "return None for bad JSON" in { - CrossrefScorable.simplifyJson("") shouldBe None - CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None + "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { + val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) + result.slug shouldBe Scorable.NoSlug } - it should "return None for JSON lacking title" in { - CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None + it should "handle missing title" in { + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) + result.slug shouldBe Scorable.NoSlug } - it should "return appropriate result for valid JSON" in { - CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { - case None => fail("None unexpectedly returned by simplifyJson") + it should "handle valid input" in { + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle) + result.slug shouldBe "dummyexamplefile" + Scorable.jsonToMap(result.json) match { + case None => fail() case Some(map) => { - Scorable.isScorableMap(map) shouldBe true - map.size shouldBe 1 - map.keys should contain ("title") - map("title") shouldBe "SomeTitle" + map("title").asInstanceOf[String] shouldBe "Dummy Example File" } } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 3fcd856..4b958b9 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -77,6 +77,7 @@ class GrobidScorableTest extends FlatSpec with Matchers { Scorable.jsonToMap(result.json) match { case None => fail() case Some(map) => { + map should contain key "title" map("title").asInstanceOf[String] shouldBe "Dummy Example File" } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 8acb454..8436817 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -149,11 +149,16 @@ class ScoreJobTest extends FlatSpec with Matchers { 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { - // Grobid titles: - // "Title 1", "Title 2: TNG", "Title 3: The Sequel" - // crossref slugs: - // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" - // Join should have 3 "Title 1" slugs and 1 "Title 2" slug + // Grobid titles and slugs (in parentheses): + // Title 1 (title1) + // Title 2: TNG (title2) + // Title 3: The Sequel (title3) + // crossref titles and slugs (in parentheses): + // Title 1: TNG (title1) + // Title 1: TNG 2 (title1) + // Title 1: TNG 3 (title1) + // Title 2 Rebooted (title2rebooted) + // Join should have 3 "title1" slugs and 1 "title2" slug outputBuffer => "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 -- cgit v1.2.3 From 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Mon, 13 Aug 2018 09:58:27 -0700 Subject: Pipeline works, all tests pass, no scalastyle errors. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 28 +-- .../main/scala/sandcrawler/GrobidScorable.scala | 3 +- .../scala/sandcrawler/HBaseCrossrefScoreJob.scala | 218 --------------------- scalding/src/main/scala/sandcrawler/Scorable.scala | 2 +- scalding/src/main/scala/sandcrawler/ScoreJob.scala | 51 +---- .../scala/sandcrawler/CrossrefScorableTest.scala | 6 +- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 80 +++++--- 7 files changed, 65 insertions(+), 323 deletions(-) delete mode 100644 scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index e257152..4558ee6 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -1,36 +1,14 @@ package sandcrawler -import cascading.flow.FlowDef -import cascading.pipe.Pipe -import cascading.tuple.Fields -import com.twitter.scalding._ -import com.twitter.scalding.typed.TDsl._ -import parallelai.spyglass.hbase.HBaseConstants.SourceMode -import parallelai.spyglass.hbase.HBasePipeConversions -import parallelai.spyglass.hbase.HBaseSource -import TDsl._ -import scala.util.parsing.json.JSONObject - -import java.text.Normalizer -import java.util.Arrays -import java.util.Properties -import java.util.regex.Pattern - import scala.math import scala.util.parsing.json.JSON import scala.util.parsing.json.JSONObject +import cascading.flow.FlowDef import cascading.tuple.Fields import com.twitter.scalding._ -import com.twitter.scalding.typed.CoGrouped -import com.twitter.scalding.typed.Grouped import com.twitter.scalding.typed.TDsl._ -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.util.Bytes -import parallelai.spyglass.base.JobBase -import parallelai.spyglass.hbase.HBaseConstants.SourceMode import parallelai.spyglass.hbase.HBasePipeConversions -import parallelai.spyglass.hbase.HBaseSource class CrossrefScorable extends Scorable with HBasePipeConversions { // TODO: Generalize args so there can be multiple Crossref pipes in one job. @@ -50,8 +28,8 @@ object CrossrefScorable { Scorable.jsonToMap(json) match { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { - if ((map contains "titles") && (map contains "DOI")) { - val titles = map("titles").asInstanceOf[List[String]] + if ((map contains "title") && (map contains "DOI")) { + val titles = map("title").asInstanceOf[List[String]] val doi = Scorable.getString(map, "DOI") if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { new MapFeatures(Scorable.NoSlug, json) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index de9f51a..94b3494 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -1,15 +1,14 @@ package sandcrawler import scala.util.parsing.json.JSONObject + import cascading.flow.FlowDef -import cascading.pipe.Pipe import cascading.tuple.Fields import com.twitter.scalding._ import com.twitter.scalding.typed.TDsl._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource -//import TDsl._ class GrobidScorable extends Scorable with HBasePipeConversions { def getSource(args : Args) : Source = { diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala deleted file mode 100644 index 018a74b..0000000 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ /dev/null @@ -1,218 +0,0 @@ -package sandcrawler - -import java.text.Normalizer -import java.util.Arrays -import java.util.Properties -import java.util.regex.Pattern - -import scala.math -import scala.util.parsing.json.JSON - -import cascading.tuple.Fields -import com.twitter.scalding._ -import com.twitter.scalding.typed.CoGrouped -import com.twitter.scalding.typed.Grouped -import com.twitter.scalding.typed.TDsl._ -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.util.Bytes -import parallelai.spyglass.base.JobBase -import parallelai.spyglass.hbase.HBaseConstants.SourceMode -import parallelai.spyglass.hbase.HBasePipeConversions -import parallelai.spyglass.hbase.HBaseSource -import TDsl._ - -class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions { - val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable - - // key is SHA1 - val grobidSource = HBaseCrossrefScore.getHBaseSource( - args("hbase-table"), - args("zookeeper-hosts")) - - val temp : cascading.pipe.Pipe = grobidSource - .read - - // Here I CAN call Pipe.toTypedPipe() - val grobidPipe : TypedPipe[(String, String, String)] = temp - .fromBytesWritable(new Fields("key", "tei_json")) - .toTypedPipe[(String, String)]('key, 'tei_json) - .map { entry => - val (key, json) = (entry._1, entry._2) - HBaseCrossrefScore.grobidToSlug(json) match { - case Some(slug) => (slug, key, json) - case None => (NoTitle, key, json) - } - } - .filter { entry => - val (slug, _, _) = entry - slug != NoTitle - } - - val grobidGroup = grobidPipe - .groupBy { case (slug, key, json) => slug } - - val crossrefSource = TextLine(args("crossref-input")) - val temp2 : cascading.pipe.Pipe = crossrefSource.read - val crossrefPipe : TypedPipe[(String, String)] = temp2 - // .debug // Should be 4 tuples for mocked data - .toTypedPipe[String]('line) - .map{ json : String => - HBaseCrossrefScore.crossrefToSlug(json) match { - case Some(slug) => (slug, json) - case None => (NoTitle, json) - } - } - .filter { entry => - val (slug, json) = entry - slug != NoTitle - } - - val crossrefGroup = crossrefPipe - .groupBy { case (slug, json) => slug } - - val theJoin : CoGrouped[String, ((String, String, String), (String, String))] = - grobidGroup.join(crossrefGroup) - - theJoin.map{ entry => - val (slug : String, - ((slug0: String, sha1 : String, grobidJson : String), - (slug1 : String, crossrefJson : String))) = entry - HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)} - // Output: score, sha1, doi, grobid title, crossref title - .write(TypedTsv[(Int, String, String, String, String)](args("output"))) - -} - -object HBaseCrossrefScore { - def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = HBaseBuilder.build( - hbaseTable, // HBase Table Name - zookeeperHosts, // HBase Zookeeper server (to get runtime config info; can be array?) - List("grobid0:tei_json"), - SourceMode.SCAN_ALL) - - def jsonToMap(json : String) : Option[Map[String, Any]] = { - // https://stackoverflow.com/a/32717262/631051 - val jsonObject = JSON.parseFull(json) - if (jsonObject == None) { - None - } else { - Some(jsonObject.get.asInstanceOf[Map[String, Any]]) - } - } - - def grobidToSlug(json : String) : Option[String] = { - jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - titleToSlug(map("title").asInstanceOf[String]) - } else { - None - } - } - } - } - - def crossrefToSlug(json : String) : Option[String] = { - jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - // TODO: Don't ignore titles after the first. - titleToSlug(map("title").asInstanceOf[List[String]](0)) - } else { - None - } - } - } - } - - def titleToSlug(title : String) : Option[String] = { - val slug = removeAccents(title).split(":")(0).toLowerCase() - if (slug.isEmpty) { - None - } else { - Some(slug) - } - } - - val MaxScore = 1000 - - def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) : - // (score, sha1, doi, grobidTitle, crossrefTitle) - (Int, String, String, String, String) = { - jsonToMap(grobidJson) match { - case None => (0, "", "", "", "") // This can't happen, because grobidJson already validated in earlier stage - case Some(grobid) => { - val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase() - - jsonToMap(crossrefJson) match { - case None => (0, "", "", "", "") // This can't happen, because crossrefJson already validated in earlier stage - case Some(crossref) => { - val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase() - - (similarity(removeAccents(grobidTitle), removeAccents(crossrefTitle)), - sha1, - crossref("DOI").asInstanceOf[String], - "'" + grobidTitle + "'", - "'" + crossrefTitle + "'") - } - } - } - } - } - - // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934 - def removeAccents(s : String) : String = { - val replacements = Map( - '\u0141' -> 'L', - '\u0142' -> 'l', // Letter ell - '\u00d8' -> 'O', - '\u00f8' -> 'o' - ) - val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD)) - for (i <- 0 to sb.length - 1) { - for (key <- replacements.keys) { - if (sb(i) == key) { - sb.deleteCharAt(i); - sb.insert(i, replacements(key)) - } - } - } - val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+") - pattern.matcher(sb).replaceAll("") - } - - // Adapted from: https://stackoverflow.com/a/16018452/631051 - def similarity(s1 : String, s2 : String) : Int = { - val longer : String = if (s1.length > s2.length) s1 else s2 - val shorter : String = if (s1.length > s2.length) s2 else s1 - if (longer.length == 0) { - // Both strings are empty. - MaxScore - } else { - (longer.length - stringDistance(longer, shorter)) * MaxScore / longer.length - } - } - - // Source: // https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ - def stringDistance(s1: String, s2: String): Int = { - val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]() - def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c) - def sd(s1: List[Char], s2: List[Char]): Int = { - if (!memo.contains((s1, s2))) { - memo((s1,s2)) = (s1, s2) match { - case (_, Nil) => s1.length - case (Nil, _) => s2.length - case (c1::t1, c2::t2) => - min( sd(t1,s2) + 1, sd(s1,t2) + 1, - sd(t1,t2) + (if (c1==c2) 0 else 1) ) - } - } - memo((s1,s2)) - } - - sd( s1.toList, s2.toList ) - } -} - diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index a256fa4..717b2d5 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -39,7 +39,7 @@ object Scorable { // NOTE: I could go all out and make ScorableMap a type. // TODO: Require year. Other features will get added here. def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) } def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 386b367..75d45e9 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -1,16 +1,12 @@ package sandcrawler -import cascading.flow.FlowDef -import cascading.tuple.Fields -import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv} -//import com.twitter.scalding.source.TypedText -import parallelai.spyglass.base.JobBase -import parallelai.spyglass.hbase.HBasePipeConversions -import parallelai.spyglass.hbase.HBaseSource -import com.twitter.scalding.{ Dsl, RichPipe, IterableSource, TupleSetter, TupleConverter } import cascading.pipe.Pipe +import com.twitter.scalding.Args +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.TypedTsv +import parallelai.spyglass.base.JobBase -class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { +class ScoreJob(args: Args) extends JobBase(args) { // TODO: Instantiate any subclass of Scorable specified in args. val sc1 : Scorable = new GrobidScorable() val sc2 : Scorable = new CrossrefScorable() @@ -27,10 +23,10 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { } //TypedTsv doesn't work over case classes. .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } - .write(TypedTsv[(String, Int, String, String)](args("output"))) } +/* // Ugly hack to get non-String information into ScoreJob above. object ScoreJob { var scorable1 : Option[Scorable] = None @@ -57,38 +53,5 @@ object ScoreJob { case None => null } } - - /* - implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read) - - // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields - implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe = - IterableSource[T](iter)(set, conv).read - - implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = - RichPipe(toPipe(iter)(set, conv)) - - // Provide args as an implicit val for extensions such as the Checkpoint extension. -// implicit protected def _implicitJobArgs: Args = args - - def getFeaturesPipe1(pipe : cascading.pipe.Pipe) : TypedPipe[String] = { - pipe - // The next line gives an error: value toTypedPipe is not a member of cascading.pipe.Pipe - .toTypedPipe[String](new Fields("line")) - } - - def getFeaturesPipe(pipe : cascading.pipe.Pipe) : TypedPipe[MapFeatures] = { - pipe - .fromBytesWritable(new Fields("key", "tei_json")) - // I needed to change symbols to strings when I pulled this out of ScoreJob. - .toTypedPipe[(String, String)](new Fields("key", "tei_json")) - .map { entry => - val (key : String, json : String) = (entry._1, entry._2) - GrobidScorable.grobidToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) - case None => new MapFeatures(Scorable.NoSlug, json) - } - } - } - */ } + */ diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index dc6f347..75be03e 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -61,7 +61,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ - val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") + val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -78,11 +78,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers { it should "handle valid input" in { val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle) - result.slug shouldBe "dummyexamplefile" + result.slug shouldBe "sometitle" Scorable.jsonToMap(result.json) match { case None => fail() case Some(map) => { - map("title").asInstanceOf[String] shouldBe "Dummy Example File" + map("title").asInstanceOf[String] shouldBe "Some Title" } } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 8436817..f0b411f 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -113,25 +113,32 @@ class ScoreJobTest extends FlatSpec with Matchers { val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") + val CrossrefStrings = List( + CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), + CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), + CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), + CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")) // Pipeline tests val output = "/tmp/testOutput" val input = "/tmp/testInput" val (testTable, testHost) = ("test-table", "dummy-host:2181") - val grobidSampleData = List( - List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), - List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), - List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), - List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), - Bytes.toBytes(MalformedGrobidString))) + val Sha1Strings = List( + "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", + "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", + "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", + "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56") - // TODO: Make less yucky. - ScoreJob.setScorable1(new CrossrefScorable()) - ScoreJob.setScorable2(new GrobidScorable()) + val GrobidStrings = List( + GrobidString.replace("<<TITLE>>", "Title 1"), + GrobidString.replace("<<TITLE>>", "Title 2: TNG"), + GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"), + MalformedGrobidString) + + val GrobidSampleData = (Sha1Strings zip GrobidStrings) + .map{case(s, g) => + List(Bytes.toBytes(s), Bytes.toBytes(g))} JobTest("sandcrawler.ScoreJob") .arg("test", "") @@ -142,12 +149,12 @@ class ScoreJobTest extends FlatSpec with Matchers { .arg("crossref-input", input) .arg("debug", "true") .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), - grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) + GrobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) .source(TextLine(input), List( - 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), - 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), - 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), - 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) + 0 -> CrossrefStrings(0), + 1 -> CrossrefStrings(1), + 2 -> CrossrefStrings(2), + 3 -> CrossrefStrings(3))) .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles and slugs (in parentheses): // Title 1 (title1) @@ -155,27 +162,40 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 3: The Sequel (title3) // crossref titles and slugs (in parentheses): // Title 1: TNG (title1) - // Title 1: TNG 2 (title1) + // Title 1: TNG 2A (title1) // Title 1: TNG 3 (title1) - // Title 2 Rebooted (title2rebooted) + // Title 2: Rebooted (title2) // Join should have 3 "title1" slugs and 1 "title2" slug outputBuffer => "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 } - /* - it should "return the right first entry" in { - outputBuffer(0) shouldBe ReduceOutput("slug", 50, "", - "") - val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0) - slug shouldBe "title 1" - slug shouldBe slug0 - slug shouldBe slug1 - sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8") - grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8") + it should "has right # of entries with each slug" in { + val slugs = outputBuffer.map(_._1) + val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size) + countMap("title1") shouldBe 3 + countMap("title2") shouldBe 1 + } + + def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = { + val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( + Sha1Strings(grobidIndex), + GrobidStrings(grobidIndex)) + val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( + CrossrefStrings(crossrefIndex)) + val score = Scorable.computeSimilarity( + ReduceFeatures(mf1.json), + ReduceFeatures(mf2.json)) + (slug, score, mf1.json, mf2.json) + } + + it should "have right output values" in { + outputBuffer.exists(_ == bundle("title1", 0, 0)) + outputBuffer.exists(_ == bundle("title1", 0, 2)) + outputBuffer.exists(_ == bundle("title1", 0, 1)) + outputBuffer.exists(_ == bundle("title2", 1, 3)) } - */ } .run .finish -- cgit v1.2.3 From 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus <ellen.spertus@gmail.com> Date: Tue, 14 Aug 2018 20:38:29 -0700 Subject: Fixed style problems (or disabled warning when appropriate) for tests. --- scalding/build.sbt | 7 ++ .../scala/sandcrawler/CrossrefScorableTest.scala | 87 ++++++++++--------- .../scala/sandcrawler/GrobidScorableTest.scala | 7 +- .../test/scala/sandcrawler/HBaseBuilderTest.scala | 1 + .../scala/sandcrawler/HBaseMimeCountTest.scala | 9 +- .../test/scala/sandcrawler/HBaseRowCountTest.scala | 11 +-- .../scala/sandcrawler/HBaseStatusCountTest.scala | 10 ++- .../scala/sandcrawler/ScorableFeaturesTest.scala | 1 + .../src/test/scala/sandcrawler/ScorableTest.scala | 5 +- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 97 ++++++++++++---------- 10 files changed, 135 insertions(+), 100 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/build.sbt b/scalding/build.sbt index 2addd60..d477399 100644 --- a/scalding/build.sbt +++ b/scalding/build.sbt @@ -20,6 +20,13 @@ lazy val root = (project in file(".")). scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) }, + (scalastyleSources in Test) := { + // all .scala files in "src/test/scala" + val scalaSourceFiles = ((scalaSource in Test).value ** "*.scala").get + val dirNameToExclude = "/example/" + scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude)) + }, + name := "sandcrawler", resolvers += "conjars.org" at "http://conjars.org/repo", diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 75be03e..e171dba 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -2,72 +2,77 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class CrossrefScorableTest extends FlatSpec with Matchers { + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", + "content-type" : "text/xml", "content-version" : "vor", - "intended-application" : "text-mining" }, + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) + val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 4b958b9..661824b 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ @@ -62,7 +65,7 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) + val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) result.slug shouldBe Scorable.NoSlug } diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala index 603a4c7..c61cb22 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala @@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers { fields should have length 0 } + //scalastyle:off no.whitespace.before.left.bracket it should "throw IllegalArgumentException on malformed input" in { a [IllegalArgumentException] should be thrownBy { HBaseBuilder.parseColSpecs(List("file_size")) diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala index fde2290..d6d283f 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala index 3424a36..c4ca5aa 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala @@ -1,15 +1,18 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ /** @@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions { outputBuffer => it("should return the test data provided.") { - println("outputBuffer.size => " + outputBuffer.size) assert(outputBuffer.size === 1) } it("should return the correct count") { - println("raw output => " + outputBuffer) assert(outputBuffer(0).getObject(0) === 8) } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala index 8a71f31..fe3ff21 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala @@ -1,15 +1,19 @@ package sandcrawler -import cascading.tuple.{Tuple, Fields} -import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions} +import cascading.tuple.Fields +import cascading.tuple.Tuple +import com.twitter.scalding.JobTest +import com.twitter.scalding.Tsv +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.junit.runner.RunWith import org.scalatest.FunSpec import org.scalatest.junit.JUnitRunner import org.slf4j.LoggerFactory -import parallelai.spyglass.hbase.HBaseSource import parallelai.spyglass.hbase.HBaseConstants.SourceMode +import parallelai.spyglass.hbase.HBaseSource import scala._ @RunWith(classOf[JUnitRunner]) diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 7ec0c4d..f9c30a2 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -2,6 +2,7 @@ package sandcrawler import org.scalatest._ +// scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { private def titleToSlug(s : String) : String = { new ScorableFeatures(title = s).toSlug diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index fd44f57..f63bef8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -2,7 +2,10 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 1c6ae83..34081a5 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -2,13 +2,17 @@ package sandcrawler import cascading.tuple.Fields import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} +import com.twitter.scalding.JobTest +import com.twitter.scalding.TextLine +import com.twitter.scalding.TupleConversions +import com.twitter.scalding.TypedTsv import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScoreJobTest extends FlatSpec with Matchers { + //scalastyle:off val JsonString = """ { "title": "<<TITLE>>", @@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers { "annex": null } """ + // scalastyle:on val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File") val JsonStringWithoutTitle = JsonString.replace("title", "nottitle") val MalformedJsonString = JsonString.replace("}", "") + // scalastyle:off val CrossrefString = """ -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, - "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], - "date-time" : "2017-10-23T17:19:16Z", - "timestamp" : { "$numberLong" : "1508779156477" } }, - "reference-count" : 0, - "publisher" : "Elsevier BV", - "issue" : "3", - "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", - "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], - "date-time" : "1996-01-01T00:00:00Z", - "timestamp" : { "$numberLong" : "820454400000" } }, +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : { "$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, "delay-in-days" : 0, "content-version" : "tdm" }], - "content-domain" : { "domain" : [], "crossmark-restriction" : false }, - "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, "DOI" : "<<DOI>>", - "type" : "journal-article", - "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], - "date-time" : "2002-07-25T15:09:41Z", - "timestamp" : { "$numberLong" : "1027609781000" } }, - "page" : "186-187", - "source" : "Crossref", - "is-referenced-by-count" : 0, + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, "title" : [ "<<TITLE>>" ], - "prefix" : "10.1016", - "volume" : "9", - "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], - "member" : "78", - "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", - "content-type" : "text/xml", - "content-version" : "vor", - "intended-application" : "text-mining" }, + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", - "content-type" : "text/plain", + "content-type" : "text/plain", "content-version" : "vor", - "intended-application" : "text-mining" } ], - "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], - "date-time" : "2015-09-03T10:03:43Z", - "timestamp" : { "$numberLong" : "1441274623000" } }, - "score" : 1, - "issued" : { "date-parts" : [ [ 1996 ] ] }, - "references-count" : 0, - "alternative-id" : [ "0987-7983(96)87729-2" ], - "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", - "ISSN" : [ "0987-7983" ], - "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } """ + // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers { 2 -> CrossrefStrings(2), 3 -> CrossrefStrings(3))) .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { - // Grobid titles and slugs (in parentheses): + // Grobid titles and slugs (in parentheses): // Title 1 (title1) // Title 2: TNG (title2) // Title 3: The Sequel (title3) @@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 1: TNG 3 (title1) // Title 2: Rebooted (title2) // Join should have 3 "title1" slugs and 1 "title2" slug - outputBuffer => + outputBuffer => "The pipeline" should "return a 4-element list" in { outputBuffer should have length 4 } @@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers { countMap("title2") shouldBe 1 } - def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = { + def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( - Sha1Strings(grobidIndex), + Sha1Strings(grobidIndex), JsonStrings(grobidIndex)) val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( CrossrefStrings(crossrefIndex)) -- cgit v1.2.3 From a3bf1d47fac53b818a8118020adced6c54be7cba Mon Sep 17 00:00:00 2001 From: Bryan Newbold <bnewbold@archive.org> Date: Wed, 15 Aug 2018 20:22:44 -0700 Subject: crossref: test for empty-string title --- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index e171dba..1789d1a 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -67,6 +67,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { """ // scalastyle:on val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") + val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -81,6 +82,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers { result.slug shouldBe Scorable.NoSlug } + it should "handle empty title" in { + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) + result.slug shouldBe Scorable.NoSlug + } + it should "handle valid input" in { val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle) result.slug shouldBe "sometitle" -- cgit v1.2.3