From 98f78c0ef17436f87991169b4a7bedadf602527a Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Mon, 27 Aug 2018 18:05:21 -0700 Subject: replaced NoSlug with proper use of Option --- .../src/main/scala/sandcrawler/BibjsonScorable.scala | 15 +++++++++------ .../src/main/scala/sandcrawler/CrossrefScorable.scala | 15 +++++++++------ .../src/main/scala/sandcrawler/GrobidScorable.scala | 8 ++++---- .../main/scala/sandcrawler/GrobidScorableDumpJob.scala | 7 ++----- .../src/main/scala/sandcrawler/MatchBenchmarkJob.scala | 3 ++- scalding/src/main/scala/sandcrawler/Scorable.scala | 17 +++++++---------- .../src/main/scala/sandcrawler/ScorableFeatures.scala | 13 ++++++++----- 7 files changed, 41 insertions(+), 37 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala index 0d26d75..abf9220 100644 --- a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala +++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala @@ -15,7 +15,7 @@ class BibjsonScorable extends Scorable { TextLine(args("bibjson-input")) } - def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { + def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = { getSource(args).read .toTypedPipe[String](new Fields("line")) .map { BibjsonScorable.bibjsonToMapFeatures(_) } @@ -23,9 +23,9 @@ class BibjsonScorable extends Scorable { } object BibjsonScorable { - def bibjsonToMapFeatures(json : String) : MapFeatures = { + def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = { Scorable.jsonToMap(json) match { - case None => MapFeatures(Scorable.NoSlug, json) + case None => None case Some(map) => { if (map contains "title") { val title = Scorable.getString(map, "title") @@ -33,13 +33,16 @@ object BibjsonScorable { val sha1 = Scorable.getString(map, "sha") // TODO: year, authors (if available) if (title == null || title.isEmpty) { - new MapFeatures(Scorable.NoSlug, json) + None } else { val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1) - new MapFeatures(sf.toSlug, sf.toString) + sf.toSlug match { + case None => None + case Some(slug) => Some(MapFeatures(slug, sf.toString)) + } } } else { - new MapFeatures(Scorable.NoSlug, json) + None } } } diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index f51c210..c13945f 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -17,7 +17,7 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { TextLine(args("crossref-input")) } - def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { + def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = { getSource(args).read .toTypedPipe[String](new Fields("line")) .filter { CrossrefScorable.keepRecord(_) } @@ -116,22 +116,25 @@ object CrossrefScorable { } } - def jsonToMapFeatures(json : String) : MapFeatures = { + def jsonToMapFeatures(json : String) : Option[MapFeatures] = { Scorable.jsonToMap(json) match { - case None => MapFeatures(Scorable.NoSlug, json) + case None => None case Some(map) => mapToTitle(map) match { - case None => MapFeatures(Scorable.NoSlug, json) + case None => None case Some(title) => { val doi = Scorable.getString(map, "DOI") val authors: List[String] = mapToAuthorList(map) val year: Int = mapToYear(map).getOrElse(0) val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE") if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) { - MapFeatures(Scorable.NoSlug, json) + None } else { val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) - MapFeatures(sf.toSlug, sf.toString) + sf.toSlug match { + case None => None + case Some(slug) => Some(MapFeatures(slug, sf.toString)) + } } } } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 899ce66..f4ed129 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -20,7 +20,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions { GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts")) } - def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { + def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = { getSource(args) .read // Can't just "fromBytesWritable" because we have multiple types @@ -65,16 +65,16 @@ object GrobidScorable { HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) } - def jsonToMapFeatures(key : String, json : String) : MapFeatures = { + def jsonToMapFeatures(key : String, json : String) : Option[MapFeatures] = { Scorable.jsonToMap(json) match { - case None => MapFeatures(Scorable.NoSlug, json) + case None => None case Some(map) => { if (map contains "title") { val authors: List[String] = mapToAuthorList(map) val title = Scorable.getString(map, "title") ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures } else { - MapFeatures(Scorable.NoSlug, json) + None } } } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala index 19b257f..d40410b 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala @@ -40,11 +40,8 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) { parsedGrobidRows.inc GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } - .filter { entry => Scorable.isValidSlug(entry.slug) } - .map { entry => - validGrobidRows.inc - entry - } + .filterNot { entry => entry.isEmpty } + .map { entry => entry.get } .groupBy { case MapFeatures(slug, json) => slug } .map { tuple => val (slug : String, features : MapFeatures) = tuple diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala index 1578258..292de75 100644 --- a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala +++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala @@ -15,7 +15,8 @@ class MatchBenchmarkJob(args: Args) extends JobBase(args) { val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs) val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs) - pipe1.join(pipe2).map { entry => + pipe1.join(pipe2) + .map { entry => val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry new ReduceOutput( slug, diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index f7eb95d..5d67044 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -13,10 +13,12 @@ case class ReduceFeatures(json : String) case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String) abstract class Scorable { - def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] = - { - getFeaturesPipe(args) - .filter { entry => Scorable.isValidSlug(entry.slug) } + def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] = { + val validFeatures : TypedPipe[MapFeatures] = getFeaturesPipe(args) + .filterNot { entry => entry.isEmpty } + .map { entry => entry.get } + + validFeatures .groupBy { case MapFeatures(slug, json) => slug } .map { tuple => val (slug : String, features : MapFeatures) = tuple @@ -26,16 +28,11 @@ abstract class Scorable { // abstract methods def getSource(args : Args) : Source - def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] + def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] } object Scorable { val MaxTitleLength = 1023 - val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable - - def isValidSlug(slug : String) : Boolean = { - slug != NoSlug - } def jsonToMap(json : String) : Option[Map[String, Any]] = { // https://stackoverflow.com/a/32717262/631051 diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 241db79..b56f102 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -35,9 +35,9 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea JSONObject(toMap).toString } - def toSlug() : String = { + def toSlug() : Option[String] = { if (title == null) { - Scorable.NoSlug + None } else { val unaccented = StringUtilities.removeAccents(title) // Remove punctuation @@ -45,10 +45,13 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug) - || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug + || (slug.length < ScorableFeatures.MinSlugLength)) None else Some(slug) } } - def toMapFeatures : MapFeatures = - MapFeatures(toSlug, toString) + def toMapFeatures : Option[MapFeatures] = + toSlug match { + case None => None + case Some(slug) => Some(MapFeatures(slug, toString)) + } } -- cgit v1.2.3 From fc2e349f2bdadf5e05734a9304d20a84d4b6d927 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 28 Aug 2018 07:37:28 -0700 Subject: fixed tests after replacing NoSlug with None --- .../scala/sandcrawler/CrossrefScorableTest.scala | 67 +++++++++++----------- .../scala/sandcrawler/GrobidScorableTest.scala | 27 ++++----- .../scala/sandcrawler/ScorableFeaturesTest.scala | 45 +++++++-------- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 23 +++++--- 4 files changed, 85 insertions(+), 77 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index c0d1cb0..8302b8f 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -79,59 +79,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers { // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None) } it should "handle missing title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None) } it should "handle null title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None) } it should "handle empty title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None) } it should "handle subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshortjustright" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshortjustright" + } } it should "handle empty subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshort" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshort" + } } it should "handle null subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshort" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshort" + } } it should "handle missing authors" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None) } it should "handle valid input" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) - result.slug shouldBe "sometitle" - Scorable.jsonToMap(result.json) match { + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match { case None => fail() - case Some(map) => { - map("title").asInstanceOf[String] shouldBe "Some Title" - map("doi").asInstanceOf[String] shouldBe "10.123/abc" - // TODO: full name? not just a string? - map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") - map("year").asInstanceOf[Double].toInt shouldBe 2002 + case Some(result) => { + result.slug shouldBe "sometitle" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map("title").asInstanceOf[String] shouldBe "Some Title" + map("doi").asInstanceOf[String] shouldBe "10.123/abc" + // TODO: full name? not just a string? + map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") + map("year").asInstanceOf[Double].toInt shouldBe 2002 + } + } } } } @@ -161,9 +166,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { } it should "handle content types" in { - val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) - resultWrong.slug shouldBe Scorable.NoSlug - val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) - resultMissing.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None) + CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None) } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 119cf90..b395a64 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -68,29 +68,30 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None) } it should "handle null title" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None) } it should "handle missing title" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None) } it should "handle valid input" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) - result.slug shouldBe "dummyexamplefile" - Scorable.jsonToMap(result.json) match { + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match { case None => fail() - case Some(map) => { - map should contain key "title" - map("title").asInstanceOf[String] shouldBe "Dummy Example File" - map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") + case Some(result) => { + result.slug shouldBe "dummyexamplefile" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map should contain key "title" + map("title").asInstanceOf[String] shouldBe "Dummy Example File" + map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") + } + } } } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 450c169..3f6b87c 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -8,60 +8,57 @@ import org.scalatest._ // scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { - - private def titleToSlug(s : String) : String = { - ScorableFeatures.create(title = s).toSlug - } - "toMapFeatures()" should "work with gnarly inputs" in { ScorableFeatures.create(title = null).toMapFeatures ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures } + private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug + "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hellothere" + titleToSlug("HELLO:there") shouldBe (Some("hellothere")) } it should "extract an entire colon-less string" in { - titleToSlug("hello THERE") shouldBe "hellothere" + titleToSlug("hello THERE") shouldBe (Some("hellothere")) } it should "return Scorable.NoSlug if given empty string" in { - titleToSlug("") shouldBe Scorable.NoSlug + titleToSlug("") shouldBe (None) } it should "return Scorable.NoSlug if given null" in { - titleToSlug(null) shouldBe Scorable.NoSlug + titleToSlug(null) shouldBe (None) } it should "strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hellothere" - titleToSlug("a:b:cdefgh") shouldBe "abcdefgh" + titleToSlug("HELLO!:the:re") shouldBe Some("hellothere") + titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh") titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" - titleToSlug(":;\"\'") shouldBe Scorable.NoSlug + "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands") + titleToSlug(":;\"\'") shouldBe (None) } it should "filter stub titles" in { - titleToSlug("abstract") shouldBe Scorable.NoSlug - titleToSlug("title!") shouldBe Scorable.NoSlug - titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + titleToSlug("abstract") shouldBe (None) + titleToSlug("title!") shouldBe (None) + titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist") } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug - // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug - // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None) + // TODO: titleToSlug("©™₨№…") shouldBe (None) + // TODO: titleToSlug("πµΣσ") shouldBe (None) } it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" - titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi" - titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug + titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz") + titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi") + titleToSlug("\n \t \r ") shouldBe (None) } it should "skip very short slugs" in { - titleToSlug("short") shouldBe Scorable.NoSlug - titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle" + titleToSlug("short") shouldBe (None) + titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle") } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 32fb16c..c3e4ff9 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -222,15 +222,22 @@ class ScoreJobTest extends FlatSpec with Matchers { } def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { - val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( + GrobidScorable.jsonToMapFeatures( Sha1Strings(grobidIndex), - JsonStrings(grobidIndex)) - val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( - CrossrefStrings(crossrefIndex)) - val score = Scorable.computeSimilarity( - ReduceFeatures(mf1.json), - ReduceFeatures(mf2.json)) - (slug, score, mf1.json, mf2.json) + JsonStrings(grobidIndex)) match { + case None => fail() + case Some(mf1) => { + CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) match { + case None => fail() + case Some(mf2) => { + val score = Scorable.computeSimilarity( + ReduceFeatures(mf1.json), + ReduceFeatures(mf2.json)) + (slug, score, mf1.json, mf2.json) + } + } + } + } } it should "have right output values" in { -- cgit v1.2.3 From 3eb3e38daf4ad58b6da88d7abda222018e4a1ab5 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 28 Aug 2018 08:32:24 -0700 Subject: fixed scalastyle issues, including cyclomatic complexity --- .../main/scala/sandcrawler/CrossrefScorable.scala | 100 +++++++++++---------- .../main/scala/sandcrawler/ScorableFeatures.scala | 10 ++- 2 files changed, 62 insertions(+), 48 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index c13945f..bb6413f 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -55,37 +55,45 @@ object CrossrefScorable { // Returns None if title is null, empty, or too long. def mapToTitle(map : Map[String, Any]) : Option[String] = { - if (map contains "title") { - val titles = map("title").asInstanceOf[List[String]] - if (titles.isEmpty || titles == null) { - None + def getTitle : Option[String] = { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty || titles == null) None else Some(titles(0)) } else { - val baseTitle: String = titles(0) - // TODO(bnewbold): this code block is horrible - val baseSubtitle: String = if (map contains "subtitle") { - val subtitles = map("subtitle").asInstanceOf[List[String]] - if (!subtitles.isEmpty && subtitles != null) { - val sub = subtitles(0) - if (sub != null && !sub.isEmpty && baseTitle != null) { - sub - } else { - "" - } + None + } + } + + def getSubtitle : Option[String] = { + if (map contains "subtitle") { + val subtitles = map("subtitle").asInstanceOf[List[String]] + if (subtitles.isEmpty || subtitles == null) { + None + } else { + val sub = subtitles(0) + if (sub == null || sub.isEmpty) { + None } else { - "" + Some(sub) } - } else { - "" } - val title = if (baseSubtitle.isEmpty) { - baseTitle + } else { + None + } + } + + getTitle match { + case None => None + case Some(baseTitle) => { + if (baseTitle == null) { + None } else { - baseTitle.concat(": ".concat(baseSubtitle)) + getSubtitle match { + case None => Some(baseTitle) + case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle))) + } } - if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) } - } else { - None } } @@ -106,37 +114,39 @@ object CrossrefScorable { case None => None case Some(created) => { Some(created.asInstanceOf[Map[String,Any]] - .get("date-parts") - .get - .asInstanceOf[List[Any]](0) - .asInstanceOf[List[Any]](0) - .asInstanceOf[Double] - .toInt) + .get("date-parts") + .get + .asInstanceOf[List[Any]](0) + .asInstanceOf[List[Any]](0) + .asInstanceOf[Double] + .toInt) } } } def jsonToMapFeatures(json : String) : Option[MapFeatures] = { + def makeMapFeatures(title : String, doi : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = { + if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) { + None + } else { + val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) + sf.toSlug match { + case None => None + case Some(slug) => Some(MapFeatures(slug, sf.toString)) + } + } + } Scorable.jsonToMap(json) match { case None => None case Some(map) => mapToTitle(map) match { case None => None - case Some(title) => { - val doi = Scorable.getString(map, "DOI") - val authors: List[String] = mapToAuthorList(map) - val year: Int = mapToYear(map).getOrElse(0) - val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE") - if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) { - None - } else { - val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) - sf.toSlug match { - case None => None - case Some(slug) => Some(MapFeatures(slug, sf.toString)) - } - } - } + case Some(title) => makeMapFeatures( + title=title, + doi=Scorable.getString(map, "DOI"), + authors=mapToAuthorList(map), + year=mapToYear(map).getOrElse(0), + contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")) } } } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index b56f102..be2b495 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -43,9 +43,13 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") if (slug.isEmpty - || slug == null - || (ScorableFeatures.SlugBlacklist contains slug) - || (slug.length < ScorableFeatures.MinSlugLength)) None else Some(slug) + || slug == null + || (ScorableFeatures.SlugBlacklist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) { + None + } else { + Some(slug) + } } } -- cgit v1.2.3 From 2c9172a3797dd218b5648c5390bf2dfb39d8e3a3 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 28 Aug 2018 08:45:04 -0700 Subject: restored code I inadvertantly removed when merging --- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'scalding/src') diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala index d40410b..3146a6c 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala @@ -41,7 +41,10 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) { GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } .filterNot { entry => entry.isEmpty } - .map { entry => entry.get } + .map { entry => { + validGrobidRows.inc + entry.get + }} .groupBy { case MapFeatures(slug, json) => slug } .map { tuple => val (slug : String, features : MapFeatures) = tuple -- cgit v1.2.3 From a905dc0c606710b12d33e1df2dc6b109812f7276 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 4 Sep 2018 14:45:08 -0700 Subject: minor style improvement --- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 3f6b87c..112a5e5 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -16,11 +16,11 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe (Some("hellothere")) + titleToSlug("HELLO:there") shouldBe Some("hellothere") } it should "extract an entire colon-less string" in { - titleToSlug("hello THERE") shouldBe (Some("hellothere")) + titleToSlug("hello THERE") shouldBe Some("hellothere") } it should "return Scorable.NoSlug if given empty string" in { -- cgit v1.2.3 From 9eb2fca606e17c63406c136f58ed9a079748a4d7 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 4 Sep 2018 14:58:25 -0700 Subject: changed style of ScoreJobTest.bundle --- .../src/test/scala/sandcrawler/ScoreJobTest.scala | 24 +++++++++------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index c3e4ff9..fbc0ee5 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -222,21 +222,17 @@ class ScoreJobTest extends FlatSpec with Matchers { } def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { - GrobidScorable.jsonToMapFeatures( + val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures( Sha1Strings(grobidIndex), - JsonStrings(grobidIndex)) match { - case None => fail() - case Some(mf1) => { - CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) match { - case None => fail() - case Some(mf2) => { - val score = Scorable.computeSimilarity( - ReduceFeatures(mf1.json), - ReduceFeatures(mf2.json)) - (slug, score, mf1.json, mf2.json) - } - } - } + JsonStrings(grobidIndex)) + val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) + if (mfg.isEmpty || mfc.isEmpty) { + fail() + } else { + val score = Scorable.computeSimilarity( + ReduceFeatures(mfg.get.json), + ReduceFeatures(mfc.get.json)) + (slug, score, mfg.get.json, mfc.get.json) } } -- cgit v1.2.3