From 03663873fbc556f670cc695f90a2b74bd2bc72de Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Mon, 20 Aug 2018 14:06:19 -0700 Subject: Created static factory method for ScorableCreations to deal with null. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 2 +- .../main/scala/sandcrawler/GrobidScorable.scala | 2 +- .../main/scala/sandcrawler/ScorableFeatures.scala | 34 +++++++++++++--------- 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'scalding/src/main') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ff8201a..5d1eaf5 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -35,7 +35,7 @@ object CrossrefScorable { new MapFeatures(Scorable.NoSlug, json) } else { // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } } else { diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 9a09e05..d7a1eea 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -45,7 +45,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures + ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..d9461e7 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,12 +2,20 @@ package sandcrawler import scala.util.parsing.json.JSONObject +object ScorableFeatures { + def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + year=year, + doi=if (doi == null) "" else doi, + sha1=if (sha1 == null) "" else sha1) + } +} // Contains features needed to make slug and to score (in combination -// with a second ScorableFeatures). -class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - - val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", "article", "authorreply", "authorsreply", "bookreview", "bookreviews", "casereport", "commentary", "commentaryon", "commenton", "commentto", "contents", "correspondence", "dedication", "editorialadvisoryboard", @@ -16,15 +24,15 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S "references", "results", "review", "reviewarticle", "summary", "title", "name") - def toMap() : Map[String, Any] = { - Map("title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) - } + def toMap() : Map[String, Any] = Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) override def toString() : String = { - JSONObject(toMap()).toString + val myMap = toMap() + assert(myMap("title") != null) + assert(myMap("year") != null) + assert(myMap("doi") != null) + assert(myMap("sha1") != null) + JSONObject(myMap).toString } def toSlug() : String = { @@ -34,11 +42,11 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug } } - def toMapFeatures = { + def toMapFeatures : MapFeatures = { MapFeatures(toSlug, toString) } } -- cgit v1.2.3 From 4f4571bbc1717c5ad9740377fdb8297da6632639 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Mon, 20 Aug 2018 14:32:28 -0700 Subject: Removed debugging code, fixed style warnings. --- .../src/main/scala/sandcrawler/ScorableFeatures.scala | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'scalding/src/main') diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index d9461e7..e71abfa 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -3,6 +3,7 @@ package sandcrawler import scala.util.parsing.json.JSONObject object ScorableFeatures { + // Static factory method def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( title=if (title == null) "" else title, @@ -24,16 +25,11 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", "references", "results", "review", "reviewarticle", "summary", "title", "name") - def toMap() : Map[String, Any] = Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + def toMap() : Map[String, Any] = + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = { - val myMap = toMap() - assert(myMap("title") != null) - assert(myMap("year") != null) - assert(myMap("doi") != null) - assert(myMap("sha1") != null) - JSONObject(myMap).toString - } + override def toString() : String = + JSONObject(toMap).toString def toSlug() : String = { if (title == null) { @@ -46,7 +42,6 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", } } - def toMapFeatures : MapFeatures = { + def toMapFeatures : MapFeatures = MapFeatures(toSlug, toString) - } } -- cgit v1.2.3