diff options
Diffstat (limited to 'scalding/src/main/scala')
3 files changed, 27 insertions, 27 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ff8201a..5d1eaf5 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -35,7 +35,7 @@ object CrossrefScorable { new MapFeatures(Scorable.NoSlug, json) } else { // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } } else { diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 9a09e05..d7a1eea 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -45,7 +45,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures + ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 610f1a4..0b9868a 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -5,19 +5,31 @@ import java.io.InputStream import scala.io.Source import scala.util.parsing.json.JSONObject -// Contains features needed to make slug and to score (in combination -// with a second ScorableFeatures). -class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - def toMap() : Map[String, Any] = { Map( - "title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) - } +object ScorableFeatures { + // TODO: Add exception handling. + val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") + val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet + fileStream.close - override def toString() : String = { - JSONObject(toMap()).toString + // Static factory method + def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + year=year, + doi=if (doi == null) "" else doi, + sha1=if (sha1 == null) "" else sha1) } +} + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + + def toMap() : Map[String, Any] = + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + + override def toString() : String = + JSONObject(toMap).toString def toSlug() : String = { if (title == null) { @@ -26,22 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - // scalastyle:off if.brace - if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) - Scorable.NoSlug - else - slug + if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug } } - def toMapFeatures : MapFeatures = { + def toMapFeatures : MapFeatures = MapFeatures(toSlug, toString) - } -} - -object ScorableFeatures { - // TODO: Add exception handling. - val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") - val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet - fileStream.close } |