diff options
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 34 |
1 files changed, 21 insertions, 13 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..d9461e7 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,12 +2,20 @@ package sandcrawler import scala.util.parsing.json.JSONObject +object ScorableFeatures { + def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + year=year, + doi=if (doi == null) "" else doi, + sha1=if (sha1 == null) "" else sha1) + } +} // Contains features needed to make slug and to score (in combination -// with a second ScorableFeatures). -class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - - val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", "article", "authorreply", "authorsreply", "bookreview", "bookreviews", "casereport", "commentary", "commentaryon", "commenton", "commentto", "contents", "correspondence", "dedication", "editorialadvisoryboard", @@ -16,15 +24,15 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S "references", "results", "review", "reviewarticle", "summary", "title", "name") - def toMap() : Map[String, Any] = { - Map("title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) - } + def toMap() : Map[String, Any] = Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) override def toString() : String = { - JSONObject(toMap()).toString + val myMap = toMap() + assert(myMap("title") != null) + assert(myMap("year") != null) + assert(myMap("doi") != null) + assert(myMap("sha1") != null) + JSONObject(myMap).toString } def toSlug() : String = { @@ -34,11 +42,11 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug } } - def toMapFeatures = { + def toMapFeatures : MapFeatures = { MapFeatures(toSlug, toString) } } |