diff options
Diffstat (limited to 'scalding/src/main')
3 files changed, 27 insertions, 27 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index ff8201a..5d1eaf5 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -35,7 +35,7 @@ object CrossrefScorable {
             new MapFeatures(Scorable.NoSlug, json)
           } else {
             // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
-            val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
+            val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
             new MapFeatures(sf.toSlug, sf.toString)
           }
         } else {
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 9a09e05..d7a1eea 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -45,7 +45,7 @@ object GrobidScorable {
       case None => MapFeatures(Scorable.NoSlug, json)
       case Some(map) => {
         if (map contains "title") {
-          new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
+          ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures
         } else {
           MapFeatures(Scorable.NoSlug, json)
         }
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 610f1a4..0b9868a 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -5,19 +5,31 @@ import java.io.InputStream
 import scala.io.Source
 import scala.util.parsing.json.JSONObject
 
-// Contains features needed to make slug and to score (in combination
-// with a second ScorableFeatures).
-class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
-  def toMap() : Map[String, Any] = { Map(
-    "title" -> (if (title == null) "" else title),
-    "year" -> year,
-    "doi" -> (if (doi == null) "" else doi),
-    "sha1" -> (if (sha1 == null) "" else sha1))
-  }
+object ScorableFeatures {
+  // TODO: Add exception handling.
+  val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
+  val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
+  fileStream.close
 
-  override def toString() : String = {
-    JSONObject(toMap()).toString
+  // Static factory method
+  def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+    new ScorableFeatures(
+      title=if (title == null) "" else title,
+      year=year,
+      doi=if (doi == null) "" else doi,
+      sha1=if (sha1 == null) "" else sha1)
   }
+}
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures). Create with above static factory method.
+class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+
+  def toMap() : Map[String, Any] =
+    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+
+  override def toString() : String =
+    JSONObject(toMap).toString
 
   def toSlug() : String = {
     if (title == null) {
@@ -26,22 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
       val unaccented = StringUtilities.removeAccents(title)
       // Remove punctuation
       val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
-      // scalastyle:off if.brace
-      if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug))
-        Scorable.NoSlug
-      else
-        slug
+      if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
     }
   }
 
-  def toMapFeatures : MapFeatures = {
+  def toMapFeatures : MapFeatures =
     MapFeatures(toSlug, toString)
-  }
-}
-
-object ScorableFeatures {
-  // TODO: Add exception handling.
-  val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
-  val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
-  fileStream.close
 }
