diff options
Diffstat (limited to 'scalding/src/main')
| -rw-r--r-- | scalding/src/main/resources/slug-blacklist.txt | 34 | ||||
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 39 | 
2 files changed, 55 insertions, 18 deletions
| diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt new file mode 100644 index 0000000..7dc701f --- /dev/null +++ b/scalding/src/main/resources/slug-blacklist.txt @@ -0,0 +1,34 @@ +abbreviations +abstract +acknowledgements +article +authorreply +authorsreply +bookreview +bookreviews +casereport +commentary +commentaryon +commenton +commentto +contents +correspondence +dedication +editorialadvisoryboard +focus +hypothesis +inbrief +introduction +introductiontotheissue +lettertotheeditor +listofabbreviations +note +overview +preface +references +results +review +reviewarticle +summary +title +name diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..610f1a4 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -1,26 +1,18 @@  package sandcrawler -import scala.util.parsing.json.JSONObject +import java.io.InputStream +import scala.io.Source +import scala.util.parsing.json.JSONObject  // Contains features needed to make slug and to score (in combination  // with a second ScorableFeatures).  class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - -  val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", -    "article", "authorreply", "authorsreply", "bookreview", "bookreviews", -    "casereport", "commentary", "commentaryon", "commenton", "commentto", -    "contents", "correspondence", "dedication", "editorialadvisoryboard", -    "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", -    "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", -    "references", "results", "review", "reviewarticle", "summary", "title", -    "name") - -  def toMap() : Map[String, Any] = { -    Map("title" -> (if (title == null) "" else title), -        "year" -> year, -        "doi" -> (if (doi == null) "" else doi), -        "sha1" -> (if (sha1 == null) "" else sha1)) +  def toMap() : Map[String, Any] = { Map( +    "title" -> (if (title == null) "" else title), +    "year" -> year, +    "doi" -> (if (doi == null) "" else doi), +    "sha1" -> (if (sha1 == null) "" else sha1))    }    override def toString() : String = { @@ -34,11 +26,22 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S        val unaccented = StringUtilities.removeAccents(title)        // Remove punctuation        val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") -      if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug +      // scalastyle:off if.brace +      if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) +        Scorable.NoSlug +      else +        slug      }    } -  def toMapFeatures = { +  def toMapFeatures : MapFeatures = {      MapFeatures(toSlug, toString)    }  } + +object ScorableFeatures { +  // TODO: Add exception handling. +  val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") +  val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet +  fileStream.close +} | 
