diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 11:14:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 11:14:22 -0700 |
commit | 96f45740599a9d9ed06cf22eba8cd0a0e0927a42 (patch) | |
tree | f9064ccd26c02a813b128e36827418a1f5840ba3 /scalding/src/main/scala | |
parent | 34fa226b27a8597ae1da788a41be2880b1cbf4fc (diff) | |
parent | f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20 (diff) | |
download | sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.tar.gz sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.zip |
Merge branch 'strings'
Resolved conflicts in:
scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 18 |
1 file changed, 9 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index e71abfa..0b9868a 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -1,8 +1,16 @@ package sandcrawler +import java.io.InputStream + +import scala.io.Source import scala.util.parsing.json.JSONObject object ScorableFeatures { + // TODO: Add exception handling. + val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") + val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet + fileStream.close + // Static factory method def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( @@ -16,14 +24,6 @@ object ScorableFeatures { // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). Create with above static factory method. 
class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", - "article", "authorreply", "authorsreply", "bookreview", "bookreviews", - "casereport", "commentary", "commentaryon", "commenton", "commentto", - "contents", "correspondence", "dedication", "editorialadvisoryboard", - "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", - "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", - "references", "results", "review", "reviewarticle", "summary", "title", - "name") def toMap() : Map[String, Any] = Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) @@ -38,7 +38,7 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug } } |