about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-20 15:16:43 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-20 15:16:43 -0700
commit: f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20 (patch)
tree: 454c1becaba40ec9ea6d5c8a349dee050d3ed03d /scalding/src/main
parent: af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (diff)
download: sandcrawler-f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20.tar.gz
download: sandcrawler-f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20.zip
Reads blacklist from file.
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/resources/slug-blacklist.txt 34
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 39
2 files changed, 55 insertions, 18 deletions
diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt
new file mode 100644
index 0000000..7dc701f
--- /dev/null
+++ b/scalding/src/main/resources/slug-blacklist.txt
@@ -0,0 +1,34 @@
+abbreviations
+abstract
+acknowledgements
+article
+authorreply
+authorsreply
+bookreview
+bookreviews
+casereport
+commentary
+commentaryon
+commenton
+commentto
+contents
+correspondence
+dedication
+editorialadvisoryboard
+focus
+hypothesis
+inbrief
+introduction
+introductiontotheissue
+lettertotheeditor
+listofabbreviations
+note
+overview
+preface
+references
+results
+review
+reviewarticle
+summary
+title
+name
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 8ed3369..610f1a4 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -1,26 +1,18 @@
package sandcrawler
-import scala.util.parsing.json.JSONObject
+import java.io.InputStream
+import scala.io.Source
+import scala.util.parsing.json.JSONObject
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures).
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
-
- val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
- "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
- "casereport", "commentary", "commentaryon", "commenton", "commentto",
- "contents", "correspondence", "dedication", "editorialadvisoryboard",
- "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
- "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
- "references", "results", "review", "reviewarticle", "summary", "title",
- "name")
-
- def toMap() : Map[String, Any] = {
- Map("title" -> (if (title == null) "" else title),
- "year" -> year,
- "doi" -> (if (doi == null) "" else doi),
- "sha1" -> (if (sha1 == null) "" else sha1))
+ def toMap() : Map[String, Any] = { Map(
+ "title" -> (if (title == null) "" else title),
+ "year" -> year,
+ "doi" -> (if (doi == null) "" else doi),
+ "sha1" -> (if (sha1 == null) "" else sha1))
}
override def toString() : String = {
@@ -34,11 +26,22 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
+ // scalastyle:off if.brace
+ if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug))
+ Scorable.NoSlug
+ else
+ slug
}
}
- def toMapFeatures = {
+ def toMapFeatures : MapFeatures = {
MapFeatures(toSlug, toString)
}
}
+
+object ScorableFeatures {
+ // TODO: Add exception handling.
+ val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
+ val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
+ fileStream.close
+}