aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:33:09 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:33:09 -0700
commit 3f668933d71b82555e89a3bfefe83039ff7ddbfb (patch)
tree 619ec9976ccde6594cd461e0acee3c34f173ed6b /scalding
parent 70350899dda973cdf7a5cfdd941ae80319254587 (diff)
download sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.tar.gz
download sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.zip
add a stub title blacklist
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 13
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 6
2 files changed, 18 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 966fb93..696b2ef 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -2,9 +2,20 @@ package sandcrawler
import scala.util.parsing.json.JSONObject
+
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures).
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+
+ val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
+ "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
+ "casereport", "commentary", "commentaryon", "commenton", "commentto",
+ "contents", "correspondence", "dedication", "editorialadvisoryboard",
+ "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
+ "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
+ "references", "results", "review", "reviewarticle", "summary", "title",
+ "name")
+
def toMap() : Map[String, Any] = {
Map("title" -> (if (title == null) "" else title),
"year" -> year,
@@ -23,7 +34,7 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation after splitting on colon.
val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null) Scorable.NoSlug else slug
+ if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index fd01c91..0acf0b8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
}
+ it should "filter stub titles" in {
+ titleToSlug("abstract") shouldBe Scorable.NoSlug
+ titleToSlug("title!") shouldBe Scorable.NoSlug
+ titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+ }
+
it should "strip special characters" in {
titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug