diff options
Diffstat (limited to 'scalding/src')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 13 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 6 |
2 files changed, 18 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 966fb93..696b2ef 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,9 +2,20 @@ package sandcrawler import scala.util.parsing.json.JSONObject + // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + + val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", + "article", "authorreply", "authorsreply", "bookreview", "bookreviews", + "casereport", "commentary", "commentaryon", "commenton", "commentto", + "contents", "correspondence", "dedication", "editorialadvisoryboard", + "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", + "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", + "references", "results", "review", "reviewarticle", "summary", "title", + "name") + def toMap() : Map[String, Any] = { Map("title" -> (if (title == null) "" else title), "year" -> year, @@ -23,7 +34,7 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation after splitting on colon. val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index fd01c91..0acf0b8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { titleToSlug(":;\"\'") shouldBe Scorable.NoSlug } + it should "filter stub titles" in { + titleToSlug("abstract") shouldBe Scorable.NoSlug + titleToSlug("title!") shouldBe Scorable.NoSlug + titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + } + it should "strip special characters" in { titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug |