aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
blob: 5a22ef8216c5d00ecf3f947064a6610649921192 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
package sandcrawler

import java.io.InputStream

import scala.io.Source

import org.scalatest._

// scalastyle:off null
/** Unit tests for `ScorableFeatures`: the slug blacklist, `toMapFeatures`, and `toSlug`.
  *
  * Several cases pass `null` on purpose to check that null inputs are tolerated.
  */
class ScorableFeaturesTest extends FlatSpec with Matchers {

  // TODO: Remove this when we're convinced that our file-reading code
  // works. (I'm already convinced. --Ellen)
  "read slugs" should "work" in {
    // Expected blacklist entries, spelled out by hand so they can be compared
    // against whatever ScorableFeatures exposes as SlugBlacklist.
    val expectedSlugs = Set(
      "abbreviations", "abstract", "acknowledgements",
      "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
      "casereport", "commentary", "commentaryon", "commenton", "commentto",
      "contents", "correspondence", "dedication", "editorialadvisoryboard",
      "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
      "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
      "references", "results", "review", "reviewarticle", "summary", "title",
      "name")

    // Same size plus "every actual entry is expected" implies identical
    // contents, provided the actual blacklist holds no duplicate entries.
    ScorableFeatures.SlugBlacklist.size shouldBe expectedSlugs.size
    for (slug <- ScorableFeatures.SlugBlacklist) expectedSlugs should contain (slug)
  }

  /** Builds a `ScorableFeatures` from only a title and returns its slug.
    *
    * @param s the raw title; may be empty or `null`
    * @return the result of `toSlug` for that title
    */
  private def titleToSlug(s: String): String =
    ScorableFeatures.create(title = s).toSlug

  "toMapFeatures()" should "work with gnarly inputs" in {
    // Only the absence of an exception is being checked here; the returned
    // values are deliberately ignored.
    noException should be thrownBy {
      ScorableFeatures.create(title = null).toMapFeatures
      ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
    }
  }

  "toSlug()" should "lowercase the title and drop colons, keeping the text on both sides" in {
    titleToSlug("HELLO:there") shouldBe "hellothere"
  }

  it should "extract an entire colon-less string" in {
    titleToSlug("hello THERE") shouldBe "hellothere"
  }

  it should "return Scorable.NoSlug if given empty string" in {
    titleToSlug("") shouldBe Scorable.NoSlug
  }

  it should "return Scorable.NoSlug if given null" in {
    titleToSlug(null) shouldBe Scorable.NoSlug
  }

  it should "strip punctuation" in {
    titleToSlug("HELLO!:the:re") shouldBe "hellothere"
    titleToSlug("a:b:c") shouldBe "abc"
    titleToSlug(
      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
    // A title made up solely of punctuation leaves nothing to build a slug from.
    titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
  }

  it should "filter stub titles" in {
    // "abstract" and "title" both appear in the blacklist checked by "read slugs".
    titleToSlug("abstract") shouldBe Scorable.NoSlug
    titleToSlug("title!") shouldBe Scorable.NoSlug
    titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
  }

  it should "strip special characters" in {
    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
    // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
    // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
  }

  it should "remove whitespace" in {
    titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
    titleToSlug("\na\t:b:c") shouldBe "abc"
    titleToSlug("\n \t \r  ") shouldBe Scorable.NoSlug
  }
}