blob: 112a5e56a3052280004d506f2c0973208d6d8a44 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
package sandcrawler
import java.io.InputStream
import scala.io.Source
import org.scalatest._
// scalastyle:off null
/**
 * Behavioural spec for `ScorableFeatures`: feature-map generation and the
 * title-to-slug normalisation exposed through `toSlug`.
 *
 * Every slug expectation goes through [[titleToSlug]], so each case only has
 * to state a raw title and the `Option[String]` it should produce.
 */
class ScorableFeaturesTest extends FlatSpec with Matchers {

  "toMapFeatures()" should "work with gnarly inputs" in {
    // No assertion on the result: the case passes as long as building the
    // feature map from null / odd field values does not throw.
    ScorableFeatures.create(title = null).toMapFeatures
    ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
  }

  /**
   * Builds a `ScorableFeatures` that only carries the given title and returns
   * its slug.
   *
   * @param s raw title; may be null or empty
   * @return the slug computed by `toSlug`; the cases below expect `None`
   *         when no usable slug can be derived
   */
  private def titleToSlug(s: String): Option[String] =
    ScorableFeatures.create(title = s).toSlug

  // The colon is dropped, and the text on both sides of it ends up in the slug.
  "toSlug()" should "drop colons and keep the text on both sides" in {
    titleToSlug("HELLO:there") shouldBe Some("hellothere")
  }

  it should "extract an entire colon-less string" in {
    titleToSlug("hello THERE") shouldBe Some("hellothere")
  }

  it should "return None if given empty string" in {
    titleToSlug("") shouldBe None
  }

  it should "return None if given null" in {
    titleToSlug(null) shouldBe None
  }

  it should "strip punctuation" in {
    titleToSlug("HELLO!:the:re") shouldBe Some("hellothere")
    titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")
    titleToSlug(
      "If you're happy and you know it, clap your hands!"
    ) shouldBe Some("ifyourehappyandyouknowitclapyourhands")
    // A title made only of punctuation leaves nothing to build a slug from.
    titleToSlug(":;\"\'") shouldBe None
  }

  it should "filter stub titles" in {
    // Generic placeholder titles are rejected even after punctuation is removed.
    titleToSlug("abstract") shouldBe None
    titleToSlug("title!") shouldBe None
    titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist")
  }

  it should "strip special characters" in {
    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe None
    // TODO: titleToSlug("©™₨№…") shouldBe (None)
    // TODO: titleToSlug("πµΣσ") shouldBe (None)
  }

  it should "remove whitespace" in {
    titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz")
    titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi")
    titleToSlug("\n \t \r ") shouldBe None
  }

  it should "skip very short slugs" in {
    titleToSlug("short") shouldBe None
    titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")
  }
}
|