aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2018-08-16 21:09:54 +0000
committerbnewbold <bnewbold@archive.org>2018-08-16 21:09:54 +0000
commitaf0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (patch)
treed32953c30c5f7342672694708b385936e5a36dfe /scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
parent71be2e685848a31888811e2e398e769f7e0486c2 (diff)
parent96ea0ddd06ee4a7c11c7d5def976749ab3675878 (diff)
downloadsandcrawler-af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6.tar.gz
sandcrawler-af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6.zip
Merge branch 'bnewbold-scoring-patches' into 'master'
Patches on top of scoring-refactor branch (Crossref/GROBID matching work). See merge request webgroup/sandcrawler!15
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala')
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala57
1 files changed, 57 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
new file mode 100644
index 0000000..80d92aa
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -0,0 +1,57 @@
+package sandcrawler
+
+import org.scalatest._
+
+// scalastyle:off null
+/**
+ * Tests for `ScorableFeatures`: `toMapFeatures` with null or unusual field
+ * values, and `toSlug` title normalization.
+ *
+ * The slug assertions below pin the observable behavior: output is lowercased,
+ * punctuation and whitespace are removed, and empty, null, punctuation-only,
+ * or stub titles yield `Scorable.NoSlug`.
+ */
+class ScorableFeaturesTest extends FlatSpec with Matchers {
+  /** Builds a `ScorableFeatures` from only a title and returns its slug. */
+  private def titleToSlug(s: String): String =
+    new ScorableFeatures(title = s).toSlug
+
+  "toMapFeatures()" should "work with gnarly inputs" in {
+    // Smoke test: the only expectation is that null / odd fields do not throw.
+    noException should be thrownBy {
+      new ScorableFeatures(title = null).toMapFeatures
+      new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
+    }
+  }
+
+  // The colon is dropped but the text on both sides of it is kept.
+  "toSlug()" should "drop colons and keep the text on both sides" in {
+    titleToSlug("HELLO:there") shouldBe "hellothere"
+  }
+
+  it should "extract an entire colon-less string" in {
+    titleToSlug("hello THERE") shouldBe "hellothere"
+  }
+
+  it should "return Scorable.NoSlug if given empty string" in {
+    titleToSlug("") shouldBe Scorable.NoSlug
+  }
+
+  it should "return Scorable.NoSlug if given null" in {
+    titleToSlug(null) shouldBe Scorable.NoSlug
+  }
+
+  it should "strip punctuation" in {
+    titleToSlug("HELLO!:the:re") shouldBe "hellothere"
+    titleToSlug("a:b:c") shouldBe "abc"
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
+    // A title made up solely of punctuation has nothing left to slug.
+    titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
+  }
+
+  it should "filter stub titles" in {
+    titleToSlug("abstract") shouldBe Scorable.NoSlug
+    // Punctuation does not stop a stub title from being filtered.
+    titleToSlug("title!") shouldBe Scorable.NoSlug
+    titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+  }
+
+  it should "strip special characters" in {
+    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+    // Non-ASCII symbols and Greek letters are not asserted yet; see the TODOs.
+    // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
+    // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
+  }
+
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
+    titleToSlug("\na\t:b:c") shouldBe "abc"
+    // Whitespace-only input is treated like an empty title.
+    titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
+  }
+}