diff options
Diffstat (limited to 'scalding')
4 files changed, 85 insertions, 77 deletions
| diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index c0d1cb0..8302b8f 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -79,59 +79,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {    // Unit tests    "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { -    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) -    result.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None)    }    it should "handle missing title" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) -    result.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None)    }    it should "handle null title" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) -    result.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None)    }    it should "handle empty title" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) -    result.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None)    }    it should "handle subtitle" in { -    val result = CrossrefScorable.jsonToMapFeatures( -      """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") -    result.slug shouldBe "shortbutnottooshortjustright" +    CrossrefScorable.jsonToMapFeatures( +      """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match { +      case None => fail() +      case Some(result) => result.slug shouldBe "shortbutnottooshortjustright" +    }    }    it should "handle empty subtitle" in { -    val result = CrossrefScorable.jsonToMapFeatures( -      """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") -    result.slug shouldBe "shortbutnottooshort" +    CrossrefScorable.jsonToMapFeatures( +      """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { +      case None => fail() +      case Some(result) => result.slug shouldBe "shortbutnottooshort" +    }    }    it should "handle null subtitle" in { -    val result = CrossrefScorable.jsonToMapFeatures( -      """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") -    result.slug shouldBe "shortbutnottooshort" +    CrossrefScorable.jsonToMapFeatures( +      """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { +      case None => fail() +      case Some(result) => result.slug shouldBe "shortbutnottooshort" +    }    }    it should "handle missing authors" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) -    result.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None)    }    it should "handle valid input" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) -    result.slug shouldBe "sometitle" -    Scorable.jsonToMap(result.json) match { +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match {        case None => fail() -      case Some(map) => { -        map("title").asInstanceOf[String] shouldBe "Some Title" -        map("doi").asInstanceOf[String] shouldBe "10.123/abc" -        // TODO: full name? not just a string? -        map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") -        map("year").asInstanceOf[Double].toInt shouldBe 2002 +      case Some(result) => { +        result.slug shouldBe "sometitle" +        Scorable.jsonToMap(result.json) match { +          case None => fail() +          case Some(map) => { +            map("title").asInstanceOf[String] shouldBe "Some Title" +            map("doi").asInstanceOf[String] shouldBe "10.123/abc" +            // TODO: full name? not just a string? +            map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") +            map("year").asInstanceOf[Double].toInt shouldBe 2002 +          } +        }        }      }    } @@ -161,9 +166,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {    }    it should "handle content types" in { -    val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) -    resultWrong.slug shouldBe Scorable.NoSlug -    val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) -    resultMissing.slug shouldBe Scorable.NoSlug +    CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None) +    CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None)    }  } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 119cf90..b395a64 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -68,29 +68,30 @@ class GrobidScorableTest extends FlatSpec with Matchers {    // Unit tests    "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { -    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) -    result.slug shouldBe Scorable.NoSlug +    GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None)    }    it should "handle null title" in { -    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) -    result.slug shouldBe Scorable.NoSlug +    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None)    }    it should "handle missing title" in { -    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) -    result.slug shouldBe Scorable.NoSlug +    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None)    }    it should "handle valid input" in { -    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) -    result.slug shouldBe "dummyexamplefile" -    Scorable.jsonToMap(result.json) match { +    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match {        case None => fail() -      case Some(map) => { -        map should contain key "title" -        map("title").asInstanceOf[String] shouldBe "Dummy Example File" -        map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") +      case Some(result) => { +        result.slug shouldBe "dummyexamplefile" +        Scorable.jsonToMap(result.json) match { +          case None => fail() +          case Some(map) => { +            map should contain key "title" +            map("title").asInstanceOf[String] shouldBe "Dummy Example File" +            map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") +          } +        }        }      }    } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 450c169..3f6b87c 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -8,60 +8,57 @@ import org.scalatest._  // scalastyle:off null  class ScorableFeaturesTest extends FlatSpec with Matchers { - -  private def titleToSlug(s : String) : String = { -    ScorableFeatures.create(title = s).toSlug -  } -    "toMapFeatures()" should "work with gnarly inputs" in {      ScorableFeatures.create(title = null).toMapFeatures      ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures    } +  private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug +    "mapToSlug()" should "extract the parts of titles before a colon" in { -    titleToSlug("HELLO:there") shouldBe "hellothere" +    titleToSlug("HELLO:there") shouldBe (Some("hellothere"))    }    it should "extract an entire colon-less string" in { -    titleToSlug("hello THERE") shouldBe "hellothere" +    titleToSlug("hello THERE") shouldBe (Some("hellothere"))    }    it should "return Scorable.NoSlug if given empty string" in { -    titleToSlug("") shouldBe Scorable.NoSlug +    titleToSlug("") shouldBe (None)    }    it should "return Scorable.NoSlug if given null" in { -    titleToSlug(null) shouldBe Scorable.NoSlug +    titleToSlug(null) shouldBe (None)    }    it should "strip punctuation" in { -    titleToSlug("HELLO!:the:re") shouldBe "hellothere" -    titleToSlug("a:b:cdefgh") shouldBe "abcdefgh" +    titleToSlug("HELLO!:the:re") shouldBe Some("hellothere") +    titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")      titleToSlug( -      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" -    titleToSlug(":;\"\'") shouldBe Scorable.NoSlug +      "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands") +    titleToSlug(":;\"\'") shouldBe (None)    }    it should "filter stub titles" in { -    titleToSlug("abstract") shouldBe Scorable.NoSlug -    titleToSlug("title!") shouldBe Scorable.NoSlug -    titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" +    titleToSlug("abstract") shouldBe (None) +    titleToSlug("title!") shouldBe (None) +    titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist")    }    it should "strip special characters" in { -    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug -    // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug -    // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug +    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None) +    // TODO: titleToSlug("©™₨№…") shouldBe (None) +    // TODO: titleToSlug("πµΣσ") shouldBe (None)    }    it should "remove whitespace" in { -    titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" -    titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi" -    titleToSlug("\n \t \r  ") shouldBe Scorable.NoSlug +    titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz") +    titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi") +    titleToSlug("\n \t \r  ") shouldBe (None)    }    it should "skip very short slugs" in { -    titleToSlug("short") shouldBe Scorable.NoSlug -    titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle" +    titleToSlug("short") shouldBe (None) +    titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")    }  } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 32fb16c..c3e4ff9 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -222,15 +222,22 @@ class ScoreJobTest extends FlatSpec with Matchers {        }        def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { -        val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( +        GrobidScorable.jsonToMapFeatures(            Sha1Strings(grobidIndex), -          JsonStrings(grobidIndex)) -        val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( -          CrossrefStrings(crossrefIndex)) -        val score = Scorable.computeSimilarity( -          ReduceFeatures(mf1.json), -          ReduceFeatures(mf2.json)) -        (slug, score, mf1.json, mf2.json) +          JsonStrings(grobidIndex)) match { +          case None => fail() +          case Some(mf1) => { +            CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) match { +              case None => fail() +              case Some(mf2) => { +                val score = Scorable.computeSimilarity( +                  ReduceFeatures(mf1.json), +                  ReduceFeatures(mf2.json)) +                (slug, score, mf1.json, mf2.json) +              } +            } +          } +        }        }        it should "have right output values" in { | 
