diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-12 18:08:51 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-12 18:08:51 -0700 | 
| commit | 31354b1a6062c5c56a30610f68fa48c82a7e83f0 (patch) | |
| tree | a730150c3f29ea76579ee6928a7c2db9e5b22eac /scalding/src/test/scala | |
| parent | 728e50a33cec921c9a624439f2e1c8561a6e12ce (diff) | |
| download | sandcrawler-31354b1a6062c5c56a30610f68fa48c82a7e83f0.tar.gz sandcrawler-31354b1a6062c5c56a30610f68fa48c82a7e83f0.zip | |
Tests pass.
Diffstat (limited to 'scalding/src/test/scala')
3 files changed, 33 insertions, 104 deletions
| diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala deleted file mode 100644 index 1c35d66..0000000 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ /dev/null @@ -1,89 +0,0 @@ -package sandcrawler - -import cascading.tuple.Fields -import cascading.tuple.Tuple -import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.util.Bytes -import org.scalatest._ -import parallelai.spyglass.hbase.HBaseConstants.SourceMode - -class CrossrefScorableTest extends FlatSpec with Matchers { -  val CrossrefString = -""" -{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },  -  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],  -    "date-time" : "2017-10-23T17:19:16Z",  -    "timestamp" : { "$numberLong" : "1508779156477" } },  -  "reference-count" : 0,  -  "publisher" : "Elsevier BV",  -  "issue" : "3",  -  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",  -                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],  -                                "date-time" : "1996-01-01T00:00:00Z",  -                                "timestamp" : { "$numberLong" : "820454400000" } },  -                                "delay-in-days" : 0, "content-version" : "tdm" }], -  "content-domain" : { "domain" : [], "crossmark-restriction" : false },  -  "published-print" : { "date-parts" : [ [ 1996 ] ] },  -  "DOI" : "<<DOI>>", -  "type" : "journal-article",  -  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],  -    "date-time" : "2002-07-25T15:09:41Z",  -    "timestamp" : { "$numberLong" : "1027609781000" } },  -  "page" : "186-187",  -  "source" : "Crossref",  -  "is-referenced-by-count" : 0,  -  "title" : [ "<<TITLE>>" ], -  "prefix" : "10.1016",  -  "volume" : "9",  -  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],  -  "member" : "78",  -  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],  -  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", -               "content-type" : "text/xml",  -                 "content-version" : "vor", -                 "intended-application" : "text-mining" },  -               { "URL" : -  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", -                 "content-type" : "text/plain",  -                 "content-version" : "vor", -                 "intended-application" : "text-mining" } ],  -  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],  -                  "date-time" : "2015-09-03T10:03:43Z",  -                  "timestamp" : { "$numberLong" : "1441274623000" } },  -  "score" : 1,  -  "issued" : { "date-parts" : [ [ 1996 ] ] },  -  "references-count" : 0,  -  "alternative-id" : [ "0987-7983(96)87729-2" ],  -  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",  -  "ISSN" : [ "0987-7983" ],  -  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],  -  "subject" : [ "Pediatrics, Perinatology, and Child Health" ] -} -""" -  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") -  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") -  val MalformedCrossrefString = CrossrefString.replace("}", "") - -  // Unit tests -  "simplifyJson()" should "return None for bad JSON" in { -    CrossrefScorable.simplifyJson("") shouldBe None -    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None -  } - -  it should "return None for JSON lacking title" in { -    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None -  } - -  it should "return appropriate result for valid JSON" in { -    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { -      case None => fail("None unexpectedly returned by simplifyJson") -      case Some(map) => { -        Scorable.isScorableMap(map) shouldBe true -        map.size shouldBe 1 -        map.keys should contain ("title") -        map("title") shouldBe "SomeTitle" -      } -    } -  } -} diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 5bb955a..3fcd856 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers {    val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")    val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")    val MalformedGrobidString = GrobidString.replace("}", "") +  val Key = "Dummy Key"    // Unit tests    "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { -    val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None +    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)       result.slug shouldBe Scorable.NoSlug -    result.json shouldBe MalformedGrobidString    } -  "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in { -    val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None +  it should "handle missing title" in { +    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)      result.slug shouldBe Scorable.NoSlug -    result.json shouldBe GrobidStringWithoutTitle +  } + +  it should "handle valid input" in { +    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle) +    result.slug shouldBe "dummyexamplefile" +    Scorable.jsonToMap(result.json) match { +      case None => fail() +      case Some(map) => { +        map("title").asInstanceOf[String] shouldBe "Dummy Example File" +      } +    }    }  } diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 2f80492..95faacc 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers {    "annex": null  }  """ +  private def titleToSlug(s : String) : String = { +    Scorable.mapToSlug(Scorable.toScorableMap(title = s)) +  } -  "titleToSlug()" should "extract the parts of titles before a colon" in { -    Scorable.titleToSlug("HELLO:there") shouldBe "hello" +  "mapToSlug()" should "extract the parts of titles before a colon" in { +    titleToSlug("HELLO:there") shouldBe "hello"    }    it should "extract an entire colon-less string" in { -    Scorable.titleToSlug("hello THERE") shouldBe "hello there" +    titleToSlug("hello THERE") shouldBe "hellothere"    }    it should "return Scorable.NoSlug if given empty string" in { -    Scorable.titleToSlug("") shouldBe Scorable.NoSlug +    titleToSlug("") shouldBe Scorable.NoSlug    }    it should "return Scorable.NoSlug if given null" in { -    Scorable.titleToSlug(null) shouldBe Scorable.NoSlug +    titleToSlug(null) shouldBe Scorable.NoSlug +  } + +  it should "strip punctuation" in { +    titleToSlug("HELLO!:the:re") shouldBe "hello" +    titleToSlug("a:b:c") shouldBe "a" +    titleToSlug( +      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"    } -  "titleToSlug()" should "strip punctuation" in { -    Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" -    Scorable.titleToSlug("a:b:c") shouldBe "a" -    Scorable.titleToSlug( -      "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" +  it should "remove whitespace" in { +    titleToSlug("foo bar : baz ::") shouldBe "foobar" +    titleToSlug("\na\t:b:c") shouldBe "a"    }    "jsonToMap()" should "return a map, given a legal JSON string" in { | 
