package sandcrawler

import cascading.tuple.Fields
import cascading.tuple.Tuple
import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode

/** Unit tests for helper methods on the `Scorable` object.
  *
  * Covers `titleToSlug`, `jsonToMap` and `computeSimilarity`, using a small
  * hand-written JSON document as the shared fixture.
  */
class ScorableTest extends FlatSpec with Matchers {

  // JSON fixture shared by the tests below. The literal is kept exactly as it
  // appears in the file (including its internal whitespace) so that the parsed
  // content and MalformedJsonString are unchanged.
  // NOTE(review): the "body" value contains a raw line break as rendered here;
  // strict JSON does not allow unescaped line breaks inside strings -- confirm
  // this is intentional and that the "legal JSON" test still parses it.
  val JsonString = """ { "title": "<>", "authors": [ {"name": "Brewster Kahle"}, {"name": "J Doe"} ], "journal": { "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", "eissn": null, "issn": null, "issue": null, "publisher": null, "volume": null }, "date": "2000", "doi": null, "citations": [ { "authors": [{"name": "A Seaperson"}], "date": "2001", "id": "b0", "index": 0, "issue": null, "journal": "Letters in the Alphabet", "publisher": null, "title": "Everything is Wonderful", "url": null, "volume": "20"}, { "authors": [], "date": "2011-03-28", "id": "b1", "index": 1, "issue": null, "journal": "The Dictionary", "publisher": null, "title": "All about Facts", "url": null, "volume": "14"} ], "abstract": "Everything you ever wanted to know about nothing", "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. 
\n Potatos \nQED.", "acknowledgement": null, "annex": null } """

  // JsonString with every closing brace stripped. Not referenced by any active
  // test in this class; kept as a public member for compatibility.
  val MalformedJsonString = JsonString.replace("}", "")

  "titleToSlug()" should "extract the parts of titles before a colon" in {
    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
  }

  it should "extract an entire colon-less string" in {
    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
  }

  it should "return Scorable.NoSlug if given empty string" in {
    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
  }

  "jsonToMap()" should "return a map, given a legal JSON string" in {
    Scorable.jsonToMap(JsonString) should not be (None)
  }

  it should "return None, given illegal JSON" in {
    Scorable.jsonToMap("illegal{,json{{") should be (None)
  }

  // The subject label names the method actually exercised here
  // (computeSimilarity), so test reports point at the right code.
  "computeSimilarity()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
    val score = Scorable.computeSimilarity(
      new ReduceFeatures(JsonString),
      new ReduceFeatures(JsonString))
    score shouldBe Scorable.MaxScore
  }

  // Disabled tests. They reference fixtures (MalformedGrobidString,
  // CrossrefStringWithTitle, CrossrefStringWithoutTitle, MalformedCrossrefString)
  // that are not defined in this class, so they cannot simply be re-enabled.
  /*
  it should "return None if given a malformed json string" in {
    val slug = Scorable.grobidToSlug(MalformedGrobidString)
    slug shouldBe None
  }

  it should "return None if given an empty json string" in {
    val slug = Scorable.grobidToSlug("")
    slug shouldBe None
  }

  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
    val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
    slug should contain ("sometitle")
  }

  it should "return None if given json string without title" in {
    val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
    slug shouldBe None
  }

  it should "return None if given a malformed json string" in {
    val slug = Scorable.grobidToSlug(MalformedCrossrefString)
    slug shouldBe None
  }
  */
}