diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 11:53:58 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 11:53:58 -0700 |
commit | 3e33d60aac9db78d0458876fbe987627db222bbb (patch) | |
tree | 05680bd5cfc53348c966f7a03235547a01c5c5d1 /scalding/src/test | |
parent | c4db53036eac90841eb4f970b77db8c1677ef75b (diff) | |
download | sandcrawler-3e33d60aac9db78d0458876fbe987627db222bbb.tar.gz sandcrawler-3e33d60aac9db78d0458876fbe987627db222bbb.zip |
grobidToSlug() seems to work, including parsing of valid JSON strings.
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 73 |
1 files changed, 73 insertions, 0 deletions
// File: scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
// (added as a new file by this commit; blob index 0000000..186bb70)
package sandcrawler

import cascading.tuple.Fields
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode

/**
 * Unit tests for the slug helpers on `HBaseCrossrefScore`:
 * `titleToSlug()` and `grobidToSlug()`.
 */
class HBaseCrossrefScoreTest extends FlatSpec with Matchers {

  /**
   * A small GROBID-style JSON document used as the fixture for the
   * `grobidToSlug()` tests. Its top-level "title" is "Dummy Example File".
   *
   * The literal is triple-quoted, so sequences such as `\n` inside the
   * "body" value are passed through verbatim and interpreted as escapes by
   * the JSON parser, not by Scala. Every JSON string value must therefore
   * stay on a single physical line: a raw line break inside a JSON string
   * is not valid JSON.
   */
  val GrobidString = """
{
  "title": "Dummy Example File",
  "authors": [
    {"name": "Brewster Kahle"},
    {"name": "J Doe"}
  ],
  "journal": {
    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
    "eissn": null,
    "issn": null,
    "issue": null,
    "publisher": null,
    "volume": null
  },
  "date": "2000",
  "doi": null,
  "citations": [
    { "authors": [{"name": "A Seaperson"}],
      "date": "2001",
      "id": "b0",
      "index": 0,
      "issue": null,
      "journal": "Letters in the Alphabet",
      "publisher": null,
      "title": "Everything is Wonderful",
      "url": null,
      "volume": "20"},
    { "authors": [],
      "date": "2011-03-28",
      "id": "b1",
      "index": 1,
      "issue": null,
      "journal": "The Dictionary",
      "publisher": null,
      "title": "All about Facts",
      "url": null,
      "volume": "14"}
  ],
  "abstract": "Everything you ever wanted to know about nothing",
  "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
  "acknowledgement": null,
  "annex": null
}
"""

  /**
   * The same document with no "title" key. `String.replace` rewrites every
   * occurrence of the substring, so the "title" keys nested inside
   * "citations" are renamed to "nottitle" as well, not only the top-level one.
   */
  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")

  "titleToSlug()" should "extract the parts of titles before a colon" in {
    val slug = HBaseCrossrefScore.titleToSlug("hello:there")
    slug shouldBe "hello"
  }

  it should "extract an entire colon-less string" in {
    val slug = HBaseCrossrefScore.titleToSlug("hello there")
    slug shouldBe "hello there"
  }

  "grobidToSlug()" should "get the right slug for a grobid json string" in {
    val slug = HBaseCrossrefScore.grobidToSlug(GrobidString)
    slug shouldBe "Dummy Example File"
  }

  // `it` refers back to the "grobidToSlug()" subject declared just above,
  // matching how the titleToSlug() tests are grouped.
  it should "return empty string for a grobid json string without a title" in {
    val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
    slug shouldBe ""
  }
}