about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/test
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala 293
1 files changed, 0 insertions, 293 deletions
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
deleted file mode 100644
index ebe7dc0..0000000
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ /dev/null
@@ -1,293 +0,0 @@
-package sandcrawler
-
-import cascading.tuple.Fields
-import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import org.scalatest._
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-
-class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
-/*
- val GrobidString = """
-{
- "title": "<<TITLE>>",
- "authors": [
- {"name": "Brewster Kahle"},
- {"name": "J Doe"}
- ],
- "journal": {
- "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
- "eissn": null,
- "issn": null,
- "issue": null,
- "publisher": null,
- "volume": null
- },
- "date": "2000",
- "doi": null,
- "citations": [
- { "authors": [{"name": "A Seaperson"}],
- "date": "2001",
- "id": "b0",
- "index": 0,
- "issue": null,
- "journal": "Letters in the Alphabet",
- "publisher": null,
- "title": "Everything is Wonderful",
- "url": null,
- "volume": "20"},
- { "authors": [],
- "date": "2011-03-28",
- "id": "b1",
- "index": 1,
- "issue": null,
- "journal": "The Dictionary",
- "publisher": null,
- "title": "All about Facts",
- "url": null,
- "volume": "14"}
- ],
- "abstract": "Everything you ever wanted to know about nothing",
- "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
- "acknowledgement": null,
- "annex": null
-}
-"""
- val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
- val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
- val MalformedGrobidString = GrobidString.replace("}", "")
-
- val CrossrefString =
-"""
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
- "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
- "date-time" : "2017-10-23T17:19:16Z",
- "timestamp" : { "$numberLong" : "1508779156477" } },
- "reference-count" : 0,
- "publisher" : "Elsevier BV",
- "issue" : "3",
- "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
- "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
- "date-time" : "1996-01-01T00:00:00Z",
- "timestamp" : { "$numberLong" : "820454400000" } },
- "delay-in-days" : 0, "content-version" : "tdm" }],
- "content-domain" : { "domain" : [], "crossmark-restriction" : false },
- "published-print" : { "date-parts" : [ [ 1996 ] ] },
- "DOI" : "<<DOI>>",
- "type" : "journal-article",
- "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
- "date-time" : "2002-07-25T15:09:41Z",
- "timestamp" : { "$numberLong" : "1027609781000" } },
- "page" : "186-187",
- "source" : "Crossref",
- "is-referenced-by-count" : 0,
- "title" : [ "<<TITLE>>" ],
- "prefix" : "10.1016",
- "volume" : "9",
- "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
- "member" : "78",
- "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
- "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
- "content-type" : "text/xml",
- "content-version" : "vor",
- "intended-application" : "text-mining" },
- { "URL" :
- "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
- "content-type" : "text/plain",
- "content-version" : "vor",
- "intended-application" : "text-mining" } ],
- "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
- "date-time" : "2015-09-03T10:03:43Z",
- "timestamp" : { "$numberLong" : "1441274623000" } },
- "score" : 1,
- "issued" : { "date-parts" : [ [ 1996 ] ] },
- "references-count" : 0,
- "alternative-id" : [ "0987-7983(96)87729-2" ],
- "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
- "ISSN" : [ "0987-7983" ],
- "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
- "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
-}
-"""
- val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
- val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
- val MalformedCrossrefString = CrossrefString.replace("}", "")
-
- // Unit tests
-
- "titleToSlug()" should "extract the parts of titles before a colon" in {
- val slug = HBaseCrossrefScore.titleToSlug("HELLO:there")
- slug should contain ("hello")
- }
-
- it should "extract an entire colon-less string" in {
- val slug = HBaseCrossrefScore.titleToSlug("hello THERE")
- slug should contain ("hello there")
- }
-
- it should "return None if given empty string" in {
- HBaseCrossrefScore.titleToSlug("") shouldBe None
- }
-
- "grobidToSlug()" should "get the right slug for a grobid json string" in {
- val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithTitle)
- slug should contain ("dummy example file")
- }
-
- it should "return None if given json string without title" in {
- val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
- slug shouldBe None
- }
-
- it should "return None if given a malformed json string" in {
- val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString)
- slug shouldBe None
- }
-
- it should "return None if given an empty json string" in {
- val slug = HBaseCrossrefScore.grobidToSlug("")
- slug shouldBe None
- }
-
- "crossrefToSlug()" should "get the right slug for a crossref json string" in {
- val slug = HBaseCrossrefScore.crossrefToSlug(CrossrefStringWithTitle)
- slug should contain ("sometitle")
- }
-
- it should "return None if given json string without title" in {
- val slug = HBaseCrossrefScore.grobidToSlug(CrossrefStringWithoutTitle)
- slug shouldBe None
- }
-
- it should "return None if given a malformed json string" in {
- val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
- slug shouldBe None
- }
-
- "removeAccents()" should "handle the empty string" in {
- HBaseCrossrefScore.removeAccents("") shouldBe ""
- }
-
- it should "not change a string with unaccented characters" in {
- HBaseCrossrefScore.removeAccents("abc123") shouldBe "abc123"
- }
-
- it should "remove accents from Ls" in {
- HBaseCrossrefScore.removeAccents("E\u0141\u0142en") shouldBe "ELlen"
- }
-
- it should "remove accents from Es without changing case" in {
- val result = HBaseCrossrefScore.removeAccents("\u00e9")
- result should have length 1
- result shouldBe "e"
- }
-
- it should "convert the ø in Soren" in {
- HBaseCrossrefScore.removeAccents("Søren") shouldBe "Soren"
- HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"
- }
-
- // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
- "stringDistance" should "work on empty strings" in {
- HBaseCrossrefScore.stringDistance("", "") shouldBe 0
- HBaseCrossrefScore.stringDistance("a", "") shouldBe 1
- HBaseCrossrefScore.stringDistance("", "a") shouldBe 1
- HBaseCrossrefScore.stringDistance("abc", "") shouldBe 3
- HBaseCrossrefScore.stringDistance("", "abc") shouldBe 3
- }
-
- it should "work on equal strings" in {
- HBaseCrossrefScore.stringDistance("", "") shouldBe 0
- HBaseCrossrefScore.stringDistance("a", "a") shouldBe 0
- HBaseCrossrefScore.stringDistance("abc", "abc") shouldBe 0
- }
-
- it should "work where only inserts are needed" in {
- HBaseCrossrefScore.stringDistance("", "a") shouldBe 1
- HBaseCrossrefScore.stringDistance("a", "ab") shouldBe 1
- HBaseCrossrefScore.stringDistance("b", "ab") shouldBe 1
- HBaseCrossrefScore.stringDistance("ac", "abc") shouldBe 1
- HBaseCrossrefScore.stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6
- }
-
- it should "work where only deletes are needed" in {
- HBaseCrossrefScore.stringDistance( "a", "") shouldBe 1
- HBaseCrossrefScore.stringDistance( "ab", "a") shouldBe 1
- HBaseCrossrefScore.stringDistance( "ab", "b") shouldBe 1
- HBaseCrossrefScore.stringDistance("abc", "ac") shouldBe 1
- HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6
- }
-
- it should "work where only substitutions are needed" in {
- HBaseCrossrefScore.stringDistance( "a", "b") shouldBe 1
- HBaseCrossrefScore.stringDistance( "ab", "ac") shouldBe 1
- HBaseCrossrefScore.stringDistance( "ac", "bc") shouldBe 1
- HBaseCrossrefScore.stringDistance("abc", "axc") shouldBe 1
- HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6
- }
-
- it should "work where many operations are needed" in {
- HBaseCrossrefScore.stringDistance("example", "samples") shouldBe 3
- HBaseCrossrefScore.stringDistance("sturgeon", "urgently") shouldBe 6
- HBaseCrossrefScore.stringDistance("levenshtein", "frankenstein") shouldBe 6
- HBaseCrossrefScore.stringDistance("distance", "difference") shouldBe 5
- HBaseCrossrefScore.stringDistance("java was neat", "scala is great") shouldBe 7
- }
-
- // Pipeline tests
- val output = "/tmp/testOutput"
- val input = "/tmp/testInput"
- val (testTable, testHost) = ("test-table", "dummy-host:2181")
-
- val grobidSampleData = List(
- List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
- List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
- List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
- Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
- Bytes.toBytes(MalformedGrobidString)))
-
- JobTest("sandcrawler.HBaseCrossrefScoreJob")
- .arg("test", "")
- .arg("app.conf.path", "app.conf")
- .arg("output", output)
- .arg("hbase-table", testTable)
- .arg("zookeeper-hosts", testHost)
- .arg("crossref-input", input)
- .arg("debug", "true")
- .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
- grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
- .source(TextLine(input), List(
- 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
- 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
- 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
- 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
- .sink[(Int, String, String, String, String)](TypedTsv[(Int,
- String, String, String, String)](output)) {
- // Grobid titles:
- // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
- // crossref slugs:
- // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
- // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
- outputBuffer =>
- "The pipeline" should "return a 4-element list" in {
- outputBuffer should have length 4
- }
-
- it should "return the right first entry" in {
- val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
- slug shouldBe "title 1"
- slug shouldBe slug0
- slug shouldBe slug1
- sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
- grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
- }
- }
- .run
- .finish
- */
-}