// path: root/scalding/src/test/scala/sandcrawler/ScorableTest.scala
// blob: 713a7e5667cafe47a045f8d3f8e7a9a35e5787ca (plain)
package sandcrawler

import cascading.tuple.Fields
import cascading.tuple.Tuple
import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode

/**
 * Unit tests for the helper functions on `Scorable`: `titleToSlug`, `jsonToMap`
 * and `computeSimilarity`.
 *
 * Tests are registered while the class is being constructed, by the calls to
 * `performUnitTests()` and `performPipelineTests()` below. The latter currently
 * registers nothing because its body is commented out.
 */
class ScorableTest extends FlatSpec with Matchers {
  // JSON fixture shared by the tests. The "title" field holds the literal
  // placeholder "<<TITLE>>"; the active tests in this class use the string as is.
  val JsonString: String = """
{
  "title": "<<TITLE>>",
  "authors": [
    {"name": "Brewster Kahle"},
    {"name": "J Doe"}
  ],
  "journal": {
    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
    "eissn": null,
    "issn": null,
    "issue": null,
    "publisher": null,
    "volume": null
  },
  "date": "2000",
  "doi": null,
  "citations": [
    { "authors": [{"name": "A Seaperson"}],
      "date": "2001",
      "id": "b0",
      "index": 0,
      "issue": null,
      "journal": "Letters in the Alphabet",
      "publisher": null,
      "title": "Everything is Wonderful",
      "url": null,
      "volume": "20"},
    { "authors": [],
      "date": "2011-03-28",
      "id": "b1",
      "index": 1,
      "issue": null,
      "journal": "The Dictionary",
      "publisher": null,
      "title": "All about Facts",
      "url": null,
      "volume": "14"}
  ],
  "abstract": "Everything you ever wanted to know about nothing",
  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
  "acknowledgement": null,
  "annex": null
}
"""

  // Register the tests. The fixture above is only read inside the test bodies,
  // which run after construction has finished.
  performUnitTests()
  performPipelineTests()

  /** Registers the unit tests for the pure helper functions on `Scorable`. */
  def performUnitTests(): Unit = {
    "titleToSlug()" should "extract the parts of titles before a colon" in {
      Scorable.titleToSlug("HELLO:there") shouldBe "hello"
    }

    it should "extract an entire colon-less string" in {
      Scorable.titleToSlug("hello THERE") shouldBe "hello there"
    }

    it should "return Scorable.NoSlug if given empty string" in {
      Scorable.titleToSlug("") shouldBe Scorable.NoSlug
    }

    // Same subject as the tests above, so continue with `it`; the registered
    // test name ("titleToSlug() should strip punctuation") is unchanged.
    it should "strip punctuation" in {
      Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
      Scorable.titleToSlug("a:b:c") shouldBe "a"
      Scorable.titleToSlug(
        "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
    }

    "jsonToMap()" should "return a map, given a legal JSON string" in {
      Scorable.jsonToMap(JsonString) should not be (None)
    }

    it should "return None, given illegal JSON" in {
      Scorable.jsonToMap("illegal{,json{{") should be (None)
    }

    // The subject names the function actually exercised (computeSimilarity).
    "computeSimilarity()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
      val score = Scorable.computeSimilarity(
        new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
      score shouldBe Scorable.MaxScore
    }
  }

  /**
   * Placeholder for the end-to-end pipeline test. The body is entirely
   * commented out, so calling this method registers no tests.
   */
  def performPipelineTests(): Unit = {
    // Disabled job test, kept for reference only.
    // NOTE(review): it references GrobidString, MalformedGrobidString and
    // CrossrefString, none of which is defined in this class -- confirm where
    // those fixtures live (or add them) before re-enabling.
    /*
    val output = "/tmp/testOutput"
    val input = "/tmp/testInput"
    val (testTable, testHost) = ("test-table", "dummy-host:2181")

    val grobidSampleData = List(
      List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
        Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
      List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
        Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
      List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
        Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
      List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
        Bytes.toBytes(MalformedGrobidString)))

    JobTest("sandcrawler.HBaseCrossrefScoreJob")
      .arg("test", "")
      .arg("app.conf.path", "app.conf")
      .arg("output", output)
      .arg("hbase-table", testTable)
      .arg("zookeeper-hosts", testHost)
      .arg("crossref-input", input)
      .arg("debug", "true")
      .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
        grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
      .source(TextLine(input), List(
        0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
        1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
        2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
        3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
      .sink[(Int, String, String, String, String)](
        TypedTsv[(Int, String, String, String, String)](output)) {
        // Grobid titles:
        //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
        // Crossref titles:
        //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
        // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
        outputBuffer =>
        "The pipeline" should "return a 4-element list" in {
          outputBuffer should have length 4
        }
      }
      .run
      .finish
    */
  }
}