aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
blob: 620998e435c466428ba18204f3db12bc04655ac7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package sandcrawler

import cascading.tuple.Fields
import cascading.tuple.Tuple
import com.twitter.scalding.JobTest
import com.twitter.scalding.TextLine
import com.twitter.scalding.TupleConversions
import com.twitter.scalding.TypedTsv
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode

/**
 * Unit tests for `GrobidScorable.jsonToMapFeatures()` and `GrobidScorable.keepRecord()`.
 *
 * Every fixture is derived from a single JSON template (`GrobidString`) whose
 * title value is the placeholder `<<TITLE>>`; each variant below rewrites the
 * template to produce one specific input condition.
 */
class GrobidScorableTest extends FlatSpec with Matchers {
  // JSON template; "<<TITLE>>" is substituted (or removed) by the variants below.
  val GrobidString = """
{
  "title": "<<TITLE>>",
  "authors": [
    {"name": "Brewster Kahle"},
    {"name": "J Doe"}
  ],
  "journal": {
    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
    "eissn": null,
    "issn": null,
    "issue": null,
    "publisher": null,
    "volume": null
  },
  "date": "2000",
  "doi": null,
  "citations": [
    { "authors": [{"name": "A Seaperson"}],
      "date": "2001",
      "id": "b0",
      "index": 0,
      "issue": null,
      "journal": "Letters in the Alphabet",
      "publisher": null,
      "title": "Everything is Wonderful",
      "url": null,
      "volume": "20"},
    { "authors": [],
      "date": "2011-03-28",
      "id": "b1",
      "index": 1,
      "issue": null,
      "journal": "The Dictionary",
      "publisher": null,
      "title": "All about Facts",
      "url": null,
      "volume": "14"}
  ],
  "abstract": "Everything you ever wanted to know about nothing",
  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
  "acknowledgement": null,
  "annex": null
}
"""
  // Valid JSON with an ordinary title.
  val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
  // Valid JSON whose title is one character longer than Scorable.MaxTitleLength
  // (MaxTitleLength copies of "T" followed by "0").
  val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
  // Valid JSON where the title value is the JSON literal null (surrounding quotes removed).
  val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
  // Valid JSON with no "title" key: every "title" key, including those inside
  // the citations, is renamed to "nottitle".
  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
  // Invalid JSON: every closing brace is stripped from the template.
  val MalformedGrobidString = GrobidString.replace("}", "")
  val Key = "Dummy Key"

  // Unit tests

  "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
    result.slug shouldBe Scorable.NoSlug
  }

  it should "handle null title" in {
    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
    result.slug shouldBe Scorable.NoSlug
  }

  it should "handle missing title" in {
    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
    result.slug shouldBe Scorable.NoSlug
  }

  it should "handle valid input" in {
    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
    result.slug shouldBe "dummyexamplefile"
    Scorable.jsonToMap(result.json) match {
      case None => fail("result.json could not be parsed back into a map")
      case Some(map) => {
        map should contain key "title"
        map("title").asInstanceOf[String] shouldBe "Dummy Example File"
      }
    }
  }

  "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
    GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
  }

  it should "return false for valid JSON with excessively long title" in {
    GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
  }

  it should "return false for valid JSON with null title" in {
    GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
  }

  it should "return false for valid JSON with no title" in {
    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
  }

  it should "return false for invalid JSON" in {
    // Must use the malformed fixture: GrobidStringWithoutTitle is valid JSON
    // and is already covered by the "no title" case above.
    GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false
  }
}