about summary refs log tree commit diff stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 11
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 89
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 20
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 28
4 files changed, 39 insertions, 109 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9c8da69..929461b 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -66,13 +66,14 @@ object Scorable {
// This guarantees it will have all of the fields needed to compute
// the ultimate score, which are a superset of those needed for a slug.
def mapToSlug(map : Map[String, Any]) : String = {
- val unaccented = StringUtilities.removeAccents(getString(map, "title"))
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
- if (slug.isEmpty || slug == null) {
+ val title = getString(map, "title")
+ if (title == null) {
NoSlug
} else {
- slug
+ val unaccented = StringUtilities.removeAccents(title)
+ // Remove punctuation after splitting on colon.
+ val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+ if (slug.isEmpty || slug == null) NoSlug else slug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
deleted file mode 100644
index 1c35d66..0000000
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-package sandcrawler
-
-import cascading.tuple.Fields
-import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import org.scalatest._
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-
-class CrossrefScorableTest extends FlatSpec with Matchers {
- val CrossrefString =
-"""
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
- "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
- "date-time" : "2017-10-23T17:19:16Z",
- "timestamp" : { "$numberLong" : "1508779156477" } },
- "reference-count" : 0,
- "publisher" : "Elsevier BV",
- "issue" : "3",
- "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
- "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
- "date-time" : "1996-01-01T00:00:00Z",
- "timestamp" : { "$numberLong" : "820454400000" } },
- "delay-in-days" : 0, "content-version" : "tdm" }],
- "content-domain" : { "domain" : [], "crossmark-restriction" : false },
- "published-print" : { "date-parts" : [ [ 1996 ] ] },
- "DOI" : "<<DOI>>",
- "type" : "journal-article",
- "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
- "date-time" : "2002-07-25T15:09:41Z",
- "timestamp" : { "$numberLong" : "1027609781000" } },
- "page" : "186-187",
- "source" : "Crossref",
- "is-referenced-by-count" : 0,
- "title" : [ "<<TITLE>>" ],
- "prefix" : "10.1016",
- "volume" : "9",
- "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
- "member" : "78",
- "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
- "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
- "content-type" : "text/xml",
- "content-version" : "vor",
- "intended-application" : "text-mining" },
- { "URL" :
- "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
- "content-type" : "text/plain",
- "content-version" : "vor",
- "intended-application" : "text-mining" } ],
- "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
- "date-time" : "2015-09-03T10:03:43Z",
- "timestamp" : { "$numberLong" : "1441274623000" } },
- "score" : 1,
- "issued" : { "date-parts" : [ [ 1996 ] ] },
- "references-count" : 0,
- "alternative-id" : [ "0987-7983(96)87729-2" ],
- "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
- "ISSN" : [ "0987-7983" ],
- "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
- "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
-}
-"""
- val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
- val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
- val MalformedCrossrefString = CrossrefString.replace("}", "")
-
- // Unit tests
- "simplifyJson()" should "return None for bad JSON" in {
- CrossrefScorable.simplifyJson("") shouldBe None
- CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
- }
-
- it should "return None for JSON lacking title" in {
- CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
- }
-
- it should "return appropriate result for valid JSON" in {
- CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
- case None => fail("None unexpectedly returned by simplifyJson")
- case Some(map) => {
- Scorable.isScorableMap(map) shouldBe true
- map.size shouldBe 1
- map.keys should contain ("title")
- map("title") shouldBe "SomeTitle"
- }
- }
- }
-}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 5bb955a..3fcd856 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers {
val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
val MalformedGrobidString = GrobidString.replace("}", "")
+ val Key = "Dummy Key"
// Unit tests
"GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
- val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None
+ val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
result.slug shouldBe Scorable.NoSlug
- result.json shouldBe MalformedGrobidString
}
- "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in {
- val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None
+ it should "handle missing title" in {
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
result.slug shouldBe Scorable.NoSlug
- result.json shouldBe GrobidStringWithoutTitle
+ }
+
+ it should "handle valid input" in {
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+ result.slug shouldBe "dummyexamplefile"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ }
+ }
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 2f80492..95faacc 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
+ private def titleToSlug(s : String) : String = {
+ Scorable.mapToSlug(Scorable.toScorableMap(title = s))
+ }
- "titleToSlug()" should "extract the parts of titles before a colon" in {
- Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+ "mapToSlug()" should "extract the parts of titles before a colon" in {
+ titleToSlug("HELLO:there") shouldBe "hello"
}
it should "extract an entire colon-less string" in {
- Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+ titleToSlug("hello THERE") shouldBe "hellothere"
}
it should "return Scorable.NoSlug if given empty string" in {
- Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+ titleToSlug("") shouldBe Scorable.NoSlug
}
it should "return Scorable.NoSlug if given null" in {
- Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
+ titleToSlug(null) shouldBe Scorable.NoSlug
+ }
+
+ it should "strip punctuation" in {
+ titleToSlug("HELLO!:the:re") shouldBe "hello"
+ titleToSlug("a:b:c") shouldBe "a"
+ titleToSlug(
+ "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
}
- "titleToSlug()" should "strip punctuation" in {
- Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
- Scorable.titleToSlug("a:b:c") shouldBe "a"
- Scorable.titleToSlug(
- "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+ it should "remove whitespace" in {
+ titleToSlug("foo bar : baz ::") shouldBe "foobar"
+ titleToSlug("\na\t:b:c") shouldBe "a"
}
"jsonToMap()" should "return a map, given a legal JSON string" in {