about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-12 18:41:27 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-12 18:41:27 -0700
commit 05c0213547f29842bbae6faaf77e983a364d4a2e (patch)
tree 5001b54b1fe8dc9c01d8142e82de684dd561eef7 /scalding
parent 31354b1a6062c5c56a30610f68fa48c82a7e83f0 (diff)
download sandcrawler-05c0213547f29842bbae6faaf77e983a364d4a2e.tar.gz
sandcrawler-05c0213547f29842bbae6faaf77e983a364d4a2e.zip
Added back file I shouldn't have deleted.
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 22
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 89
2 files changed, 89 insertions, 22 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5113b0c..667a5cc 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -64,25 +64,3 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
}
}
}
-
-/*
-object CrossrefScorable {
- def simplifyJson(json : String) : Option[Map[String, Any]] = {
- Scorable.jsonToMap(json) match {
- case None => None
- case Some(map) => {
- if (map contains "title") {
- val titles = map("title").asInstanceOf[List[String]]
- if (titles.isEmpty) {
- None
- } else {
- Some(Map("title" -> titles(0)))
- }
- } else {
- None
- }
- }
- }
- }
-}
- */
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
new file mode 100644
index 0000000..1c35d66
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -0,0 +1,89 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class CrossrefScorableTest extends FlatSpec with Matchers {
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "<<DOI>>",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "<<TITLE>>" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+ val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+
+ // Unit tests
+ "simplifyJson()" should "return None for bad JSON" in {
+ CrossrefScorable.simplifyJson("") shouldBe None
+ CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
+ }
+
+ it should "return None for JSON lacking title" in {
+ CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
+ }
+
+ it should "return appropriate result for valid JSON" in {
+ CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
+ case None => fail("None unexpectedly returned by simplifyJson")
+ case Some(map) => {
+ Scorable.isScorableMap(map) shouldBe true
+ map.size shouldBe 1
+ map.keys should contain ("title")
+ map("title") shouldBe "SomeTitle"
+ }
+ }
+ }
+}