aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-07-25 11:18:15 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-07-25 11:18:15 -0700
commit 773d5c28e2ac6085172aaebf86031358261a7014 (patch)
tree 55f3eb6e8323f206f7b8fe597cdb507381760a17 /scalding/src
parent 4c5dbdf964da9ca29246b0f8eadec6daae1d3ffb (diff)
download sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.tar.gz
sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.zip
Grobid entries without legal slugs are removed from the pipe.
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 14
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala 18
2 files changed, 22 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 56eb91e..7b7deec 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -15,7 +15,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
+ HBasePipeConversions {
+ val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
// key is SHA1
val grobidSource = HBaseCrossrefScore.getHBaseSource(
@@ -30,9 +32,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
val (key, json) = (entry._1, entry._2)
HBaseCrossrefScore.grobidToSlug(json) match {
case Some(slug) => (key, json, slug)
- case None => (key, json, "none")
+ case None => (key, json, NoTitle)
}
}
+ .filter { entry =>
+ val (_, _, slug) = entry
+ slug != NoTitle && slug.length > 0
+ }
.write(TypedTsv[(String, String, String)](args("output")))
/*
@@ -79,7 +85,7 @@ object HBaseCrossrefScore {
if (map contains "title") {
titleToSlug(map("title").asInstanceOf[String])
} else {
- Some("grobidToSlug None: " + map("foo"))
+ None
}
}
@@ -89,7 +95,7 @@ object HBaseCrossrefScore {
// TODO: Don't ignore titles after the first.
titleToSlug(map("title").asInstanceOf[List[String]](0))
} else {
- Some("crossRefToSlug None")
+ None
}
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 0d681b9..d70c8f2 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -163,7 +163,7 @@ class HBaseCrossrefScoreTest extends FunSpec with TupleConversions {
List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))),
List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))),
List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title4"))))
+ List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString)))
JobTest("sandcrawler.HBaseCrossrefScoreJob")
.arg("test", "")
@@ -180,13 +180,19 @@ class HBaseCrossrefScoreTest extends FunSpec with TupleConversions {
"1" -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))))
.sink[(String, String, String)](TypedTsv[(String, String, String)](output)) {
outputBuffer =>
- it("should return a 4-element list.") {
- assert(outputBuffer.size === 4)
+ it("should return a 3-element list.") {
+ assert(outputBuffer.size === 3)
}
- it("should return the right slugs.") {
- val (sha1, json, slug) = outputBuffer(0)
- assert(slug == "title1")
+ it("should return the right first slug.") {
+ val (_, _, slug0) = outputBuffer(0)
+ assert(slug0 == "title1")
}
+ /*
+ it("should return the right last slug.") {
+ val (_, _, slug3) = outputBuffer(3)
+ assert(slug3 == "foo")
+ }
+ */
}
.run
.finish