aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com>, 2018-07-25 20:33:38 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com>, 2018-07-25 20:33:38 -0700
commit: 0f0152189cf6df0f4b56d92149a60e902eb20be6 (patch)
tree: 8d22242ed5ae38caa086be16dee3a07a7fc44989 /scalding/src/main/scala
parent: 4b63570522e5ebbc73980356372c39ce7547ba68 (diff)
download: sandcrawler-0f0152189cf6df0f4b56d92149a60e902eb20be6.tar.gz
download: sandcrawler-0f0152189cf6df0f4b56d92149a60e902eb20be6.zip
Fixed bug with reading from TextLine. (Thanks, Bryan!) Still had to comment out some tests.
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 12
1 files changed, 6 insertions, 6 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index ac633e4..bcb6156 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -17,7 +17,6 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-
class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
HBasePipeConversions {
val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
@@ -29,6 +28,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
.read
.fromBytesWritable(new Fields("key", "tei_json"))
+ .debug
.toTypedPipe[(String, String)]('key, 'tei_json)
.map { entry =>
val (key, json) = (entry._1, entry._2)
@@ -41,24 +41,24 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val (slug, _, _) = entry
slug != NoTitle
}
+ .debug
+ .write(TypedTsv[(String, String, String)](args("output")))
+
+ /*
val grobidGroup = grobidPipe
.groupBy { case (slug, key, json) => slug }
-// .debug
-
val crossrefSource = TextLine(args("crossref-input"))
val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
.read
.toTypedPipe[String]('line)
.map{ json : String =>
-// val (offset, json) = entry
HBaseCrossrefScore.crossrefToSlug(json) match {
case Some(slug) => (slug, json)
case None => (NoTitle, json)
}
}
- .debug
.filter { entry =>
val (slug, json) = entry
slug != NoTitle
@@ -77,7 +77,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
// TODO: For now, output it all.
(slug, slug0, slug1, sha1, grobidJson, crossrefJson)}
.write(TypedTsv[(String, String, String, String, String, String)](args("output")))
-
+ */
}