diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-09 19:03:01 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-09 19:03:01 -0700 |
commit | 9d7adc94ad63e85ffb2b459d4a8c2ed0ed46d8c8 (patch) | |
tree | 24cf1126815d7e0fc0b44261747a3320492c0640 /scalding/src/main/scala/sandcrawler/GrobidScorable.scala | |
parent | 25ade249538aade9dcd39d459bacdf43ea0a7dd6 (diff) | |
download | sandcrawler-9d7adc94ad63e85ffb2b459d4a8c2ed0ed46d8c8.tar.gz sandcrawler-9d7adc94ad63e85ffb2b459d4a8c2ed0ed46d8c8.zip |
WIP
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 15 |
1 files changed, 7 insertions, 8 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 95d6dae..4c67074 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -11,14 +11,9 @@ import parallelai.spyglass.hbase.HBaseSource class GrobidScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = { - // TODO: Clean up code after debugging. - val grobidSource = HBaseBuilder.build( - args("hbase-table"), - args("zookeeper-hosts"), - List("grobid0:tei_json"), - SourceMode.SCAN_ALL) - - grobidSource.read + // TODO: Generalize args so there can be multiple grobid pipes in one job. + GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts")) + .read .fromBytesWritable(new Fields("key", "tei_json")) // TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala) // didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json) @@ -34,6 +29,10 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } object GrobidScorable { + def getHBaseSource(table : String, host : String) : HBaseSource = { + HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL) + } + def grobidToSlug(json : String) : Option[String] = { Scorable.jsonToMap(json) match { case None => None |