diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 11:53:58 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 11:53:58 -0700 |
commit | 3e33d60aac9db78d0458876fbe987627db222bbb (patch) | |
tree | 05680bd5cfc53348c966f7a03235547a01c5c5d1 /scalding/src/main/scala | |
parent | c4db53036eac90841eb4f970b77db8c1677ef75b (diff) | |
download | sandcrawler-3e33d60aac9db78d0458876fbe987627db222bbb.tar.gz sandcrawler-3e33d60aac9db78d0458876fbe987627db222bbb.zip |
grobidToSlug() seems to work, including parsing of valid JSON strings.
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 58 |
1 files changed, 58 insertions, 0 deletions
// Source file: scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
// Original diff header: diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
//   b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
//   new file mode 100644, index 0000000..a22af81
package sandcrawler

import java.util.Properties

import scala.util.parsing.json.JSON

import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions

/**
 * Job that scans the `grobid0:tei_json` column of an HBase table and derives a
 * `slug` field from each row's JSON via [[HBaseCrossrefScore.grobidToSlug]].
 *
 * Required arguments (read from `args`):
 *  - `grobid-table`: name of the HBase table to scan
 *  - `zookeeper-hosts`: passed through to `HBaseBuilder.build`
 *
 * Work in progress: `grobidPipe` is built but never written anywhere; the
 * Crossref side and the output step only exist in the commented-out block below.
 */
class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {

  // key is SHA1
  val grobidSource = HBaseBuilder.build(
    args("grobid-table"),
    args("zookeeper-hosts"),
    List("grobid0:tei_json"),
    sourceMode = SourceMode.SCAN_ALL)

  // NOTE(review): the 'tei_json cell is consumed as a String here, yet
  // HBasePipeConversions is mixed in and never used. Presumably the raw cell is an
  // ImmutableBytesWritable that needs converting first -- confirm against the source.
  val grobidPipe = grobidSource
    .read
    .map('tei_json -> 'slug) {
      json: String => HBaseCrossrefScore.grobidToSlug(json)}

  /*
  val crossrefSource = TextLine(args("input"))
  val crossrefPipe = crossrefSource
    .read
    .map('line -> 'slug) {
      json : String => crossrefToSlug(json)}


  statusPipe.groupBy { identity }
    .size
    .debug
    .write(TypedTsv[(Long,Long)](args("output")))
  */
}

/** Pure helpers that turn GROBID JSON records into title slugs. */
object HBaseCrossrefScore {

  /**
   * Extracts the slug of the top-level `"title"` field of a JSON document.
   *
   * @param json JSON text, expected to be an object with a string `"title"` member
   * @return the slug computed by [[titleToSlug]], or `""` when `json` cannot be
   *         parsed, is not a JSON object, has no `"title"` member, or the title is
   *         not a string
   */
  def grobidToSlug(json: String): String =
    // https://stackoverflow.com/a/32717262/631051
    // parseFull yields None for unparseable input, so match instead of calling .get.
    JSON.parseFull(json) match {
      // Keys of a parsed JSON object are always strings; only the element types are
      // erased, hence @unchecked.
      case Some(fields: Map[String, Any] @unchecked) =>
        fields.get("title") match {
          case Some(title: String) => titleToSlug(title)
          case _ => ""
        }
      case _ => ""
    }

  /**
   * Returns the part of `title` that precedes its first colon.
   *
   * @param title the full title
   * @return everything before the first `':'`; the whole title when it contains no
   *         colon; `""` when the title is empty or starts with a colon
   */
  def titleToSlug(title: String): String =
    // split(":")(0) throws on titles made only of colons, because String.split
    // discards trailing empty strings and returns an empty array.
    title.takeWhile(_ != ':')
}