diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 13:53:17 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 13:53:17 -0700 |
commit | 8a63e05c18bbf84dddccd5596f9e0aefbf469789 (patch) | |
tree | ed420287944c8f0984cf3e8b27a0da86e1053fe1 /scalding/src/main | |
parent | dae965840db388c53b969d76849e5e8e9569ceee (diff) | |
download | sandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.tar.gz sandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.zip |
Added grobidToSlug().
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 20 |
1 files changed, 16 insertions, 4 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index d3e78fe..30f76a0 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -26,14 +26,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv .map('tei_json -> 'slug) { json : String => HBaseCrossrefScore.grobidToSlug(json)} - /* val crossrefSource = TextLine(args("input")) val crossrefPipe = crossrefSource .read .map('line -> 'slug) { - json : String => crossrefToSlug(json)} - + json : String => HBaseCrossrefScore.crossrefToSlug(json)} +/* statusPipe.groupBy { identity } .size .debug @@ -56,7 +55,20 @@ object HBaseCrossrefScore { } } + def crossrefToSlug(json : String) : Option[String] = { + val jsonObject = JSON.parseFull(json) + if (jsonObject == None) { + None + } else { + val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]] + globalMap.get("title") match { + case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0))) + case None => None + } + } + } + def titleToSlug(title : String) : String = { - title.split(":")(0) + title.split(":")(0).toLowerCase() } } |