diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 13:53:17 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 13:53:17 -0700 | 
| commit | 8a63e05c18bbf84dddccd5596f9e0aefbf469789 (patch) | |
| tree | ed420287944c8f0984cf3e8b27a0da86e1053fe1 /scalding/src/main/scala | |
| parent | dae965840db388c53b969d76849e5e8e9569ceee (diff) | |
| download | sandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.tar.gz sandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.zip | |
Added grobidToSlug().
Diffstat (limited to 'scalding/src/main/scala')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 20 | 
1 file changed, 16 insertions, 4 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index d3e78fe..30f76a0 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -26,14 +26,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
     .map('tei_json -> 'slug) {
       json : String => HBaseCrossrefScore.grobidToSlug(json)}
-  /*
   val crossrefSource = TextLine(args("input"))
   val crossrefPipe = crossrefSource
     .read
     .map('line -> 'slug) {
-      json : String => crossrefToSlug(json)}
-
+      json : String => HBaseCrossrefScore.crossrefToSlug(json)}
+/*
   statusPipe.groupBy { identity }
     .size
     .debug
@@ -56,7 +55,20 @@ object HBaseCrossrefScore {
     }
   }
+  def crossrefToSlug(json : String) : Option[String] = {
+    val jsonObject = JSON.parseFull(json)
+    if (jsonObject == None) {
+      None
+    } else {
+      val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
+      globalMap.get("title") match {
+        case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0)))
+        case None => None
+      }
+    }
+  }
+
   def titleToSlug(title : String) : String = {
-    title.split(":")(0)
+    title.split(":")(0).toLowerCase()
   }
 }
```
