aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main
diff options
context:
space:
mode:
authorEllen Spertus <ellen.spertus@gmail.com>2018-07-24 13:53:17 -0700
committerEllen Spertus <ellen.spertus@gmail.com>2018-07-24 13:53:17 -0700
commit8a63e05c18bbf84dddccd5596f9e0aefbf469789 (patch)
treeed420287944c8f0984cf3e8b27a0da86e1053fe1 /scalding/src/main
parentdae965840db388c53b969d76849e5e8e9569ceee (diff)
downloadsandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.tar.gz
sandcrawler-8a63e05c18bbf84dddccd5596f9e0aefbf469789.zip
Added grobidToSlug().
Diffstat (limited to 'scalding/src/main')
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala20
1 files changed, 16 insertions, 4 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index d3e78fe..30f76a0 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -26,14 +26,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
.map('tei_json -> 'slug) {
json : String => HBaseCrossrefScore.grobidToSlug(json)}
- /*
val crossrefSource = TextLine(args("input"))
val crossrefPipe = crossrefSource
.read
.map('line -> 'slug) {
- json : String => crossrefToSlug(json)}
-
+ json : String => HBaseCrossrefScore.crossrefToSlug(json)}
+/*
statusPipe.groupBy { identity }
.size
.debug
@@ -56,7 +55,20 @@ object HBaseCrossrefScore {
}
}
+ /** Derives a slug from the first title of a Crossref JSON record.
+   *
+   * Parses `json`, looks up the top-level "title" field and passes its first
+   * element to `titleToSlug`.
+   *
+   * @param json one Crossref record serialized as a JSON string
+   * @return Some(slug) when the record is a JSON object whose "title" field is
+   *         a non-empty list starting with a string; None when the input is
+   *         not valid JSON, is not a JSON object, has no "title" field, or the
+   *         field is empty or of an unexpected shape
+   */
+ def crossrefToSlug(json : String) : Option[String] = {
+   JSON.parseFull(json) match {
+     // parseFull yields Map[String, Any] for JSON objects; keys are always strings.
+     case Some(record : Map[String @unchecked, _]) =>
+       record.get("title") match {
+         // Only the first title is used; an empty list or a non-list value
+         // (string, number, null) yields None instead of throwing.
+         case Some((title : String) :: _) => Some(titleToSlug(title))
+         case _ => None
+       }
+     // Unparseable input or a non-object top-level value (array, string, number).
+     case _ => None
+   }
+ }
+
def titleToSlug(title : String) : String = {
- title.split(":")(0)
+ title.split(":")(0).toLowerCase()
}
}