// scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala

package sandcrawler

import java.util.Arrays
import java.util.Properties

import scala.util.parsing.json.JSON

import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.CoGrouped
import com.twitter.scalding.typed.Grouped
import com.twitter.scalding.typed.TDsl._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource


/**
 * Scalding job that joins GROBID records read from HBase against Crossref
 * records read from a text file, using a title-derived slug as the join key.
 *
 * Arguments read from `args`:
 *  - `hbase-table`: HBase table holding the GROBID output
 *  - `zookeeper-hosts`: ZooKeeper hosts handed to the HBase source
 *  - `crossref-input`: text input, one Crossref JSON document per line
 *  - `output`: destination of the TSV result
 *
 * Every joined pair is written as
 * (slug, grobid slug, crossref slug, sha1, grobid JSON, crossref JSON).
 */
class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
    HBasePipeConversions {
  // Sentinel slug for records without a usable title. The pipes below drop
  // such records via Option/flatMap and never emit this value; the member is
  // kept so that external references to it continue to compile.
  val NoTitle = "NO TITLE"

  // key is SHA1
  val grobidSource = HBaseCrossrefScore.getHBaseSource(
    args("hbase-table"),
    args("zookeeper-hosts"))

  // (slug, sha1 key, GROBID JSON); rows for which no slug can be derived are dropped.
  val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
    .read
    .fromBytesWritable(new Fields("key", "tei_json"))
    .toTypedPipe[(String, String)]('key, 'tei_json)
    .flatMap { case (key, json) =>
      HBaseCrossrefScore.grobidToSlug(json).map { slug => (slug, key, json) }
    }

  val grobidGroup = grobidPipe
    .groupBy { case (slug, _, _) => slug }

  val crossrefSource = TextLine(args("crossref-input"))

  // (slug, Crossref JSON); lines for which no slug can be derived are dropped.
  val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
    .read
    .toTypedPipe[String]('line)
    .flatMap { json : String =>
      HBaseCrossrefScore.crossrefToSlug(json).map { slug => (slug, json) }
    }

  val crossrefGroup = crossrefPipe
    .groupBy { case (slug, _) => slug }

  // TODO: Figure out which is smaller.
  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
    grobidGroup.join(crossrefGroup)

  theJoin
    .map { case (slug, ((slug0, sha1, grobidJson), (slug1, crossrefJson))) =>
      // TODO: For now, output it all.
      (slug, slug0, slug1, sha1, grobidJson, crossrefJson)
    }
    .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
}

/**
 * Helpers for [[HBaseCrossrefScoreJob]]: HBase source construction and
 * derivation of join slugs from GROBID and Crossref JSON documents.
 */
object HBaseCrossrefScore {
  /**
   * Builds the HBase source for the GROBID data.
   *
   * @param hbaseTable     HBase table name
   * @param zookeeperHosts HBase Zookeeper server (to get runtime config info; can be array?)
   * @return a source requesting the `grobid0:tei_json` column in `SCAN_ALL` mode
   */
  def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = HBaseBuilder.build(
    hbaseTable,
    zookeeperHosts,
    List("grobid0:tei_json"),
    SourceMode.SCAN_ALL)

  /**
   * Placeholder for the scoring step: ignores both JSON documents and returns
   * the given `sha1` together with two fixed dummy values.
   */
  def performJoin(grobidJson : String, crossRefJson : String, sha1 : String) : (String, String, String) = {
    (sha1, "1.2.3.4", "100")
  }

  /**
   * Parses `json` into a map.
   *
   * @return the parsed object when `json` is a JSON object; otherwise (parse
   *         failure, or a top-level value that is not an object) a single-entry
   *         map `"malformed json" -> json`
   */
  def jsonToMap(json : String) : Map[String, Any] = {
    // https://stackoverflow.com/a/32717262/631051
    JSON.parseFull(json) match {
      // JSON object keys are always strings, so only the erased type
      // arguments are being asserted by this cast.
      case Some(obj : Map[_, _]) => obj.asInstanceOf[Map[String, Any]]
      // Unparseable input and non-object documents (e.g. a top-level array)
      // share the same marker instead of raising ClassCastException.
      case _ => Map[String, Any]("malformed json" -> json)
    }
  }

  /**
   * Derives the slug of a GROBID document from its `title` field.
   *
   * @return `None` when the title is absent, null, not a string, or yields an
   *         empty slug
   */
  def grobidToSlug(json : String) : Option[String] = {
    jsonToMap(json).get("title") match {
      case Some(title : String) => titleToSlug(title)
      case _ => None
    }
  }

  /**
   * Derives the slug of a Crossref document from the first entry of its
   * `title` list.
   *
   * @return `None` when the title is absent, is not a list, is an empty list,
   *         has a non-string first entry, or yields an empty slug
   */
  def crossrefToSlug(json : String) : Option[String] = {
    jsonToMap(json).get("title") match {
      // TODO: Don't ignore titles after the first.
      case Some(titles : List[_]) =>
        titles.headOption
          .collect { case first : String => first }
          .flatMap(titleToSlug)
      case _ => None
    }
  }

  /**
   * Turns a title into a slug: the text before the first ':' in lower case.
   *
   * @return `None` when `title` is null or the resulting slug is empty
   */
  def titleToSlug(title : String) : Option[String] = {
    Option(title)
      // String.split drops trailing empty strings, so a title made only of
      // colons yields an empty array; headOption covers that case.
      .flatMap(_.split(":").headOption)
      .map(_.toLowerCase())
      .filter(_.nonEmpty)
  }
}