1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
package sandcrawler
import java.util.Arrays
import java.util.Properties
import scala.util.parsing.json.JSON
import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.CoGrouped
import com.twitter.scalding.typed.Grouped
import com.twitter.scalding.typed.TDsl._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
/**
 * Scalding job that joins GROBID output stored in HBase against a Crossref
 * dump, using a slug derived from each record's title as the join key.
 *
 * Arguments read from `args`:
 *  - `hbase-table`: name of the HBase table to read
 *  - `zookeeper-hosts`: ZooKeeper host(s) handed to the HBase source
 *  - `crossref-input`: text input, one Crossref JSON record per line
 *  - `output`: TSV destination with columns
 *    (join slug, GROBID slug, Crossref slug, HBase row key, GROBID JSON, Crossref JSON)
 */
class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
    HBasePipeConversions {
  // Sentinel slug formerly substituted for records without a usable title.
  // The pipes below now drop such records directly; the member is kept so
  // that any external reference to it continues to compile.
  val NoTitle = "NO TITLE"

  // key is SHA1
  val grobidSource = HBaseCrossrefScore.getHBaseSource(
    args("hbase-table"),
    args("zookeeper-hosts"))

  // (slug, row key, GROBID JSON). Rows for which no slug can be derived are
  // dropped, since they cannot take part in the join.
  val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
    .read
    .fromBytesWritable(new Fields("key", "tei_json"))
    .toTypedPipe[(String, String)]('key, 'tei_json)
    .flatMap { case (key, json) =>
      HBaseCrossrefScore.grobidToSlug(json).map { slug => (slug, key, json) }.toList
    }

  val grobidGroup = grobidPipe
    .groupBy { case (slug, _, _) => slug }

  val crossrefSource = TextLine(args("crossref-input"))

  // (slug, Crossref JSON). Lines for which no slug can be derived are dropped.
  // Note: no `.debug` here -- it would echo every input record to the task logs.
  val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
    .read
    .toTypedPipe[String]('line)
    .flatMap { json : String =>
      HBaseCrossrefScore.crossrefToSlug(json).map { slug => (slug, json) }.toList
    }

  val crossrefGroup = crossrefPipe
    .groupBy { case (slug, _) => slug }

  // TODO: Figure out which is smaller.
  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
    grobidGroup.join(crossrefGroup)

  theJoin.map { case (slug, ((slug0, sha1, grobidJson), (slug1, crossrefJson))) =>
    // TODO: For now, output it all.
    (slug, slug0, slug1, sha1, grobidJson, crossrefJson)
  }
    .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
}
/**
 * Helpers for the Crossref scoring job: HBase source construction and
 * derivation of title slugs from GROBID and Crossref JSON records.
 */
object HBaseCrossrefScore {
  /**
   * Builds an HBase source over the `grobid0:tei_json` column using
   * `SourceMode.SCAN_ALL`.
   *
   * @param hbaseTable     HBase table name
   * @param zookeeperHosts HBase ZooKeeper server(s)
   */
  def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = HBaseBuilder.build(
    hbaseTable,      // HBase Table Name
    zookeeperHosts,  // HBase Zookeeper server (to get runtime config info; can be array?)
    List("grobid0:tei_json"),
    SourceMode.SCAN_ALL)

  /**
   * Placeholder: ignores both JSON documents and returns the given `sha1`
   * together with two fixed strings.
   */
  def performJoin(grobidJson : String, crossRefJson : String, sha1 : String) : (String, String, String) = {
    (sha1, "1.2.3.4", "100")
  }

  /**
   * Parses `json` into a map of its top-level fields.
   *
   * @return the parsed object; if `json` cannot be parsed, or parses to
   *         something other than a JSON object (e.g. a top-level array),
   *         a single-entry map `"malformed json" -> json`
   */
  def jsonToMap(json : String) : Map[String, Any] = {
    // https://stackoverflow.com/a/32717262/631051
    JSON.parseFull(json) match {
      // JSON object keys are always strings, so the cast is safe after the type test.
      case Some(obj : Map[_, _]) => obj.asInstanceOf[Map[String, Any]]
      // Unparseable input, or valid JSON that is not an object.
      case _ => Map[String, Any]("malformed json" -> json)
    }
  }

  /**
   * Derives the slug for a GROBID record, whose `title` field is expected to
   * be a single string.
   *
   * @return `None` when the title is absent, not a string, or yields an empty slug
   */
  def grobidToSlug(json : String) : Option[String] = {
    jsonToMap(json).get("title") match {
      case Some(title : String) => titleToSlug(title)
      case _ => None
    }
  }

  /**
   * Derives the slug for a Crossref record, whose `title` field is expected to
   * be a list of strings; only the first entry is used.
   *
   * @return `None` when the title is absent, not a list, empty, its first
   *         entry is not a string, or that entry yields an empty slug
   */
  def crossrefToSlug(json : String) : Option[String] = {
    jsonToMap(json).get("title") match {
      // TODO: Don't ignore titles after the first.
      case Some(titles : List[_]) =>
        titles.headOption
          .collect { case first : String => first }
          .flatMap(titleToSlug)
      // No usable title means no slug, consistent with grobidToSlug.
      case _ => None
    }
  }

  /**
   * Reduces a title to its slug: the text before the first `:`, lower-cased.
   *
   * @return `None` when `title` is null or the resulting slug is empty
   *         (including titles that start with, or consist only of, colons)
   */
  def titleToSlug(title : String) : Option[String] = {
    Option(title)
      // takeWhile, unlike split(":")(0), cannot fail on a colon-only title.
      .map { t => t.takeWhile(_ != ':').toLowerCase() }
      .filter(_.nonEmpty)
  }
}
|