aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-11 21:03:53 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-11 21:03:53 -0700
commit 728e50a33cec921c9a624439f2e1c8561a6e12ce (patch)
tree 671548fe0e4bd38badb76453c0a1a90dea5e0ce7 /scalding/src/main/scala/sandcrawler/GrobidScorable.scala
parent 768e7ef0d127cf55119543be6e656751704ca5b2 (diff)
download sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.tar.gz
sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.zip
It compiles.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 21
1 files changed, 10 insertions, 11 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 61055f2..de9f51a 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -1,5 +1,6 @@
package sandcrawler
+import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
import cascading.pipe.Pipe
import cascading.tuple.Fields
@@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
.read
.fromBytesWritable(new Fields("key", "tei_json"))
.toTypedPipe[(String, String)](new Fields("key", "tei_json"))
- .map { entry =>
- val (key : String, json : String) = (entry._1, entry._2)
- GrobidScorable.grobidToSlug(json) match {
- case Some(slug) => new MapFeatures(slug, json)
- case None => new MapFeatures(Scorable.NoSlug, json)
- }
- }
+ .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
}
}
@@ -36,14 +31,18 @@ object GrobidScorable {
HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL)
}
- def grobidToSlug(json : String) : Option[String] = {
+ def jsonToMapFeatures(key : String, json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
- case None => None
+ case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+ val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"),
+ sha1=key)
+ new MapFeatures(
+ Scorable.mapToSlug(map2),
+ JSONObject(map2).toString)
} else {
- None
+ MapFeatures(Scorable.NoSlug, json)
}
}
}