about summary refs log tree commit diff stats
path: root/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
commit 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch)
tree f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src/main/scala/sandcrawler/GrobidScorable.scala
parent 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff)
download sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz
sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 24
1 files changed, 15 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
}
}
}
-/*
- def fromBytesWritableLocal(f: Fields): Pipe = {
- asList(f)
- .foldLeft(pipe) { (p, fld) => {
- p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
- Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
- }
- }}
+}
+
+object GrobidScorable {
+ def grobidToSlug(json : String) : Option[String] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+ } else {
+ None
+ }
+ }
+ }
}
- */
}
+