diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 |
commit | 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch) | |
tree | f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src/main/scala/sandcrawler/GrobidScorable.scala | |
parent | 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff) | |
download | sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip |
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 24 |
1 file changed, 15 insertions, 9 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       }
     }
   }
-/*
-  def fromBytesWritableLocal(f: Fields): Pipe = {
-    asList(f)
-      .foldLeft(pipe) { (p, fld) => {
-        p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
-          Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
-        }
-      }}
+}
+
+object GrobidScorable {
+  def grobidToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+        } else {
+          None
+        }
+      }
+    }
   }
- */
 }
+
```