diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 |
commit | 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch) | |
tree | f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src/main | |
parent | 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff) | |
download | sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip |
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 19 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 24 |
2 files changed, 33 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 0849aff..cf5849c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable { .read .toTypedPipe[String](new Fields("line")) .map{ json : String => - HBaseCrossrefScore.crossrefToSlug(json) match { + CrossrefScorable.crossrefToSlug(json) match { case Some(slug) => new MapFeatures(slug, json) case None => new MapFeatures(Scorable.NoSlug, json) } } } } + +object CrossrefScorable { + def crossrefToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + // TODO: Don't ignore titles after the first. + val title = map("title").asInstanceOf[List[String]](0) + Some(Scorable.titleToSlug(title)) + } else { + None + } + } + } + } +} diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 8da7708..25e5985 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } } } -/* - def fromBytesWritableLocal(f: Fields): Pipe = { - asList(f) - .foldLeft(pipe) { (p, fld) => { - p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable => - Option(from).map(x => Bytes.toString(x.get)).getOrElse(null) - } - }} +} + +object GrobidScorable { + def grobidToSlug(json : String) : Option[String] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) + } else { + None + } + } + } } - */ } + |