diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-10 20:49:44 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-10 20:49:44 -0700 |
commit | 768e7ef0d127cf55119543be6e656751704ca5b2 (patch) | |
tree | 27df4f067ebe693275f4995ac271660f5ac676d9 /scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | |
parent | b7f77f6337b450406ae0a90b81faeba27394afb0 (diff) | |
download | sandcrawler-768e7ef0d127cf55119543be6e656751704ca5b2.tar.gz sandcrawler-768e7ef0d127cf55119543be6e656751704ca5b2.zip |
Tests pass. Still have changes to do but made huge progress.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/CrossrefScorable.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 38 |
1 files changed, 22 insertions, 16 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 817bee5..b2f6537 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -9,6 +9,7 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 import TDsl._
+import scala.util.parsing.json.JSONObject
 
 import java.text.Normalizer
 import java.util.Arrays
@@ -31,7 +32,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 
 class CrossrefScorable extends Scorable with HBasePipeConversions {
-  // TODO: Generalize args so there can be multiple Grobid pipes in one job.
+  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
   def getSource(args : Args) : Source = {
     TextLine(args("crossref-input"))
   }
@@ -39,26 +40,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
     getSource(args).read
       .toTypedPipe[String](new Fields("line"))
-      .map{ json : String =>
-        CrossrefScorable.crossrefToSlug(json) match {
-          case Some(slug) => new MapFeatures(slug, json)
+      .map{ json : String =>
+        CrossrefScorable.simplifyJson(json) match {
           case None => new MapFeatures(Scorable.NoSlug, json)
+          case Some(map) => new MapFeatures(
+            Scorable.titleToSlug(map("title").asInstanceOf[String]),
+            JSONObject(map).toString)
         }
       }
   }
-}
 
-object CrossrefScorable {
-  def crossrefToSlug(json : String) : Option[String] = {
-    Scorable.jsonToMap(json) match {
-      case None => None
-      case Some(map) => {
-        if (map contains "title") {
-          // TODO: Don't ignore titles after the first.
-          val title = map("title").asInstanceOf[List[String]](0)
-          Some(Scorable.titleToSlug(title))
-        } else {
-          None
+  object CrossrefScorable {
+    def simplifyJson(json : String) : Option[Map[String, Any]] = {
+      Scorable.jsonToMap(json) match {
+        case None => None
+        case Some(map) => {
+          if (map contains "title") {
+            val titles = map("title").asInstanceOf[List[String]]
+            if (titles.isEmpty) {
+              None
+            } else {
+              Some(Map("title" -> titles(0)))
+            }
+          } else {
+            None
+          }
         }
       }
     }
```