aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
diff options
context:
space:
mode:
authorEllen Spertus <ellen.spertus@gmail.com>2018-08-10 20:49:44 -0700
committerEllen Spertus <ellen.spertus@gmail.com>2018-08-10 20:49:44 -0700
commit768e7ef0d127cf55119543be6e656751704ca5b2 (patch)
tree27df4f067ebe693275f4995ac271660f5ac676d9 /scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
parentb7f77f6337b450406ae0a90b81faeba27394afb0 (diff)
downloadsandcrawler-768e7ef0d127cf55119543be6e656751704ca5b2.tar.gz
sandcrawler-768e7ef0d127cf55119543be6e656751704ca5b2.zip
Tests pass. Still have changes to do but made huge progress.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/CrossrefScorable.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/CrossrefScorable.scala38
1 files changed, 22 insertions, 16 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 817bee5..b2f6537 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -9,6 +9,7 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
import TDsl._
+import scala.util.parsing.json.JSONObject
import java.text.Normalizer
import java.util.Arrays
@@ -31,7 +32,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
class CrossrefScorable extends Scorable with HBasePipeConversions {
- // TODO: Generalize args so there can be multiple Grobid pipes in one job.
+ // TODO: Generalize args so there can be multiple Crossref pipes in one job.
def getSource(args : Args) : Source = {
TextLine(args("crossref-input"))
}
@@ -39,26 +40,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
- .map{ json : String =>
- CrossrefScorable.crossrefToSlug(json) match {
- case Some(slug) => new MapFeatures(slug, json)
+ .map{ json : String =>
+ CrossrefScorable.simplifyJson(json) match {
case None => new MapFeatures(Scorable.NoSlug, json)
+ case Some(map) => new MapFeatures(
+ Scorable.titleToSlug(map("title").asInstanceOf[String]),
+ JSONObject(map).toString)
}
}
}
-}
-object CrossrefScorable {
- def crossrefToSlug(json : String) : Option[String] = {
- Scorable.jsonToMap(json) match {
- case None => None
- case Some(map) => {
- if (map contains "title") {
- // TODO: Don't ignore titles after the first.
- val title = map("title").asInstanceOf[List[String]](0)
- Some(Scorable.titleToSlug(title))
- } else {
- None
+ object CrossrefScorable {
+ def simplifyJson(json : String) : Option[Map[String, Any]] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles.isEmpty) {
+ None
+ } else {
+ Some(Map("title" -> titles(0)))
+ }
+ } else {
+ None
+ }
}
}
}