about summary refs log tree commit diff stats
path: root/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/CrossrefScorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 15
1 files changed, 9 insertions, 6 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index f51c210..c13945f 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -17,7 +17,7 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
TextLine(args("crossref-input"))
}
- def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
.filter { CrossrefScorable.keepRecord(_) }
@@ -116,22 +116,25 @@ object CrossrefScorable {
}
}
- def jsonToMapFeatures(json : String) : MapFeatures = {
+ def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
Scorable.jsonToMap(json) match {
- case None => MapFeatures(Scorable.NoSlug, json)
+ case None => None
case Some(map) =>
mapToTitle(map) match {
- case None => MapFeatures(Scorable.NoSlug, json)
+ case None => None
case Some(title) => {
val doi = Scorable.getString(map, "DOI")
val authors: List[String] = mapToAuthorList(map)
val year: Int = mapToYear(map).getOrElse(0)
val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
- MapFeatures(Scorable.NoSlug, json)
+ None
} else {
val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
- MapFeatures(sf.toSlug, sf.toString)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
}
}
}