about summary refs log tree commit diff stats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 100
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 10
2 files changed, 62 insertions, 48 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index c13945f..bb6413f 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -55,37 +55,45 @@ object CrossrefScorable {
// Returns None if title is null, empty, or too long.
def mapToTitle(map : Map[String, Any]) : Option[String] = {
- if (map contains "title") {
- val titles = map("title").asInstanceOf[List[String]]
- if (titles.isEmpty || titles == null) {
- None
+ def getTitle : Option[String] = {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles.isEmpty || titles == null) None else Some(titles(0))
} else {
- val baseTitle: String = titles(0)
- // TODO(bnewbold): this code block is horrible
- val baseSubtitle: String = if (map contains "subtitle") {
- val subtitles = map("subtitle").asInstanceOf[List[String]]
- if (!subtitles.isEmpty && subtitles != null) {
- val sub = subtitles(0)
- if (sub != null && !sub.isEmpty && baseTitle != null) {
- sub
- } else {
- ""
- }
+ None
+ }
+ }
+
+ def getSubtitle : Option[String] = {
+ if (map contains "subtitle") {
+ val subtitles = map("subtitle").asInstanceOf[List[String]]
+ if (subtitles.isEmpty || subtitles == null) {
+ None
+ } else {
+ val sub = subtitles(0)
+ if (sub == null || sub.isEmpty) {
+ None
} else {
- ""
+ Some(sub)
}
- } else {
- ""
}
- val title = if (baseSubtitle.isEmpty) {
- baseTitle
+ } else {
+ None
+ }
+ }
+
+ getTitle match {
+ case None => None
+ case Some(baseTitle) => {
+ if (baseTitle == null) {
+ None
} else {
- baseTitle.concat(": ".concat(baseSubtitle))
+ getSubtitle match {
+ case None => Some(baseTitle)
+ case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle)))
+ }
}
- if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
}
- } else {
- None
}
}
@@ -106,37 +114,39 @@ object CrossrefScorable {
case None => None
case Some(created) => {
Some(created.asInstanceOf[Map[String,Any]]
- .get("date-parts")
- .get
- .asInstanceOf[List[Any]](0)
- .asInstanceOf[List[Any]](0)
- .asInstanceOf[Double]
- .toInt)
+ .get("date-parts")
+ .get
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[Double]
+ .toInt)
}
}
}
def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+ def makeMapFeatures(title : String, doi : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = {
+ if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
+ None
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
+ }
+ }
Scorable.jsonToMap(json) match {
case None => None
case Some(map) =>
mapToTitle(map) match {
case None => None
- case Some(title) => {
- val doi = Scorable.getString(map, "DOI")
- val authors: List[String] = mapToAuthorList(map)
- val year: Int = mapToYear(map).getOrElse(0)
- val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
- if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
- None
- } else {
- val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
- sf.toSlug match {
- case None => None
- case Some(slug) => Some(MapFeatures(slug, sf.toString))
- }
- }
- }
+ case Some(title) => makeMapFeatures(
+ title=title,
+ doi=Scorable.getString(map, "DOI"),
+ authors=mapToAuthorList(map),
+ year=mapToYear(map).getOrElse(0),
+ contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE"))
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index b56f102..be2b495 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -43,9 +43,13 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
if (slug.isEmpty
- || slug == null
- || (ScorableFeatures.SlugBlacklist contains slug)
- || (slug.length < ScorableFeatures.MinSlugLength)) None else Some(slug)
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) {
+ None
+ } else {
+ Some(slug)
+ }
}
}