about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 52
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 16
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala | 1
3 files changed, 56 insertions, 13 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5d1eaf5..ab33d03 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -19,29 +19,55 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
+ .filter { CrossrefScorable.keepRecord(_) }
.map { CrossrefScorable.jsonToMapFeatures(_) }
}
}
object CrossrefScorable {
+  // Decides whether a Crossref JSON line is kept for scoring: jsonToMap must
+  // yield a map, mapToTitle must find a title in it, and that title must be
+  // at most Scorable.MaxTitleLength characters long.
+  def keepRecord(json : String) : Boolean = {
+    val maybeTitle = for {
+      parsed <- Scorable.jsonToMap(json)
+      found <- mapToTitle(parsed)
+    } yield found
+    // The length limit is checked here explicitly as well as inside mapToTitle.
+    maybeTitle.exists(_.length <= Scorable.MaxTitleLength)
+  }
+
+  // Returns None if title is missing, null, not a list, empty, or too long.
+  // Values are pattern-matched rather than cast: typed patterns never match
+  // null, so a null "title" or null first entry yields None instead of throwing.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    map.get("title") match {
+      case Some(titles : List[_]) =>
+        // Only the first entry of the title list is considered.
+        titles.headOption match {
+          case Some(t : String) if !t.isEmpty && t.length <= Scorable.MaxTitleLength => Some(t)
+          case _ => None
+        }
+      case _ => None
+    }
+  }
+
def jsonToMapFeatures(json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
case None => MapFeatures(Scorable.NoSlug, json)
- case Some(map) => {
- if ((map contains "title") && (map contains "DOI")) {
- val titles = map("title").asInstanceOf[List[String]]
- val doi = Scorable.getString(map, "DOI")
- if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
- new MapFeatures(Scorable.NoSlug, json)
- } else {
- // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
- new MapFeatures(sf.toSlug, sf.toString)
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => MapFeatures(Scorable.NoSlug, json)
+ case Some(title) => {
+ val doi = Scorable.getString(map, "DOI")
+ if (doi.isEmpty || doi == null) {
+ MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi)
+ MapFeatures(sf.toSlug, sf.toString)
+ }
}
- } else {
- new MapFeatures(Scorable.NoSlug, json)
}
- }
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index e510f75..76f4f22 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -31,11 +31,27 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
}
// TODO: Should I combine next two stages for efficiency?
.collect { case (key, json, StatusOK) => (key, json) }
+ .filter { case (key, json) => GrobidScorable.keepRecord(json) }
.map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
}
}
object GrobidScorable {
+  // Decides whether a GROBID metadata record is kept for scoring: jsonToMap
+  // must yield a map that contains a "title" key, and the string read from
+  // that key must be non-null and at most Scorable.MaxTitleLength characters.
+  // Any other record is rejected.
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json).exists { parsed =>
+      // Test for the key first so getString only runs when "title" is present.
+      (parsed contains "title") && {
+        val candidate = Scorable.getString(parsed, "title")
+        candidate != null && candidate.length <= Scorable.MaxTitleLength
+      }
+    }
+  }
+
+
def getHBaseSource(table : String, host : String) : HBaseSource = {
HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)
}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9b9c633..c704ed9 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -30,6 +30,7 @@ abstract class Scorable {
}
object Scorable {
+ val MaxTitleLength = 255
val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable
def isValidSlug(slug : String) : Boolean = {