about summary refs log tree commit diff stats
path: root/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala 7
1 files changed, 2 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index 19b257f..d40410b 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -40,11 +40,8 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
parsedGrobidRows.inc
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
- .filter { entry => Scorable.isValidSlug(entry.slug) }
- .map { entry =>
- validGrobidRows.inc
- entry
- }
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => entry.get }
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple