about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala 5
1 files changed, 4 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index d40410b..3146a6c 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -41,7 +41,10 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
.filterNot { entry => entry.isEmpty }
- .map { entry => entry.get }
+ .map { entry => {
+ validGrobidRows.inc
+ entry.get
+ }}
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple