aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
authorEllen Spertus <ellen.spertus@gmail.com>2018-08-28 08:45:04 -0700
committerEllen Spertus <ellen.spertus@gmail.com>2018-09-04 14:59:29 -0700
commit2c9172a3797dd218b5648c5390bf2dfb39d8e3a3 (patch)
tree6bace5b3b3327b864bab04bba489f1dddd3c5365 /scalding
parent3eb3e38daf4ad58b6da88d7abda222018e4a1ab5 (diff)
downloadsandcrawler-2c9172a3797dd218b5648c5390bf2dfb39d8e3a3.tar.gz
sandcrawler-2c9172a3797dd218b5648c5390bf2dfb39d8e3a3.zip
restored code I inadvertently removed when merging
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala 5
1 files changed, 4 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index d40410b..3146a6c 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -41,7 +41,10 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
.filterNot { entry => entry.isEmpty }
- .map { entry => entry.get }
+ .map { entry => {
+ validGrobidRows.inc
+ entry.get
+ }}
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple