aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main
diff options
context:
space:
mode:
authorEllen Spertus <ellen.spertus@gmail.com>2018-08-28 08:45:04 -0700
committerEllen Spertus <ellen.spertus@gmail.com>2018-08-28 08:45:04 -0700
commit98eebb1d9abc954c0fe1735bf5472c57e71dc7a8 (patch)
tree344ec1bf43a945b8bd97e97fc5c1d9ef2b4a14df /scalding/src/main
parent93c866293ccca23131224b47e868c632ad2de032 (diff)
downloadsandcrawler-98eebb1d9abc954c0fe1735bf5472c57e71dc7a8.tar.gz
sandcrawler-98eebb1d9abc954c0fe1735bf5472c57e71dc7a8.zip
restored code I inadvertently removed when merging
Diffstat (limited to 'scalding/src/main')
-rw-r--r--scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala5
1 files changed, 4 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index d40410b..3146a6c 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -41,7 +41,10 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
.filterNot { entry => entry.isEmpty }
- .map { entry => entry.get }
+ .map { entry => {
+ validGrobidRows.inc
+ entry.get
+ }}
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple