about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-27 18:05:21 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-09-04 14:59:29 -0700
commit: 98f78c0ef17436f87991169b4a7bedadf602527a (patch)
tree: 219a43578fbbdc1ef5aacda8fdca1d8ccf55131f /scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
parent: 7df420014512d6e34ca5ed7db9b7690fbdf5e44b (diff)
download: sandcrawler-98f78c0ef17436f87991169b4a7bedadf602527a.tar.gz
sandcrawler-98f78c0ef17436f87991169b4a7bedadf602527a.zip
replaced NoSlug with proper use of Option
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala 7
1 files changed, 2 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index 19b257f..d40410b 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -40,11 +40,8 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
parsedGrobidRows.inc
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
- .filter { entry => Scorable.isValidSlug(entry.slug) }
- .map { entry =>
- validGrobidRows.inc
- entry
- }
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => entry.get }
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple