aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:05:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:05:59 -0700
commit 70350899dda973cdf7a5cfdd941ae80319254587 (patch)
tree 19e444d4037da3124ed9c09ffeb5e8ac1ff6769a /scalding/src/main
parent c3c2760fb388059a9942a61965b79c42bc03f11b (diff)
download sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.tar.gz
sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.zip
handle null status_code lines
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 1
1 files changed, 1 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index f484fad..9a09e05 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -25,6 +25,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
.read
// Can't just "fromBytesWritable" because we have multiple types?
.toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "tei_json", "status_code"))
+ .filter { case (_, tei_json, status_code) => tei_json != null && status_code != null }
.map { case (key, tei_json, status_code) =>
(Bytes.toString(key.copyBytes()), Bytes.toString(tei_json.copyBytes()), Bytes.toLong(status_code.copyBytes()))
}