From 70350899dda973cdf7a5cfdd941ae80319254587 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 15 Aug 2018 22:05:59 -0700
Subject: handle null status_code lines

---
 scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 1 +
 1 file changed, 1 insertion(+)

(limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala')

diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index f484fad..9a09e05 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -25,6 +25,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       .read
       // Can't just "fromBytesWritable" because we have multiple types?
       .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "tei_json", "status_code"))
+      .filter { case (_, tei_json, status_code) => tei_json != null && status_code != null }
       .map { case (key, tei_json, status_code) =>
         (Bytes.toString(key.copyBytes()), Bytes.toString(tei_json.copyBytes()), Bytes.toLong(status_code.copyBytes()))
       }
-- 
cgit v1.2.3