From 71be2e685848a31888811e2e398e769f7e0486c2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 8 Aug 2018 12:14:16 -0700 Subject: row-count: require f:c, not file:size I tried using the empty List() and got a test failure, so it seems like we do need to specify *some* field here. file:size gets populated by the extraction job, not the backfill job, so I had been miscounting table sizes (counting only the number of GROBID extracted items, not rows in the table). TODO: count on key or no column, not f:c --- scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala index 4c3de33..5c7954a 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala @@ -30,7 +30,7 @@ object HBaseRowCountJob { HBaseBuilder.build( hbaseTable, zookeeperHosts, - List("file:size"), + List("f:c"), SourceMode.SCAN_ALL) } } -- cgit v1.2.3