about summary refs log tree commit diff stats
path: root/pig
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-12-27 17:21:36 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit 028a0c27a832833e8833e3b3d0e1d6725a48e953 (patch)
tree b9c4b0691ae9e9f7012276820c2b3970104005df /pig
parent acea0838caa93f194caa380a6211bf57cc8fc5bf (diff)
download sandcrawler-028a0c27a832833e8833e3b3d0e1d6725a48e953.tar.gz
sandcrawler-028a0c27a832833e8833e3b3d0e1d6725a48e953.zip
small (syntax?) changes to pig join script
Diffstat (limited to 'pig')
-rw-r--r-- pig/join-cdx-sha1.pig 4
1 files changed, 2 insertions, 2 deletions
diff --git a/pig/join-cdx-sha1.pig b/pig/join-cdx-sha1.pig
index 460f8b0..86b9bb6 100644
--- a/pig/join-cdx-sha1.pig
+++ b/pig/join-cdx-sha1.pig
@@ -16,7 +16,7 @@
set mapreduce.job.queuename default
-digests = LOAD '$INPUT_DIGEST' USING PigStorage() AS sha1b32:chararray;
+digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray;
digests = ORDER digests by sha1b32 ASC PARALLEL 20;
digests = DISTINCT digests;
@@ -29,7 +29,7 @@ cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as
cdx = FILTER cdx BY not cdx_surt matches '-';
cdx = FILTER cdx BY httpstatus matches '200';
cdx = FILTER cdx BY not mimetype matches 'warc/revisit';
-cdx = ORDER cdx by sha1b32 ASC PARALLEL 40;
+cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40;
-- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency