From 028a0c27a832833e8833e3b3d0e1d6725a48e953 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 27 Dec 2019 17:21:36 -0800
Subject: small (syntax?) changes to pig join script

---
 pig/join-cdx-sha1.pig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'pig')

diff --git a/pig/join-cdx-sha1.pig b/pig/join-cdx-sha1.pig
index 460f8b0..86b9bb6 100644
--- a/pig/join-cdx-sha1.pig
+++ b/pig/join-cdx-sha1.pig
@@ -16,7 +16,7 @@ set mapreduce.job.queuename default
-digests = LOAD '$INPUT_DIGEST' USING PigStorage() AS sha1b32:chararray;
+digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray;
 digests = ORDER digests by sha1b32 ASC PARALLEL 20;
 digests = DISTINCT digests;
@@ -29,7 +29,7 @@ cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as
 cdx = FILTER cdx BY not cdx_surt matches '-';
 cdx = FILTER cdx BY httpstatus matches '200';
 cdx = FILTER cdx BY not mimetype matches 'warc/revisit';
-cdx = ORDER cdx by sha1b32 ASC PARALLEL 40;
+cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40;
 -- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency
-- 
cgit v1.2.3