diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 16:58:24 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 16:58:24 +0000 |
commit | 1831a3b4495aee275e4b4b187fa545eba75eb87b (patch) | |
tree | 8101a57cec0babcb29b496ecd152b100a4ddba97 | |
parent | 64503ce8fb755384623821bfabfa81bbb37d8f6e (diff) | |
download | sandcrawler-1831a3b4495aee275e4b4b187fa545eba75eb87b.tar.gz sandcrawler-1831a3b4495aee275e4b4b187fa545eba75eb87b.zip |
distinct on SHA1 in cdx scripts
-rw-r--r-- | pig/filter-cdx-join-urls.pig | 10 | ||||
-rw-r--r-- | pig/filter-cdx-paper-pdfs.pig | 14 |
2 files changed, 18 insertions, 6 deletions
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index 70858b9..3d06804 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -23,7 +23,7 @@ cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
 cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
 cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
-cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
 cdx = FILTER cdx BY not cdx_surt matches '-';
 cdx = FILTER cdx BY httpstatus matches '200';
 cdx = FILTER cdx BY mimetype matches '.*pdf.*';
@@ -31,7 +31,13 @@ cdx = FILTER cdx BY mimetype matches '.*pdf.*';
 -- Core JOIN
 full_join = JOIN cdx BY cdx_surt, surts BY url_surt;
-result = FOREACH full_join GENERATE cdxline;
+-- DISTINCT by sha1 column
+full_uniq = FOREACH (GROUP full_join BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+result = FOREACH full_uniq GENERATE cdxline;
 result = DISTINCT result;
 STORE result INTO '$OUTPUT' USING PigStorage();
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 6559066..7e10720 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -16,7 +16,7 @@ cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
 cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
 cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
-cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
 cdx = FILTER cdx BY not surt matches '-';
 cdx = FILTER cdx BY httpstatus matches '200';
 cdx = FILTER cdx BY mimetype matches '.*pdf.*';
@@ -35,7 +35,13 @@ cdx = FILTER cdx
 -- DOI-like pattern in URL
 OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
-cdx = ORDER cdx by surt, timestamp PARALLEL 50;
-cdx = FOREACH cdx GENERATE cdxline;
-STORE cdx INTO '$OUTPUT' USING PigStorage(' ');
+-- DISTINCT by sha1 column
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');