aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-05-08 16:58:24 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-05-08 16:58:24 +0000
commit: 1831a3b4495aee275e4b4b187fa545eba75eb87b (patch)
tree: 8101a57cec0babcb29b496ecd152b100a4ddba97
parent: 64503ce8fb755384623821bfabfa81bbb37d8f6e (diff)
download: sandcrawler-1831a3b4495aee275e4b4b187fa545eba75eb87b.tar.gz
download: sandcrawler-1831a3b4495aee275e4b4b187fa545eba75eb87b.zip
distinct on SHA1 in cdx scripts
-rw-r--r-- pig/filter-cdx-join-urls.pig 10
-rw-r--r-- pig/filter-cdx-paper-pdfs.pig 14
2 files changed, 18 insertions, 6 deletions
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index 70858b9..3d06804 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -23,7 +23,7 @@ cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
-cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
cdx = FILTER cdx BY not cdx_surt matches '-';
cdx = FILTER cdx BY httpstatus matches '200';
cdx = FILTER cdx BY mimetype matches '.*pdf.*';
@@ -31,7 +31,13 @@ cdx = FILTER cdx BY mimetype matches '.*pdf.*';
-- Core JOIN
full_join = JOIN cdx BY cdx_surt, surts BY url_surt;
-result = FOREACH full_join GENERATE cdxline;
+-- DISTINCT by sha1 column: group the joined rows by sha1sum and keep one row per group; $1 is the bag of grouped rows and TOP(1, 0, $1) picks a single tuple ranked on field 0 (presumably cdx_surt -- confirm)
+full_uniq = FOREACH (GROUP full_join BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+result = FOREACH full_uniq GENERATE cdxline;
result = DISTINCT result;
STORE result INTO '$OUTPUT' USING PigStorage();
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 6559066..7e10720 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -16,7 +16,7 @@ cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
-cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
cdx = FILTER cdx BY not surt matches '-';
cdx = FILTER cdx BY httpstatus matches '200';
cdx = FILTER cdx BY mimetype matches '.*pdf.*';
@@ -35,7 +35,13 @@ cdx = FILTER cdx
-- DOI-like pattern in URL
OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
-cdx = ORDER cdx by surt, timestamp PARALLEL 50;
-cdx = FOREACH cdx GENERATE cdxline;
-STORE cdx INTO '$OUTPUT' USING PigStorage(' ');
+-- DISTINCT by sha1 column: group the filtered rows by sha1sum and keep one row per group; $1 is the bag of grouped rows and TOP(1, 0, $1) picks a single tuple ranked on field 0 (presumably surt -- confirm)
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');