aboutsummaryrefslogtreecommitdiffstats
path: root/pig/filter-cdx-join-urls.pig
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-18 22:55:03 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-18 22:55:03 +0200
commit 0e8b18a2cc200a75fa69c808404e298d6ea0d154 (patch)
tree 72c15be56a8be863f19344ec286f0eb2eb712bee /pig/filter-cdx-join-urls.pig
parent bb0e9b312f8248e882f8650897966ff57117aa17 (diff)
download refcat-0e8b18a2cc200a75fa69c808404e298d6ea0d154.tar.gz
refcat-0e8b18a2cc200a75fa69c808404e298d6ea0d154.zip
pig test run resulted in 2k matches (pdf only)
Diffstat (limited to 'pig/filter-cdx-join-urls.pig')
-rw-r--r-- pig/filter-cdx-join-urls.pig 5
1 files changed, 4 insertions, 1 deletions
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index 3d06804..3c7a942 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -2,6 +2,9 @@
--
-- Author: Bryan Newbold <bnewbold@archive.org>
-- Date: May 2018
+--
+-- Edited: Martin Czygan <martin@archive.org>
+-- Date: May 2021
%default INPUT_CDX ''
%default INPUT_URLS ''
@@ -26,7 +29,7 @@ cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
cdx = FILTER cdx BY not cdx_surt matches '-';
cdx = FILTER cdx BY httpstatus matches '200';
-cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+-- cdx = FILTER cdx BY mimetype matches '.*pdf.*';
-- Core JOIN
full_join = JOIN cdx BY cdx_surt, surts BY url_surt;