diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-18 22:55:03 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-18 22:55:03 +0200 |
commit | 0e8b18a2cc200a75fa69c808404e298d6ea0d154 (patch) | |
tree | 72c15be56a8be863f19344ec286f0eb2eb712bee /pig/filter-cdx-join-urls.pig | |
parent | bb0e9b312f8248e882f8650897966ff57117aa17 (diff) | |
download | refcat-0e8b18a2cc200a75fa69c808404e298d6ea0d154.tar.gz refcat-0e8b18a2cc200a75fa69c808404e298d6ea0d154.zip |
pig test run resulted in 2k matches (pdf only)
Diffstat (limited to 'pig/filter-cdx-join-urls.pig')
-rw-r--r-- | pig/filter-cdx-join-urls.pig | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index 3d06804..3c7a942 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -2,6 +2,9 @@
 --
 -- Author: Bryan Newbold <bnewbold@archive.org>
 -- Date: May 2018
+--
+-- Edited: Martin Czygan <martin@archive.org>
+-- Date: May 2021
 
 %default INPUT_CDX ''
 %default INPUT_URLS ''
@@ -26,7 +29,7 @@ cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
 cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
 cdx = FILTER cdx BY not cdx_surt matches '-';
 cdx = FILTER cdx BY httpstatus matches '200';
-cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+-- cdx = FILTER cdx BY mimetype matches '.*pdf.*';
 
 -- Core JOIN
 full_join = JOIN cdx BY cdx_surt, surts BY url_surt;
```