From 0e8b18a2cc200a75fa69c808404e298d6ea0d154 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 18 May 2021 22:55:03 +0200 Subject: pig test run resulted in 2k matches (pdf only) --- pig/README.md | 25 +++++++++++++++++++++++++ pig/filter-cdx-join-urls.pig | 5 ++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'pig') diff --git a/pig/README.md b/pig/README.md index 5281169..c186e8e 100644 --- a/pig/README.md +++ b/pig/README.md @@ -30,6 +30,31 @@ $ pig -p INPUT_CDX=/user/wmdata2/cdx-all-index/20210422171221/part-a-00031.gz -p * http://ia802401.us.archive.org:6988/cluster/app/application_1611217683160_298042 * http://ia802401.us.archive.org:6988/proxy/application_1611217683160_298042/ +Running against a 1/300 block of the global CDX took about 15 hours (wall-clock). + +``` +2021-05-18 09:15:24,941 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server +2021-05-18 09:15:30,959 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success! + +real 882m52.192s +user 44m25.048s +sys 5m51.749s +``` + +How many links were matched, and how many of them still resolve on the live web? 
+ +``` +$ gohdfs cat /user/martin/fatcat-refs-lookup-0/part-r-00000 | awk '{ print $3 }' > refs_links_testrun.tsv +$ time cat refs_links_testrun.tsv | clinker -w 128 -verbose > refs_links_liveweb.json +$ wc -l refs_links_liveweb.json +2623 refs_links_liveweb.json +$ jq -rc .status refs_links_liveweb.json | sort | uniq -c | sort -nr 2> /dev/null + 2252 200 + 266 403 + 154 404 + 10 null +``` + ---- diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig index 3d06804..3c7a942 100644 --- a/pig/filter-cdx-join-urls.pig +++ b/pig/filter-cdx-join-urls.pig @@ -2,6 +2,9 @@ -- -- Author: Bryan Newbold -- Date: May 2018 +-- +-- Edited: Martin Czygan +-- Date: May 2021 %default INPUT_CDX '' %default INPUT_URLS '' @@ -26,7 +29,7 @@ cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; cdx = FILTER cdx BY not cdx_surt matches '-'; cdx = FILTER cdx BY httpstatus matches '200'; -cdx = FILTER cdx BY mimetype matches '.*pdf.*'; +-- cdx = FILTER cdx BY mimetype matches '.*pdf.*'; -- Core JOIN full_join = JOIN cdx BY cdx_surt, surts BY url_surt; -- cgit v1.2.3