From 18a55d37a87d4391bd8161201c523dd7d7f0f1e7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 8 May 2018 10:06:14 -0700 Subject: fix tests post-DISTINCT Confirms it's working! --- TODO | 5 +++++ pig/tests/files/papers_domain_words.cdx | 8 ++++---- pig/tests/files/papers_edu_tilde.cdx | 14 +++++++------- pig/tests/files/papers_url_doi.cdx | 4 ++-- pig/tests/files/papers_url_words.cdx | 24 ++++++++++++------------ 5 files changed, 30 insertions(+), 25 deletions(-) diff --git a/TODO b/TODO index 57c827f..5e9220b 100644 --- a/TODO +++ b/TODO @@ -1,4 +1,9 @@ +pig: +- potentially want to *not* de-dupe CDX lines by uniq sha1 in all cases; run + this as a second-stage filter? for example, may want many URL links in fatcat + for a single file (different links, different policies) + - include input file name (and chunk? and CDX?) in sentry context - play with test image on older releases (eg, trusty) diff --git a/pig/tests/files/papers_domain_words.cdx b/pig/tests/files/papers_domain_words.cdx index 48e2313..02b5eaa 100644 --- a/pig/tests/files/papers_domain_words.cdx +++ b/pig/tests/files/papers_domain_words.cdx @@ -5,7 +5,7 @@ # should match 4: -edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz +org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 QQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_edu_tilde.cdx b/pig/tests/files/papers_edu_tilde.cdx index 47ca069..f43a11a 100644 --- a/pig/tests/files/papers_edu_tilde.cdx +++ b/pig/tests/files/papers_edu_tilde.cdx @@ -4,12 +4,12 @@ #http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf # should be 6 matches: -hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 LQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 XQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 PQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz # NOT: -com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_url_doi.cdx b/pig/tests/files/papers_url_doi.cdx index 1ad5792..ee90fb1 100644 --- a/pig/tests/files/papers_url_doi.cdx +++ b/pig/tests/files/papers_url_doi.cdx @@ -3,5 +3,5 @@ # should match 2: -org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_url_words.cdx b/pig/tests/files/papers_url_words.cdx index e9bf661..8d4fe01 100644 --- a/pig/tests/files/papers_url_words.cdx +++ b/pig/tests/files/papers_url_words.cdx @@ -13,15 +13,15 @@ # 12 matches: -uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 1QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 2QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 5QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 HQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 DQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 SQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -- cgit v1.2.3