aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-08 10:06:14 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-05-08 10:06:20 -0700
commit 18a55d37a87d4391bd8161201c523dd7d7f0f1e7 (patch)
tree 86db4c84cf4fd0dde5ea9508617344018e640104
parent 1831a3b4495aee275e4b4b187fa545eba75eb87b (diff)
download sandcrawler-18a55d37a87d4391bd8161201c523dd7d7f0f1e7.tar.gz
sandcrawler-18a55d37a87d4391bd8161201c523dd7d7f0f1e7.zip
fix tests post-DISTINCT
Confirms it's working!
-rw-r--r-- TODO 5
-rw-r--r-- pig/tests/files/papers_domain_words.cdx 8
-rw-r--r-- pig/tests/files/papers_edu_tilde.cdx 14
-rw-r--r-- pig/tests/files/papers_url_doi.cdx 4
-rw-r--r-- pig/tests/files/papers_url_words.cdx 24
5 files changed, 30 insertions, 25 deletions
diff --git a/TODO b/TODO
index 57c827f..5e9220b 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,9 @@
+pig:
+- potentially want to *not* de-dupe CDX lines by uniq sha1 in all cases; run
+ this as a second-stage filter? for example, may want many URL links in fatcat
+ for a single file (different links, different policies)
+
- include input file name (and chunk? and CDX?) in sentry context
- play with test image on older releases (eg, trusty)
diff --git a/pig/tests/files/papers_domain_words.cdx b/pig/tests/files/papers_domain_words.cdx
index 48e2313..02b5eaa 100644
--- a/pig/tests/files/papers_domain_words.cdx
+++ b/pig/tests/files/papers_domain_words.cdx
@@ -5,7 +5,7 @@
# should match 4:
-edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz
+org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 QQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_edu_tilde.cdx b/pig/tests/files/papers_edu_tilde.cdx
index 47ca069..f43a11a 100644
--- a/pig/tests/files/papers_edu_tilde.cdx
+++ b/pig/tests/files/papers_edu_tilde.cdx
@@ -4,12 +4,12 @@
#http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf
# should be 6 matches:
-hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 LQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 XQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 PQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
# NOT:
-com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_url_doi.cdx b/pig/tests/files/papers_url_doi.cdx
index 1ad5792..ee90fb1 100644
--- a/pig/tests/files/papers_url_doi.cdx
+++ b/pig/tests/files/papers_url_doi.cdx
@@ -3,5 +3,5 @@
# should match 2:
-org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_url_words.cdx b/pig/tests/files/papers_url_words.cdx
index e9bf661..8d4fe01 100644
--- a/pig/tests/files/papers_url_words.cdx
+++ b/pig/tests/files/papers_url_words.cdx
@@ -13,15 +13,15 @@
# 12 matches:
-uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
-uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 1QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 2QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 3QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 4QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 5QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 6QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 7QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 8QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 9QHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 HQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 DQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 SQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz