diff options

field | value | date |
---|---|---|
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 06:21:29 +0000 |
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 06:21:29 +0000 |
commit | e566ee1b4e134bfc06284cf77d8d1370df30d53f (patch) | |
tree | f3969054cc5f93608b5c72d41541ea381ef89a6b | |
parent | 0c398392aa298d28694bf5bd37d3e4912de8a2f5 (diff) | |
parent | 65b7d45852af3de557eaaf200471ff9b1a211970 (diff) | |
download | sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.tar.gz sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.zip |
Merge branch 'master' of git.archive.org:webgroup/sandcrawler
-rw-r--r-- | TODO | 1 | ||||
-rw-r--r-- | pig/filter-cdx-paper-pdfs.pig | 41 | ||||
-rw-r--r-- | pig/tests/files/papers_domain_words.cdx | 11 | ||||
-rw-r--r-- | pig/tests/files/papers_edu_tilde.cdx | 15 | ||||
-rw-r--r-- | pig/tests/files/papers_url_doi.cdx | 7 | ||||
-rw-r--r-- | pig/tests/files/papers_url_words.cdx | 27 | ||||
-rw-r--r-- | pig/tests/pighelper.py | 14 | ||||
-rw-r--r-- | pig/tests/test_filter_cdx_paper_pdfs.py | 26 |
8 files changed, 139 insertions, 3 deletions
@@ -1,4 +1,5 @@ +- include input file name (and chunk? and CDX?) in sentry context - play with test image on older releases (eg, trusty) - how to get argument (like --hbase-table) into mrjob.conf, or similar? diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig new file mode 100644 index 0000000..6559066 --- /dev/null +++ b/pig/filter-cdx-paper-pdfs.pig @@ -0,0 +1,41 @@ + +-- Tries to filter down a large CDX file to a subset that is likely to be +-- journal article content, based on SURT regex patterns. +--- +-- Author: Bryan Newbold <bnewbold@archive.org> +-- Date: May 2018 + + +%default INPUT '' +%default OUTPUT '' + +set mapreduce.job.queuename default + +cdx = LOAD '$INPUT' AS cdxline:chararray; +cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); +cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); + +cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; +cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; +cdx = FILTER cdx BY not surt matches '-'; +cdx = FILTER cdx BY httpstatus matches '200'; +cdx = FILTER cdx BY mimetype matches '.*pdf.*'; + +-- This is the core regex +cdx = FILTER cdx + -- academic domains; personal (tilde) directories + BY surt matches '(edu,|..,edu|..,ac,).*\\).*\\/~.*' + + -- words in URL + OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*' + + -- words in domains + OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*' + + -- DOI-like pattern in URL + OR surt matches '.*\\).*/10\\.\\d{3,5}/.*'; + +cdx = ORDER cdx by surt, timestamp PARALLEL 50; +cdx = FOREACH cdx GENERATE cdxline; +STORE cdx INTO '$OUTPUT' USING PigStorage(' '); + diff --git a/pig/tests/files/papers_domain_words.cdx b/pig/tests/files/papers_domain_words.cdx new file mode 100644 index 0000000..48e2313 --- /dev/null +++ 
b/pig/tests/files/papers_domain_words.cdx @@ -0,0 +1,11 @@ +#http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf +#http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf +#http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf +#http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf + +# should match 4: + +edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_edu_tilde.cdx b/pig/tests/files/papers_edu_tilde.cdx new file mode 100644 index 0000000..47ca069 --- /dev/null +++ b/pig/tests/files/papers_edu_tilde.cdx @@ -0,0 +1,15 @@ +#http://www.stanford.edu:80/~johntayl/Papers/taylor2.pdf +#http://met.nps.edu/~mtmontgo/papers/isabel_part2.pdf +#http://www.pitt.edu:80/~druzdzel/psfiles/ecai06.pdf +#http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf + +# should be 6 matches: +hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 
456 CRAWL/CRAWL.warc.gz +edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz + +# NOT: +com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_url_doi.cdx b/pig/tests/files/papers_url_doi.cdx new file mode 100644 index 0000000..1ad5792 --- /dev/null +++ b/pig/tests/files/papers_url_doi.cdx @@ -0,0 +1,7 @@ +#http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1 +#http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607 + +# should match 2: + +org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/files/papers_url_words.cdx b/pig/tests/files/papers_url_words.cdx new file mode 100644 index 0000000..e9bf661 --- /dev/null +++ b/pig/tests/files/papers_url_words.cdx @@ -0,0 +1,27 @@ +#http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf +#http://files.eric.ed.gov/fulltext/EJ798626.pdf +#http://www.hbs.edu/research/pdf/10-108.pdf 
+#http://www.unifr.ch/biochem/assets/files/albrecht/publications/Abraham06.pdf +#http://www.cnbc.cmu.edu/cns/papers/Kassetal2005.pdf +#http://www.macrothink.org/journal/index.php/ijhrs/article/download/5765/4663 +#http://www.pims.math.ca:80/science/2004/fpsac/Papers/Liskovets.pdf +#http://www.risc.uni-linz.ac.at/publications/download/risc_3287/synasc_revised.pdf +#http://softsys.cs.uoi.gr/dbglobe/publications/wi04.pdf +#http://lexikos.journals.ac.za/pub/article/download/1048/564 +#http://www.siam.org/proceedings/analco/2007/anl07_029ecesaratto.pdf +#http://www.cs.bris.ac.uk/Publications/Papers/2000249.pdf + +# 12 matches: + +uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 
20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py index d0d89d2..4aa4259 100644 --- a/pig/tests/pighelper.py +++ b/pig/tests/pighelper.py @@ -36,7 +36,8 @@ class PigTestHelper(unittest.TestCase): self._tmpdir = tempfile.mkdtemp() def tearDown(self): - shutil.rmtree(self._tmpdir) + pass + # XXX: shutil.rmtree(self._tmpdir) def run_pig_raw(self, params): """Low-level variant with params appended directly. 
Returns @@ -67,7 +68,14 @@ class PigTestHelper(unittest.TestCase): '-p', 'INPUT={}'.format(in_file), '-p', 'OUTPUT={}'.format(out_file), ] + pargs - self.run_pig_raw(params) - return out_file + status = self.run_pig_raw(params) + assert status.returncode == 0 + # Capture all the part-r-* files + print("out_file: {}".format(out_file)) + subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True) + sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True) + out = sub.stdout.decode('utf-8') + print(out) + return out # TODO: helper to verify that output matches an expected file diff --git a/pig/tests/test_filter_cdx_paper_pdfs.py b/pig/tests/test_filter_cdx_paper_pdfs.py new file mode 100644 index 0000000..a8ebd9f --- /dev/null +++ b/pig/tests/test_filter_cdx_paper_pdfs.py @@ -0,0 +1,26 @@ + +import os +import unittest +from pighelper import PigTestHelper + +def count_lines(s): + return len([l for l in s.strip().split('\n') if len(l) > 0]) + +class TestFilterCDXPaperPdfs(PigTestHelper): + + def test_papers_domain_words(self): + r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_domain_words.cdx") + assert count_lines(r) == 4 + + def test_papers_edu_tilde(self): + r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_edu_tilde.cdx") + assert count_lines(r) == 6 + + def test_papers_url_doi(self): + r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_doi.cdx") + assert count_lines(r) == 2 + + def test_papers_url_words(self): + r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_words.cdx") + assert count_lines(r) == 12 + |