diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 06:21:29 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 06:21:29 +0000 |
commit | e566ee1b4e134bfc06284cf77d8d1370df30d53f (patch) | |
tree | f3969054cc5f93608b5c72d41541ea381ef89a6b /pig/filter-cdx-paper-pdfs.pig | |
parent | 0c398392aa298d28694bf5bd37d3e4912de8a2f5 (diff) | |
parent | 65b7d45852af3de557eaaf200471ff9b1a211970 (diff) | |
download | sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.tar.gz sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.zip |
Merge branch 'master' of git.archive.org:webgroup/sandcrawler
Diffstat (limited to 'pig/filter-cdx-paper-pdfs.pig')
-rw-r--r-- | pig/filter-cdx-paper-pdfs.pig | 41 |
1 files changed, 41 insertions, 0 deletions
-- pig/filter-cdx-paper-pdfs.pig
--
-- Narrows a large CDX file down to the records that are likely to be
-- journal article content. Selection is done with regular expressions
-- applied to the SURT form of each captured URL.
--
-- Parameters (both default to the empty string):
--   INPUT   location of the CDX data to read, one record per line
--   OUTPUT  location where the matching CDX lines are written
--
-- Author: Bryan Newbold <bnewbold@archive.org>
-- Date: May 2018

%default INPUT ''
%default OUTPUT ''

set mapreduce.job.queuename default

-- Each input line is read whole, as a single chararray field.
raw_lines = LOAD '$INPUT' AS cdxline:chararray;

-- Discard lines that begin with filedesc or with a space
-- (presumably header and file-descriptor records, not page captures).
record_lines = FILTER raw_lines BY
    (not STARTSWITH (cdxline, 'filedesc'))
    AND (not STARTSWITH (cdxline, ' '));

-- Break every line into whitespace-separated columns, keeping the
-- untouched line alongside so it can be emitted verbatim at the end.
split_rows = FOREACH record_lines GENERATE
    STRSPLIT(cdxline,'\\s+') as cols,
    cdxline;

-- Name the columns the filters below depend on.
fields = FOREACH split_rows GENERATE
    (chararray)cols.$0 as surt,
    (chararray)cols.$1 as timestamp,
    (chararray)cols.$3 as mimetype,
    (chararray)cols.$4 as httpstatus,
    cdxline;

-- Keep rows that have a real SURT (not a lone dash), an HTTP status of
-- 200, and a mimetype containing pdf.
pdf_rows = FILTER fields BY
    (not surt matches '-')
    AND (httpstatus matches '200')
    AND (mimetype matches '.*pdf.*');

-- This is the core regex. A row survives if its SURT satisfies at least
-- one of the four patterns:
--   1. academic domains with personal (tilde) directories
--   2. telltale words as a path component of the URL
--   3. telltale words in the domain part
--   4. a DOI-like pattern in the URL
paper_rows = FILTER pdf_rows BY
    (surt matches '(edu,|..,edu|..,ac,).*\\).*\\/~.*')
    OR (surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*')
    OR (surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*')
    OR (surt matches '.*\\).*/10\\.\\d{3,5}/.*');

-- Sort by SURT and then timestamp, and write out only the original lines.
sorted_rows = ORDER paper_rows by surt, timestamp PARALLEL 50;
out_lines = FOREACH sorted_rows GENERATE cdxline;
STORE out_lines INTO '$OUTPUT' USING PigStorage(' ');