Merge branch 'master' of git.archive.org:webgroup/sandcrawler

author: Bryan Newbold <bnewbold@archive.org> 2018-05-08 06:21:29 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-05-08 06:21:29 +0000
commit: e566ee1b4e134bfc06284cf77d8d1370df30d53f (patch)
tree: f3969054cc5f93608b5c72d41541ea381ef89a6b /pig/filter-cdx-paper-pdfs.pig
parent: 0c398392aa298d28694bf5bd37d3e4912de8a2f5 (diff)
parent: 65b7d45852af3de557eaaf200471ff9b1a211970 (diff)
download: sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.tar.gz
sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.zip
1 files changed, 41 insertions, 0 deletions
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
new file mode 100644
index 0000000..6559066
--- /dev/null
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -0,0 +1,41 @@
+
+-- Tries to filter down a large CDX file to a subset that is likely to be
+-- journal article content, based on SURT regex patterns.
+---
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+-- Script parameters, each falls back to an empty string when not supplied.
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;  -- default loader, one chararray field named cdxline
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');  -- drop lines beginning with filedesc
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');  -- drop lines beginning with a space (presumably the CDX header, confirm)
+-- Split on runs of whitespace, keep columns 0, 1, 3 and 4 plus the untouched line for the final output.
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not surt matches '-';  -- matches tests the whole string, so this drops a surt that is exactly a dash
+cdx = FILTER cdx BY httpstatus matches '200';  -- keep only rows whose status column is exactly 200
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';  -- keep rows whose mimetype contains pdf (case-sensitive)
+
+-- Core selection: a row is kept when its surt matches ANY of the four patterns below.
+cdx = FILTER cdx
+        -- academic domains: surt starts with edu, or two characters then ,edu or ,ac, and has /~ (tilde directory) after a closing paren
+        BY surt matches '(edu,|..,edu|..,ac,).*\\).*\\/~.*'
+
+        -- words in URL: one of these words as a complete /segment/ after a closing paren, case-insensitive
+        OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'
+
+        -- words in domains: one of these substrings somewhere before a closing paren (presumably the host part of the surt)
+        OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+
+        -- DOI-like pattern in URL: /10. then 3 to 5 digits then a slash, after a closing paren
+        OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
+-- Sort by surt then timestamp, then project back to the raw line so the output has the same line format as the input.
+cdx = ORDER cdx by surt, timestamp PARALLEL 50;
+cdx = FOREACH cdx GENERATE cdxline;
+STORE cdx INTO '$OUTPUT' USING PigStorage(' ');  -- single field per row, so the space delimiter is never written between fields
+
author	Bryan Newbold <bnewbold@archive.org>	2018-05-08 06:21:29 +0000
committer	Bryan Newbold <bnewbold@archive.org>	2018-05-08 06:21:29 +0000
commit	e566ee1b4e134bfc06284cf77d8d1370df30d53f (patch)
tree	f3969054cc5f93608b5c72d41541ea381ef89a6b /pig/filter-cdx-paper-pdfs.pig
parent	0c398392aa298d28694bf5bd37d3e4912de8a2f5 (diff)
parent	65b7d45852af3de557eaaf200471ff9b1a211970 (diff)
download	sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.tar.gz sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.zip