aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-04-11 20:14:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-04-12 14:19:29 -0700
commit: 8ac10ab7fe310df55ab5a66d741ea25c24389418 (patch)
tree: db4ae61bef6d3c4da47af242b26c65b0657c3729
parent: a2306137c09e4f9e2af70ee97f0cb6c7e0f8134e (diff)
download: sandcrawler-8ac10ab7fe310df55ab5a66d741ea25c24389418.tar.gz
          sandcrawler-8ac10ab7fe310df55ab5a66d741ea25c24389418.zip
add ojs and dspace as in-domain patterns to look for in heuristic CDX PDF filter
-rw-r--r-- pig/filter-cdx-paper-pdfs.pig | 2
1 file changed, 1 insertion, 1 deletion
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 7e10720..402d340 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -30,7 +30,7 @@ cdx = FILTER cdx
OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'
-- words in domains
- OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+ OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*'
-- DOI-like pattern in URL
OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';