From 8ac10ab7fe310df55ab5a66d741ea25c24389418 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 11 Apr 2019 20:14:22 -0700
Subject: add ojs and dspace as in-domain patterns to look for in heuristic CDX PDF filter

---
 pig/filter-cdx-paper-pdfs.pig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pig')

diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 7e10720..402d340 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -30,7 +30,7 @@ cdx = FILTER cdx
     OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'

     -- words in domains
-    OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+    OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*'

     -- DOI-like pattern in URL
     OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
--
cgit v1.2.3