diff options
Diffstat (limited to 'pig/filter-cdx-paper-pdfs.pig')
-rw-r--r-- | pig/filter-cdx-paper-pdfs.pig | 2 |
1 file changed, 1 insertion, 1 deletion
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 7e10720..402d340 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -30,7 +30,7 @@ cdx = FILTER cdx
  OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'
  -- words in domains
- OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+ OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*'
  -- DOI-like pattern in URL
  OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';