diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-04-11 20:14:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-04-12 14:19:29 -0700 |
commit | 8ac10ab7fe310df55ab5a66d741ea25c24389418 (patch) | |
tree | db4ae61bef6d3c4da47af242b26c65b0657c3729 /pig | |
parent | a2306137c09e4f9e2af70ee97f0cb6c7e0f8134e (diff) | |
download | sandcrawler-8ac10ab7fe310df55ab5a66d741ea25c24389418.tar.gz sandcrawler-8ac10ab7fe310df55ab5a66d741ea25c24389418.zip |
add ojs and dspace as in-domain patterns to look for in heuristic CDX PDF filter
Diffstat (limited to 'pig')
-rw-r--r-- | pig/filter-cdx-paper-pdfs.pig | 2 |
1 file changed, 1 insertion, 1 deletion
```diff
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 7e10720..402d340 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -30,7 +30,7 @@ cdx = FILTER cdx
     OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'
 
     -- words in domains
-    OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+    OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*'
 
     -- DOI-like pattern in URL
     OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
```