From d1401444dbfb515e62094f873d520a23ccbc29d9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 7 May 2018 22:10:51 -0700 Subject: pig script to filter GWB CDX by SURT regexes --- pig/tests/files/papers_url_doi.cdx | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 pig/tests/files/papers_url_doi.cdx (limited to 'pig/tests/files/papers_url_doi.cdx') diff --git a/pig/tests/files/papers_url_doi.cdx b/pig/tests/files/papers_url_doi.cdx new file mode 100644 index 0000000..1ad5792 --- /dev/null +++ b/pig/tests/files/papers_url_doi.cdx @@ -0,0 +1,7 @@ +#http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1 +#http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607 + +# should match 2: + +org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz +org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz -- cgit v1.2.3