diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:21 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:52 -0700 |
commit | 18bdcfd362a190dd36d3b86996808366b83bbbda (patch) | |
tree | 5ac1a424b515d22c5d26a4775d083a4d967b5a11 /scalding/src/test/scala | |
parent | 4080ea26892c6155eb9239b94102b32d7237678e (diff) | |
download | sandcrawler-18bdcfd362a190dd36d3b86996808366b83bbbda.tar.gz sandcrawler-18bdcfd362a190dd36d3b86996808366b83bbbda.zip |
do sha1 pattern match correctly
Diffstat (limited to 'scalding/src/test/scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala | 11 |
1 files changed, 11 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala index a6107fc..c092f7f 100644 --- a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala +++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala @@ -35,6 +35,17 @@ class CdxBackfillTest extends FlatSpec with Matchers { // redirect assert(false === keepCdx(lineToCdxLine( "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))) + // not PDF + assert(false === keepCdx(lineToCdxLine( + """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf text/plain 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""))) + // invalid base32 SHA1 + assert(false === keepCdx(lineToCdxLine( + """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FE010101010101010101VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""))) + assert(false === keepCdx(lineToCdxLine( + """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL33FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""))) + // dashed field + assert(false === keepCdx(lineToCdxLine( + """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 - application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""))) } it should "know what CDX lines are" in { |