aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-06-11 17:49:28 -0700
committerBryan Newbold <bnewbold@archive.org>2018-07-17 18:55:55 -0700
commitf6f2fcafb245101f75c9cda175891ef2391cda97 (patch)
treefdb0b20ad09314a7542d4d8e66f19e3f9947a58d /scalding/src/test/scala
parent3e6432307c4e85598cf63ab8578fd678ba9df156 (diff)
downloadsandcrawler-f6f2fcafb245101f75c9cda175891ef2391cda97.tar.gz
sandcrawler-f6f2fcafb245101f75c9cda175891ef2391cda97.zip
early work on scalding CDX backfill
Diffstat (limited to 'scalding/src/test/scala')
-rw-r--r--scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala51
1 files changed, 51 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
new file mode 100644
index 0000000..a442d79
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
@@ -0,0 +1,51 @@
+
+package sandcrawler
+
+import cascading.tuple.Fields
+import org.scalatest._
+
+class CdxBackfillTest extends FlatSpec with Matchers {
+
+ import CdxBackfillJob._
+
+ it should "normalize mimetypes" in {
+ assert(CdxBackfillJob.normalizeMime("asdf") === "asdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf+journal") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("Application/PDF") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/p") === "application/p")
+ assert(CdxBackfillJob.normalizeMime("application/xml+stuff") === "text/xml")
+ assert(CdxBackfillJob.normalizeMime("application/x-pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/x-html") === "application/x-html")
+ }
+
+ it should "filter CDX lines" in {
+ assert(true === keepCdx(lineToCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
+ // redirect
+ assert(false === keepCdx(lineToCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
+ }
+
+ it should "know what CDX lines are" in {
+ assert(true === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(""))
+ assert(false === isCdxLine(
+ " edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(
+ "#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // missing two fields
+ assert(false === isCdxLine(
+ "#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // extra field
+ assert(false === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz -"))
+ }
+
+ it should "execute lineToRow" in {
+ cdxLineToRow(lineToCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ }
+
+}