aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-07-19 16:56:51 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-07-24 11:27:45 -0700
commit 92650c4663bba7d8d9914e2bc120a4b923a7a94b (patch)
tree dea85e5c62a4213881931631662d6900f41c89e3 /scalding
parent f6c88b66cea8919fe8a0a438e60841ad682aa71d (diff)
download sandcrawler-92650c4663bba7d8d9914e2bc120a4b923a7a94b.tar.gz
sandcrawler-92650c4663bba7d8d9914e2bc120a4b923a7a94b.zip
fix CdxBackfillJob tests
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 15
-rw-r--r-- scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala | 4
2 files changed, 13 insertions, 6 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index 4a2eaba..36e017e 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -1,6 +1,8 @@
package sandcrawler
import java.util.Properties
+import scala.util.parsing.json.JSONObject
+import scala.util.Try
import cascading.property.AppProps
import cascading.tap.SinkMode
@@ -12,7 +14,6 @@ import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-import scala.util.parsing.json.JSONObject
// Type that represents a raw parsed CDX line
case class CdxLine(surt: String,
@@ -130,11 +131,17 @@ object CdxBackfillJob {
}
def keepCdx(line: CdxLine) : Boolean = {
- // TODO: sha1.isalnum() and c_size.isdigit() and offset.isdigit() and dt.isdigit()
+ if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) {
+ println("DASHLINE")
+ return false
+ }
+ // TODO: sha1.isalnum()
if (line.http_status != "200" || line.sha1.size != 32) {
return false
}
- // TODO: '-' in (line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc)
+ if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) {
+ return false
+ }
return true
}
@@ -174,7 +181,7 @@ object CdxBackfillJob {
"offset" -> line.offset.toInt,
"warc" -> line.warc
))
- (key, heritrixInfo.toString(), fileCdx.toString(), line.mime)
+ (key, heritrixInfo.toString(), fileCdx.toString(), normalizeMime(line.mime))
}
def lineToCdxLine(line: String) : CdxLine = {
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
index fb5b162..a6107fc 100644
--- a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
@@ -65,7 +65,7 @@ class CdxBackfillTest extends FlatSpec with Matchers {
assert(obj("f") == "CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
assert(obj("c") == 854156)
assert(obj("o") == 328850624)
- assert(obj("d") == "2017-08-28T23:31:54Z")
+ assert(obj("d") == "2017-07-05T06:22:02Z")
}
case other => assert(false)
}
@@ -80,7 +80,7 @@ class CdxBackfillTest extends FlatSpec with Matchers {
}
case other => assert(false)
}
- assert(row._3 == "application/pdf")
+ assert(row._4 == "application/pdf")
}
}