diff options
Diffstat (limited to 'scalding/src')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 15 |
-rw-r--r-- | scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala | 4 |
2 files changed, 13 insertions, 6 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala index 4a2eaba..36e017e 100644 --- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala +++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala @@ -1,6 +1,8 @@ package sandcrawler import java.util.Properties +import scala.util.parsing.json.JSONObject +import scala.util.Try import cascading.property.AppProps import cascading.tap.SinkMode @@ -12,7 +14,6 @@ import parallelai.spyglass.base.JobBase import parallelai.spyglass.hbase.HBaseConstants.SourceMode import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource -import scala.util.parsing.json.JSONObject // Type that represents a raw parsed CDX line case class CdxLine(surt: String, @@ -130,11 +131,17 @@ object CdxBackfillJob { } def keepCdx(line: CdxLine) : Boolean = { - // TODO: sha1.isalnum() and c_size.isdigit() and offset.isdigit() and dt.isdigit() + if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) { + println("DASHLINE") + return false + } + // TODO: sha1.isalnum() if (line.http_status != "200" || line.sha1.size != 32) { return false } - // TODO: '-' in (line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc) + if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) { + return false + } return true } @@ -174,7 +181,7 @@ object CdxBackfillJob { "offset" -> line.offset.toInt, "warc" -> line.warc )) - (key, heritrixInfo.toString(), fileCdx.toString(), line.mime) + (key, heritrixInfo.toString(), fileCdx.toString(), normalizeMime(line.mime)) } def lineToCdxLine(line: String) : CdxLine = { diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala index fb5b162..a6107fc 100644 --- a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala +++ 
b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala @@ -65,7 +65,7 @@ class CdxBackfillTest extends FlatSpec with Matchers { assert(obj("f") == "CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz") assert(obj("c") == 854156) assert(obj("o") == 328850624) - assert(obj("d") == "2017-08-28T23:31:54Z") + assert(obj("d") == "2017-07-05T06:22:02Z") } case other => assert(false) } @@ -80,7 +80,7 @@ class CdxBackfillTest extends FlatSpec with Matchers { } case other => assert(false) } - assert(row._3 == "application/pdf") + assert(row._4 == "application/pdf") } } |