From 18bdcfd362a190dd36d3b86996808366b83bbbda Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 24 Jul 2018 12:16:21 -0700 Subject: do sha1 pattern match correctly --- scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'scalding/src/main/scala') diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala index f98b6e9..389a96a 100644 --- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala +++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala @@ -3,6 +3,7 @@ package sandcrawler import java.util.Properties import scala.util.Try +import scala.util.matching.Regex import scala.util.parsing.json.JSONObject import cascading.pipe.joiner._ @@ -122,11 +123,14 @@ object CdxBackfillJob { } def keepCdx(line: CdxLine) : Boolean = { + val sha1Pattern = """[A-Z2-7]{32}""".r if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) { - // TODO: hadoop counter (was: "DASHLINE") false - } else if (line.httpStatus != "200" || line.sha1.size != 32) { - // TODO: sha1.isalnum() + } else if (line.httpStatus != "200") { + false + } else if (line.mime != "application/pdf") { + false + } else if (sha1Pattern.unapplySeq(line.sha1).isEmpty) { false } else if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) { false -- cgit v1.2.3