diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:21 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:52 -0700 | 
| commit | 18bdcfd362a190dd36d3b86996808366b83bbbda (patch) | |
| tree | 5ac1a424b515d22c5d26a4775d083a4d967b5a11 /scalding/src/main | |
| parent | 4080ea26892c6155eb9239b94102b32d7237678e (diff) | |
| download | sandcrawler-18bdcfd362a190dd36d3b86996808366b83bbbda.tar.gz sandcrawler-18bdcfd362a190dd36d3b86996808366b83bbbda.zip  | |
do sha1 pattern match correctly
Diffstat (limited to 'scalding/src/main')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 10 | 
1 file changed, 7 insertions, 3 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index f98b6e9..389a96a 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -3,6 +3,7 @@ package sandcrawler
 import java.util.Properties
 
 import scala.util.Try
+import scala.util.matching.Regex
 import scala.util.parsing.json.JSONObject
 
 import cascading.pipe.joiner._
@@ -122,11 +123,14 @@ object CdxBackfillJob {
   }
 
   def keepCdx(line: CdxLine) : Boolean = {
+    val sha1Pattern = """[A-Z2-7]{32}""".r
     if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) {
-      // TODO: hadoop counter (was: "DASHLINE")
       false
-    } else if (line.httpStatus != "200" || line.sha1.size != 32) {
-      // TODO: sha1.isalnum()
+    } else if (line.httpStatus != "200") {
+      false
+    } else if (line.mime != "application/pdf") {
+      false
+    } else if (sha1Pattern.unapplySeq(line.sha1).isEmpty) {
       false
     } else if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) {
       false
```
