address some (but not all) review comments

author: Bryan Newbold <bnewbold@archive.org> 2018-07-23 14:03:50 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-07-24 11:28:12 -0700
commit: 39353c37193ccd96c2e1f6cade50dcf9a3a34678 (patch)
tree: c112619a884e57efb806e91bda7fd2a37f3c6534 /scalding/src/main/scala
parent: 9a00e7beb5f1024ea77fc42196733a0447cf6d12 (diff)
download: sandcrawler-39353c37193ccd96c2e1f6cade50dcf9a3a34678.tar.gz
sandcrawler-39353c37193ccd96c2e1f6cade50dcf9a3a34678.zip
1 files changed, 21 insertions, 20 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index 36e017e..84c19b8 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -1,5 +1,7 @@
 package sandcrawler
 
+// TODO: fix import order to satisfy scala style
+
 import java.util.Properties
 import scala.util.parsing.json.JSONObject
 import scala.util.Try
@@ -20,7 +22,7 @@ case class CdxLine(surt: String,
                    datetime: String,
                    url: String,
                    mime: String,
-                   http_status: String,
+                   httpStatus: String,
                    sha1: String,
                    c_size: String,
                    offset: String,
@@ -55,6 +57,7 @@ class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions
     .map { lineToCdxLine }
     .filter { CdxBackfillJob.keepCdx(_) }
 
+  // (key, f:c, file:cdx, file:mime)
   val cdxRows : TypedPipe[(String, String, String, String)] = cdxLines
     .map { CdxBackfillJob.cdxLineToRow }
     .debug
@@ -66,7 +69,8 @@ class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions
     //.debug
 
   // filters out all the lines that have an existing SHA1 key in HBase
-  // the groupBy statements are to select key values to join on
+  // the groupBy statements are to select key values to join on.
+  // (key, f:c, file:cdx, file:mime)
   val newRows : TypedPipe[(String, String, String, String)] = existingKeys
     .groupBy( identity )
     .rightJoin(cdxRows.groupBy(_._1))
@@ -102,41 +106,38 @@ object CdxBackfillJob {
 
   def normalizeMime(raw: String) : String = {
 
-    val NORMAL_MIME = List("application/pdf",
-                           "application/postscript",
-                           "text/html",
-                           "text/xml")
+    val normalMime = Map(
+      "application/pdf" -> "application/pdf",
+      "application/x-pdf" -> "application/pdf",
+      "application/postscript" -> "application/postscript",
+      "text/html" -> "text/html",
+      "text/xml" -> "text/xml",
+      "application/xml" -> "text/xml"
+    )
 
+    // TODO: improvement of control flow
     val lower = raw.toLowerCase()
-    NORMAL_MIME.foreach(norm =>
-      if (lower.startsWith(norm)) {
-        return norm
+    normalMime.foreach { case (key, value) =>
+      if (lower.startsWith(key)) {
+        return value
       }
-    )
-
-    // Common special cases
-    if (lower.startsWith("application/xml")) {
-      return "text/xml"
-    }
-    if (lower.startsWith("application/x-pdf")) {
-      return "application/pdf"
     }
     lower
   }
 
   def isCdxLine(line: String) : Boolean = {
-    // malformated or non-CDX11 lines
+    // malformatted or non-CDX11 lines
     !(line.startsWith("#") || line.startsWith(" ") || line.startsWith("filedesc") ||
       line.split(" ").size != 11)
   }
 
   def keepCdx(line: CdxLine) : Boolean = {
     if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) {
-      println("DASHLINE")
+      // TODO: hadoop counter (was: "DASHLINE")
       return false
     }
     // TODO: sha1.isalnum()
-    if (line.http_status != "200" || line.sha1.size != 32) {
+    if (line.httpStatus != "200" || line.sha1.size != 32) {
       return false
     }
     if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) {
author	Bryan Newbold <bnewbold@archive.org>	2018-07-23 14:03:50 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2018-07-24 11:28:12 -0700
commit	39353c37193ccd96c2e1f6cade50dcf9a3a34678 (patch)
tree	c112619a884e57efb806e91bda7fd2a37f3c6534 /scalding/src/main/scala
parent	9a00e7beb5f1024ea77fc42196733a0447cf6d12 (diff)
download	sandcrawler-39353c37193ccd96c2e1f6cade50dcf9a3a34678.tar.gz sandcrawler-39353c37193ccd96c2e1f6cade50dcf9a3a34678.zip