about summary refs log tree commit diff stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 7
1 file changed, 5 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index eb168ac..f98b6e9 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -96,6 +96,10 @@ object CdxBackfillJob {
val normalMime = Map(
"application/pdf" -> "application/pdf",
"application/x-pdf" -> "application/pdf",
+ "('application/pdf'" -> "application/pdf",
+ "image/pdf" -> "application/pdf",
+ "text/pdf" -> "application/pdf",
+ "\"application/pdf\"" -> "application/pdf",
"application/postscript" -> "application/postscript",
"text/html" -> "text/html",
"text/xml" -> "text/xml",
@@ -105,8 +109,7 @@ object CdxBackfillJob {
val lower = raw.toLowerCase()
normalMime.foreach { case (key, value) =>
if (lower.startsWith(key)) {
- lower = value
- break
+ return value
}
}
lower