about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-07-24 12:16:00 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-07-24 12:16:00 -0700
commit 4080ea26892c6155eb9239b94102b32d7237678e (patch)
tree fb498864a6576864bf8869c727f443393d6e09b4
parent 7802970c3d42cd3872ff0a0e8d0ffbbbae56ff80 (diff)
download sandcrawler-4080ea26892c6155eb9239b94102b32d7237678e.tar.gz
sandcrawler-4080ea26892c6155eb9239b94102b32d7237678e.zip
more PDF mimetypes; fix return refactor
-rw-r--r-- scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala 7
1 files changed, 5 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index eb168ac..f98b6e9 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -96,6 +96,10 @@ object CdxBackfillJob {
val normalMime = Map(
"application/pdf" -> "application/pdf",
"application/x-pdf" -> "application/pdf",
+ "('application/pdf'" -> "application/pdf",
+ "image/pdf" -> "application/pdf",
+ "text/pdf" -> "application/pdf",
+ "\"application/pdf\"" -> "application/pdf",
"application/postscript" -> "application/postscript",
"text/html" -> "text/html",
"text/xml" -> "text/xml",
@@ -105,8 +109,7 @@ object CdxBackfillJob {
val lower = raw.toLowerCase()
normalMime.foreach { case (key, value) =>
if (lower.startsWith(key)) {
- lower = value
- break
+ return value
}
}
lower