From 4080ea26892c6155eb9239b94102b32d7237678e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 24 Jul 2018 12:16:00 -0700 Subject: more PDF mimetypes; fix return refactor --- scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala index eb168ac..f98b6e9 100644 --- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala +++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala @@ -96,6 +96,10 @@ object CdxBackfillJob { val normalMime = Map( "application/pdf" -> "application/pdf", "application/x-pdf" -> "application/pdf", + "('application/pdf'" -> "application/pdf", + "image/pdf" -> "application/pdf", + "text/pdf" -> "application/pdf", + "\"application/pdf\"" -> "application/pdf", "application/postscript" -> "application/postscript", "text/html" -> "text/html", "text/xml" -> "text/xml", @@ -105,8 +109,7 @@ object CdxBackfillJob { val lower = raw.toLowerCase() normalMime.foreach { case (key, value) => if (lower.startsWith(key)) { - lower = value - break + return value } } lower -- cgit v1.2.3