diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:00 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 12:16:00 -0700 |
commit | 4080ea26892c6155eb9239b94102b32d7237678e (patch) | |
tree | fb498864a6576864bf8869c727f443393d6e09b4 | |
parent | 7802970c3d42cd3872ff0a0e8d0ffbbbae56ff80 (diff) | |
download | sandcrawler-4080ea26892c6155eb9239b94102b32d7237678e.tar.gz sandcrawler-4080ea26892c6155eb9239b94102b32d7237678e.zip |
more PDF mimetypes; fix return refactor
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 7 |
1 file changed, 5 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index eb168ac..f98b6e9 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -96,6 +96,10 @@ object CdxBackfillJob {
     val normalMime = Map(
       "application/pdf" -> "application/pdf",
       "application/x-pdf" -> "application/pdf",
+      "('application/pdf'" -> "application/pdf",
+      "image/pdf" -> "application/pdf",
+      "text/pdf" -> "application/pdf",
+      "\"application/pdf\"" -> "application/pdf",
       "application/postscript" -> "application/postscript",
       "text/html" -> "text/html",
       "text/xml" -> "text/xml",
@@ -105,8 +109,7 @@ object CdxBackfillJob {
     val lower = raw.toLowerCase()
     normalMime.foreach { case (key, value) =>
       if (lower.startsWith(key)) {
-        lower = value
-        break
+        return value
       }
     }
     lower