about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-02 15:52:47 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-02 15:52:47 -0800
commit: 214c35db713ee5f34e9c6e8aa24df6456addcece (patch)
tree: 00d545ddbea084f1ed263f07d2e51220ffa6f408
parent: 137a3334729682ae3d8805bd130d3464ed3e1bb8 (diff)
download: sandcrawler-214c35db713ee5f34e9c6e8aa24df6456addcece.tar.gz
download: sandcrawler-214c35db713ee5f34e9c6e8aa24df6456addcece.zip
remove protocols.io octet-stream hack
-rw-r--r-- python/sandcrawler/ingest.py 8
1 files changed, 2 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 0d4e7c6..8e0efeb 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -348,12 +348,8 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
- # protocols.io PDFs are "application/octet-stream"
- if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
- pass
- else:
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
+ result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
info = self.process_hit(resource, file_meta)
result.update(info)