diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 15:52:47 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 15:52:47 -0800 |
commit | 214c35db713ee5f34e9c6e8aa24df6456addcece (patch) | |
tree | 00d545ddbea084f1ed263f07d2e51220ffa6f408 | |
parent | 137a3334729682ae3d8805bd130d3464ed3e1bb8 (diff) | |
download | sandcrawler-214c35db713ee5f34e9c6e8aa24df6456addcece.tar.gz sandcrawler-214c35db713ee5f34e9c6e8aa24df6456addcece.zip |
remove protocols.io octet-stream hack
-rw-r--r-- | python/sandcrawler/ingest.py | 8 |
1 file changed, 2 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 0d4e7c6..8e0efeb 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -348,12 +348,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
-            # protocols.io PDFs are "application/octet-stream"
-            if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
-                pass
-            else:
-                result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
-                return result
+            result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
+            return result
 
         info = self.process_hit(resource, file_meta)
         result.update(info)