about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-27 17:07:44 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-27 17:07:44 -0800
commit 137a3334729682ae3d8805bd130d3464ed3e1bb8 (patch)
tree 6baed2f79d467626cecbec24a5617dcffd063758
parent 2069332aa58bd7d5804639a1adec3cde0118a5b0 (diff)
download sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.tar.gz
sandcrawler-137a3334729682ae3d8805bd130d3464ed3e1bb8.zip
more mime normalization
-rw-r--r-- python/sandcrawler/misc.py 19
1 file changed, 18 insertions, 1 deletion
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5713199..88669e6 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -51,10 +51,11 @@ NORMAL_MIME = (
'application/postscript',
'text/html',
'text/xml',
+ 'application/octet-stream',
)
def normalize_mime(raw):
- raw = raw.lower()
+ raw = raw.lower().strip()
for norm in NORMAL_MIME:
if raw.startswith(norm):
return norm
@@ -64,6 +65,20 @@ def normalize_mime(raw):
return 'text/xml'
if raw.startswith('application/x-pdf'):
return 'application/pdf'
+ if raw in (
+ '.pdf',
+ ):
+ return 'application/pdf'
+ if raw in (
+ 'application/download',
+ 'binary/octet-stream',
+ 'unk',
+ 'application/x-download',
+ 'application/octetstream',
+ 'application/force-download',
+ 'application/unknown',
+ ):
+ return 'application/octet-stream'
return None
@@ -76,6 +91,8 @@ def test_normalize_mime():
assert normalize_mime("application/xml+stuff") == "text/xml"
assert normalize_mime("application/x-pdf") == "application/pdf"
assert normalize_mime("application/x-html") is None
+ assert normalize_mime("unk") == "application/octet-stream"
+ assert normalize_mime("binary/octet-stream") == "application/octet-stream"
def parse_cdx_line(raw_cdx, normalize=True):