aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce/tests/test_common.py
diff options
context:
space:
mode:
Diffstat (limited to 'mapreduce/tests/test_common.py')
-rw-r--r--mapreduce/tests/test_common.py10
1 files changed, 10 insertions, 0 deletions
diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py
index e2f96bb..34d50ed 100644
--- a/mapreduce/tests/test_common.py
+++ b/mapreduce/tests/test_common.py
@@ -28,3 +28,13 @@ def test_parse_cdx_line():
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
+def test_invalid_cdx():
+
+ print("missing warc")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+ assert parse_cdx_line(raw) == None
+
+ print("bad datetime")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) == None