diff options
Diffstat (limited to 'mapreduce/tests/test_common.py')
-rw-r--r-- | mapreduce/tests/test_common.py | 10 |
1 files changed, 10 insertions, 0 deletions
def test_invalid_cdx():
    """Check that parse_cdx_line() returns None for malformed CDX lines.

    Two fixtures are exercised; each is announced with a print() so the
    case label shows up in the captured output when an assertion fails.
    """

    # Case 1: the final (WARC path) field is the "-" placeholder.
    print("missing warc")
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
    # Identity comparison: None is a singleton, so `is` is the correct check.
    assert parse_cdx_line(raw) is None

    # Case 2: the timestamp field is 13 digits, one fewer than in case 1.
    # NOTE(review): the offset field in this fixture is "931661233i", which is
    # also non-numeric, so this line may be rejected because of the offset
    # rather than the datetime -- confirm against parse_cdx_line() and drop
    # the trailing "i" if the intent is to isolate the datetime check.
    print("bad datetime")
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    assert parse_cdx_line(raw) is None