From 77577da13afe07b5177452122f4cee77e3357b4e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 5 Apr 2018 17:18:52 -0700
Subject: improve test coverage

---
 mapreduce/tests/test_common.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mapreduce/tests/test_common.py')

diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py
index e2f96bb..34d50ed 100644
--- a/mapreduce/tests/test_common.py
+++ b/mapreduce/tests/test_common.py
@@ -28,3 +28,13 @@ def test_parse_cdx_line():
     assert parse_cdx_line(raw) == correct
     assert parse_cdx_line(raw + "\n") == correct
     assert parse_cdx_line(raw + " extra_field") == correct
+
+def test_invalid_cdx():
+
+    print("missing warc")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+    assert parse_cdx_line(raw) == None
+
+    print("bad datetime")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    assert parse_cdx_line(raw) == None
-- 
cgit v1.2.3