aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests')
-rw-r--r--python/tests/test_misc.py32
1 files changed, 31 insertions, 1 deletions
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index a7879c8..02deec9 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
def test_gen_file_metadata():
@@ -39,3 +39,33 @@ def test_b32_hex():
# invalid
with pytest.raises(ValueError):
assert b32_hex('blah') == 'blah'
+
+def test_parse_cdx_line():
+
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ correct = {
+ 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ 'mimetype': "application/pdf",
+ 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ 'datetime': "20170828233154",
+ 'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ 'offset': 931661233,
+ 'c_size': 210251,
+ 'http_status': 200,
+ }
+
+ assert parse_cdx_line(raw) == correct
+ assert parse_cdx_line(raw + "\n") == correct
+ assert parse_cdx_line(raw + " extra_field") == correct
+
+def test_invalid_cdx():
+
+ print("missing warc")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+ assert parse_cdx_line(raw) == None
+
+ print("bad datetime")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) == None