aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/test_misc.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_misc.py')
-rw-r--r--python/tests/test_misc.py99
1 files changed, 66 insertions, 33 deletions
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,77 +1,110 @@
-
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
+
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
+
+
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
+ assert file_meta == {
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
+ }
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) is None
+
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
- assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
- "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
-
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )