# Recovered from git patch 451815af3f0581c654cb38a2aabaef800789d037
# ("commit file cleaner tests", Bryan Newbold, Tue, 8 Oct 2019 16:44:19 -0700),
# which creates python/tests/clean_files.py.
"""Tests for the URL cleanups performed by ``FileCleaner.clean_entity``."""

import copy

import pytest
from fatcat_tools.cleanups import FileCleaner
from fatcat_openapi_client import FileEntity, FileUrl
from fixtures import api  # noqa: F401 -- pytest fixture, requested by name below


@pytest.fixture(scope="function")
def file_cleaner(api):
    """Yield a new ``FileCleaner`` built on the ``api`` fixture for each test."""
    yield FileCleaner(api)


def _assert_clean_is_stable(cleaner, entity):
    """Assert that cleaning an already-cleaned entity leaves it equal.

    Checked twice: once passing ``entity`` itself and once passing a deep
    copy, so the result does not depend on which object is handed in.
    """
    assert entity == cleaner.clean_entity(entity)
    assert entity == cleaner.clean_entity(copy.deepcopy(entity))


def test_url_cleanups(file_cleaner):
    """Exercise ``clean_entity`` on wayback / archive.org URL lists.

    Covers three scenarios: dropping ``web/None`` wayback links (and the
    ``repository`` -> ``archive`` rel rewrite), dropping a short-dated wayback
    link when a full-timestamp link to the same target is present, and keeping
    a short-dated wayback link when it is the only URL.
    """
    f = FileEntity(
        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
        urls=[],
    )

    f.urls = [
        FileUrl(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"),
        FileUrl(url="https://web.archive.org/web/None/something.com/blah.pdf", rel="webarchive"),
        FileUrl(url="https://archive.org/details/None/something.com/blah.pdf", rel="repository"),
    ]
    f = file_cleaner.clean_entity(f)

    # remove None wayback links
    assert len(f.urls) == 2
    for u in f.urls:
        assert 'web/None' not in u.url

    _assert_clean_is_stable(file_cleaner, f)

    # rel=repository -> rel=archive for archive.org links
    assert f.urls[1].rel == 'archive'

    # short wayback dates: the full-timestamp capture is the one that survives
    f.urls = [
        FileUrl(url="http://web.archive.org/web/20181031120933/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf", rel="webarchive"),
        FileUrl(url="http://web.archive.org/web/2018/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf", rel="webarchive"),
    ]
    f = file_cleaner.clean_entity(f)
    assert len(f.urls) == 1
    assert f.urls[0].url == 'http://web.archive.org/web/20181031120933/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf'

    _assert_clean_is_stable(file_cleaner, f)

    # a short-dated wayback link on its own is left in place
    f.urls = [
        FileUrl(url="http://web.archive.org/web/2018/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf", rel="webarchive"),
    ]
    f = file_cleaner.clean_entity(f)
    assert len(f.urls) == 1
    assert f.urls[0].url == 'http://web.archive.org/web/2018/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf'

    _assert_clean_is_stable(file_cleaner, f)