import copy

import pytest
from fatcat_openapi_client import *
from fixtures import *

from fatcat_tools.cleanups import FileCleaner


@pytest.fixture(scope="function")
def file_cleaner(api):
    """Provide a ``FileCleaner`` constructed from the ``api`` fixture.

    NOTE(review): ``api`` is presumably supplied by the ``fixtures`` star
    import -- confirm against that module.
    """
    cleaner = FileCleaner(api)
    yield cleaner


def _assert_clean_is_stable(cleaner, entity):
    """Assert that re-cleaning ``entity``, or a deep copy of it, compares equal to it."""
    assert entity == cleaner.clean_entity(entity)
    assert entity == cleaner.clean_entity(copy.deepcopy(entity))


def test_url_cleanups(file_cleaner):
    """Exercise ``FileCleaner.clean_entity`` on several URL lists of one file entity.

    Scenarios, in order:

    1. A wayback URL with a ``None`` timestamp is dropped, and an archive.org
       URL tagged ``repository`` ends up tagged ``archive``.
    2. A short-timestamp wayback URL is dropped when a full-timestamp wayback
       URL for the same target is also present.
    3. A short-timestamp wayback URL on its own is kept unchanged.

    After every scenario the cleaned entity must compare equal to the result
    of cleaning it again (directly and via a deep copy).
    """
    entity = FileEntity(
        sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
        urls=[],
    )

    # --- Scenario 1: "None" wayback timestamps and archive.org rel fix-up ---
    valid_wayback = FileUrl(
        url="https://web.archive.org/web/12345542/something.com/blah.pdf",
        rel="webarchive",
    )
    none_wayback = FileUrl(
        url="https://web.archive.org/web/None/something.com/blah.pdf",
        rel="webarchive",
    )
    archive_org_item = FileUrl(
        url="https://archive.org/details/None/something.com/blah.pdf",
        rel="repository",
    )
    entity.urls = [valid_wayback, none_wayback, archive_org_item]
    entity = file_cleaner.clean_entity(entity)

    # remove None wayback links
    assert len(entity.urls) == 2
    assert all("web/None" not in entry.url for entry in entity.urls)
    _assert_clean_is_stable(file_cleaner, entity)

    # rel=repository -> rel=archive for archive.org links
    assert entity.urls[1].rel == "archive"

    # --- Scenario 2: short wayback dates alongside a full-length one ---
    full_date_url = "http://web.archive.org/web/20181031120933/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf"
    short_date_url = "http://web.archive.org/web/2018/https://www.jstage.jst.go.jp/article/jsci1978/1/1/1_1_231/_pdf"

    entity.urls = [
        FileUrl(url=full_date_url, rel="webarchive"),
        FileUrl(url=short_date_url, rel="webarchive"),
    ]
    entity = file_cleaner.clean_entity(entity)

    # only the full-timestamp capture is expected to remain
    assert len(entity.urls) == 1
    assert entity.urls[0].url == full_date_url
    _assert_clean_is_stable(file_cleaner, entity)

    # --- Scenario 3: a short wayback date with no full-length alternative ---
    entity.urls = [FileUrl(url=short_date_url, rel="webarchive")]
    entity = file_cleaner.clean_entity(entity)

    # the lone short-timestamp capture is expected to be kept as-is
    assert len(entity.urls) == 1
    assert entity.urls[0].url == short_date_url
    _assert_clean_is_stable(file_cleaner, entity)