diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-05 11:03:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:35 -0800 |
commit | 2712b56580c7d5e2e07fdccd990e1918c43880f9 (patch) | |
tree | b682c1fdc6936e0e4657dc4e42185a8ee1303800 | |
parent | 766bc9c6ebe22a29c39c91941f12eed29b85a41d (diff) | |
download | fatcat-2712b56580c7d5e2e07fdccd990e1918c43880f9.tar.gz fatcat-2712b56580c7d5e2e07fdccd990e1918c43880f9.zip |
wayback short ts: another regression test, and some small fmt/tweaks
-rw-r--r-- | python/fatcat_tools/cleanups/file_short_wayback_ts.py | 41 |
1 file changed, 38 insertions, 3 deletions
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index d8b8ab9d..56a5c80e 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -27,7 +27,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter): def __init__(self, api: ApiClient, **kwargs): - eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs" + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Expand trunacted timestamps in wayback URLs" + ) eg_extra = kwargs.pop("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get( "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" @@ -37,7 +40,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): do_updates=True, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs + **kwargs, ) self.testing_mode = False @@ -74,7 +77,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): if fe_url.url in url_expansions: fix_url = url_expansions[fe_url.url] # defensive checks - assert partial_ts in fix_url + assert f"/web/{partial_ts}" in fix_url assert "://" in fix_url assert fe_url.url.endswith(original_url) assert fix_url.endswith(original_url) @@ -262,6 +265,38 @@ def test_short_wayback_ts() -> None: == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" ) + # ensure URL order is stable + example_line3: Dict[str, Any] = { + "file_entity": { + "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"], + "mimetype": "application/pdf", + "urls": [ + {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"}, + { + "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf", + "rel": "webarchive", + }, + ], + "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0", + "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3", + "md5": 
"89b6e6cc4e0259317e26ddf1a9a336a0", + "size": 41265, + "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65", + "ident": "lvnz23nzijaapf5iti45zez6zu", + "state": "active", + }, + "full_urls": { + "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf" + }, + "status": "success-db", + } + + fe3 = fswtc.parse_record(example_line3) + assert len(fe3.urls) == 2 + assert fe3.urls[0].rel == "web" + assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf" + assert fe3.urls[1].rel == "webarchive" + def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) |