about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-05 11:03:06 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:35 -0800
commit: 2712b56580c7d5e2e07fdccd990e1918c43880f9 (patch)
tree: b682c1fdc6936e0e4657dc4e42185a8ee1303800
parent: 766bc9c6ebe22a29c39c91941f12eed29b85a41d (diff)
download: fatcat-2712b56580c7d5e2e07fdccd990e1918c43880f9.tar.gz
fatcat-2712b56580c7d5e2e07fdccd990e1918c43880f9.zip
wayback short ts: another regression test, and some small fmt/tweaks
-rw-r--r-- python/fatcat_tools/cleanups/file_short_wayback_ts.py | 41
1 files changed, 38 insertions, 3 deletions
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index d8b8ab9d..56a5c80e 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -27,7 +27,10 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
def __init__(self, api: ApiClient, **kwargs):
- eg_desc = kwargs.pop("editgroup_description", None) or "Expand trunacted timestamps in wayback URLs"
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Expand trunacted timestamps in wayback URLs"
+ )
eg_extra = kwargs.pop("editgroup_extra", dict())
eg_extra["agent"] = eg_extra.get(
"agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
@@ -37,7 +40,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
do_updates=True,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs
+ **kwargs,
)
self.testing_mode = False
@@ -74,7 +77,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
if fe_url.url in url_expansions:
fix_url = url_expansions[fe_url.url]
# defensive checks
- assert partial_ts in fix_url
+ assert f"/web/{partial_ts}" in fix_url
assert "://" in fix_url
assert fe_url.url.endswith(original_url)
assert fix_url.endswith(original_url)
@@ -262,6 +265,38 @@ def test_short_wayback_ts() -> None:
== "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
)
+ # ensure URL order is stable
+ example_line3: Dict[str, Any] = {
+ "file_entity": {
+ "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"],
+ "mimetype": "application/pdf",
+ "urls": [
+ {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"},
+ {
+ "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf",
+ "rel": "webarchive",
+ },
+ ],
+ "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0",
+ "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3",
+ "md5": "89b6e6cc4e0259317e26ddf1a9a336a0",
+ "size": 41265,
+ "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65",
+ "ident": "lvnz23nzijaapf5iti45zez6zu",
+ "state": "active",
+ },
+ "full_urls": {
+ "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf"
+ },
+ "status": "success-db",
+ }
+
+ fe3 = fswtc.parse_record(example_line3)
+ assert len(fe3.urls) == 2
+ assert fe3.urls[0].rel == "web"
+ assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf"
+ assert fe3.urls[1].rel == "webarchive"
+
def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)