aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-04 17:56:16 -0700
committerBryan Newbold <bnewbold@robocracy.org>2021-11-09 14:17:35 -0800
commit1ad08edf2d3d06196119ec1eddc932e6423e3e7c (patch)
tree210af1b82d989765926c323968e9510a16a8bacd
parent94ddfd7167994b4c0f7940317655d152aba302e6 (diff)
downloadfatcat-1ad08edf2d3d06196119ec1eddc932e6423e3e7c.tar.gz
fatcat-1ad08edf2d3d06196119ec1eddc932e6423e3e7c.zip
wayback short ts: add regression test for dupe URLs
-rw-r--r--python/fatcat_tools/cleanups/file_short_wayback_ts.py44
1 files changed, 44 insertions, 0 deletions
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 2d893dbf..ab1b2a5f 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -216,6 +216,50 @@ def test_short_wayback_ts() -> None:
print(fswtc.counts)
assert fswtc.counts["update"] == 1
+ # another example, which failed with an assertion in prod due to duplicated URLs
+ example_line2: Dict[str, Any] = {
+ "file_entity": {
+ "release_ids": ["22jt7euq4fafhblzullmnesso4"],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "repository",
+ },
+ {
+ "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "webarchive",
+ },
+ {
+ "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "webarchive",
+ },
+ ],
+ "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19",
+ "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433",
+ "md5": "3d509743359649e34a27ae70c5cd3018",
+ "size": 430665,
+ "extra": {
+ "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"}
+ },
+ "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48",
+ "ident": "duymhmxk3fgtzk37yp2pvthtxq",
+ "state": "active",
+ },
+ "full_urls": {
+ "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+ },
+ "status": "success-self",
+ }
+
+ fe2 = fswtc.parse_record(example_line2)
+ assert len(fe2.urls) == 2
+ assert fe2.urls[0].rel == "repository"
+ assert (
+ fe2.urls[1].url
+ == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+ )
+
def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)