From b1e26b8f2c53141d0cfed9199a771ff8f07926fd Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Jun 2019 14:44:11 -0700 Subject: add minimal file URL cleanups to matched importer --- python/fatcat_tools/importers/matched.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 9cbd071f..1e154204 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -156,9 +156,17 @@ class MatchedImporter(EntityImporter): self.counts['exists'] += 1 return False + # minimum viable "existing" URL cleanup to fix dupes and broken links: + # remove 'None' wayback URLs, and set archive.org rel 'archive' + existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + for u in existing.urls: + if u.rel == 'repository' and '://archive.org/download/' in u.url: + u.rel = 'archive' + # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) existing.urls = [fatcat_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + if len(existing.urls) > SANE_MAX_URLS: self.counts['skip-update-too-many-url'] += 1 return None -- cgit v1.2.3