From 91139d7fcaab707f694985d5ca49016cc87946cc Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 4 Nov 2021 17:58:05 -0700
Subject: imports: generic file cleanup removes exact duplicate URLs

---
 python/fatcat_tools/importers/common.py | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..2ec6efda 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -436,6 +436,15 @@ class EntityImporter:
             if u.rel == "social":
                 u.rel = "academicsocial"
 
+        # remove exact URL duplicates, while preserving order, and removing
+        # "later" copies, not "first" copies
+        # this is sensitive to both url.url and url.rel combined!
+        dedupe_urls = []
+        for url_pair in existing.urls:
+            if url_pair not in dedupe_urls:
+                dedupe_urls.append(url_pair)
+        existing.urls = dedupe_urls
+
         # remove URLs which are near-duplicates
         redundant_urls = []
         all_urls = [u.url for u in existing.urls]
-- 
cgit v1.2.3