summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r--python/fatcat_tools/importers/common.py9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..2ec6efda 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -436,6 +436,15 @@ class EntityImporter:
if u.rel == "social":
u.rel = "academicsocial"
+ # remove exact URL duplicates, while preserving order, and removing
+ # "later" copies, not "first" copies
+ # this is sensitive to both url.url and url.rel combined!
+ dedupe_urls = []
+ for url_pair in existing.urls:
+ if url_pair not in dedupe_urls:
+ dedupe_urls.append(url_pair)
+ existing.urls = dedupe_urls
+
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]