diff options
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index fd472d11..2ec6efda 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -436,6 +436,15 @@ class EntityImporter: if u.rel == "social": u.rel = "academicsocial" + # remove exact URL duplicates, while preserving order, and removing + # "later" copies, not "first" copies + # this is sensitive to both url.url and url.rel combined! + dedupe_urls = [] + for url_pair in existing.urls: + if url_pair not in dedupe_urls: + dedupe_urls.append(url_pair) + existing.urls = dedupe_urls + # remove URLs which are near-duplicates redundant_urls = [] all_urls = [u.url for u in existing.urls] |