about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-04 17:58:05 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:35 -0800
commit 91139d7fcaab707f694985d5ca49016cc87946cc (patch)
tree c0553a31acc279aad97964c60a609e19239f0beb
parent 1ad08edf2d3d06196119ec1eddc932e6423e3e7c (diff)
download fatcat-91139d7fcaab707f694985d5ca49016cc87946cc.tar.gz
fatcat-91139d7fcaab707f694985d5ca49016cc87946cc.zip
imports: generic file cleanup removes exact duplicate URLs
-rw-r--r-- python/fatcat_tools/importers/common.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..2ec6efda 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -436,6 +436,15 @@ class EntityImporter:
if u.rel == "social":
u.rel = "academicsocial"
+ # remove exact URL duplicates, while preserving order, and removing
+ # "later" copies, not "first" copies
+ # this is sensitive to both url.url and url.rel combined!
+ dedupe_urls = []
+ for url_pair in existing.urls:
+ if url_pair not in dedupe_urls:
+ dedupe_urls.append(url_pair)
+ existing.urls = dedupe_urls
+
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]