author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-04 17:58:05 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-09 14:17:35 -0800
commit     91139d7fcaab707f694985d5ca49016cc87946cc
tree       c0553a31acc279aad97964c60a609e19239f0beb
parent     1ad08edf2d3d06196119ec1eddc932e6423e3e7c
imports: generic file cleanup removes exact duplicate URLs
python/fatcat_tools/importers/common.py | 9 +++++++++
1 file changed, 9 insertions(+), 0 deletions(-)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..2ec6efda 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -436,6 +436,15 @@ class EntityImporter:
             if u.rel == "social":
                 u.rel = "academicsocial"
 
+        # remove exact URL duplicates, while preserving order, and removing
+        # "later" copies, not "first" copies
+        # this is sensitive to both url.url and url.rel combined!
+        dedupe_urls = []
+        for url_pair in existing.urls:
+            if url_pair not in dedupe_urls:
+                dedupe_urls.append(url_pair)
+        existing.urls = dedupe_urls
+
         # remove URLs which are near-duplicates
         redundant_urls = []
         all_urls = [u.url for u in existing.urls]
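For readers skimming the change: the added block is a standard order-preserving exact dedupe. It compares whole URL entity objects, so two entries only count as duplicates when both url and rel match, and the first occurrence wins. A minimal standalone sketch of the same logic, using a hypothetical FileUrl namedtuple in place of the fatcat API client's URL entity class:

from collections import namedtuple

# hypothetical stand-in for the fatcat API client's URL entity; namedtuples
# compare by value, analogous to the real entity's field-wise equality
FileUrl = namedtuple("FileUrl", ["url", "rel"])

urls = [
    FileUrl("https://example.com/paper.pdf", "web"),
    FileUrl("https://example.com/paper.pdf", "web"),         # exact duplicate: dropped
    FileUrl("https://example.com/paper.pdf", "repository"),  # same url, different rel: kept
]

# order-preserving exact dedupe: keep "first" copies, drop "later" ones
dedupe_urls = []
for url_pair in urls:
    if url_pair not in dedupe_urls:
        dedupe_urls.append(url_pair)

assert dedupe_urls == [
    FileUrl("https://example.com/paper.pdf", "web"),
    FileUrl("https://example.com/paper.pdf", "repository"),
]

The list-membership scan is quadratic in the number of URLs, but file entities carry at most a handful of URLs, and unlike a set-based dedupe it only requires the entity class to support equality, not hashing.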