summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-06-24 14:44:11 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-06-24 14:44:11 -0700
commitb1e26b8f2c53141d0cfed9199a771ff8f07926fd (patch)
treeeb86abe9f36822cd4e12f09dd053d101144ee80a
parentfa9e3a2e8992779b80877b19090eaf15c8c7bc0e (diff)
downloadfatcat-b1e26b8f2c53141d0cfed9199a771ff8f07926fd.tar.gz
fatcat-b1e26b8f2c53141d0cfed9199a771ff8f07926fd.zip
add minimal file URL cleanups to matched importer
-rw-r--r--python/fatcat_tools/importers/matched.py8
1 file changed, 8 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 9cbd071f..1e154204 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -156,9 +156,17 @@ class MatchedImporter(EntityImporter):
self.counts['exists'] += 1
return False
+ # minimum viable "existing" URL cleanup to fix dupes and broken links:
+ # remove 'None' wayback URLs, and set archive.org rel 'archive'
+ existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ for u in existing.urls:
+ if u.rel == 'repository' and '://archive.org/download/' in u.url:
+            u.rel = 'archive'
+
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
existing.urls = [fatcat_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+
if len(existing.urls) > SANE_MAX_URLS:
self.counts['skip-update-too-many-url'] += 1
return None