From 3011c6a088498ba566672d35aeee805c762808ba Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 17:01:01 -0800 Subject: improve shadow import file url cleanup path Should probably be refactored out into shared cleanup code. --- python/fatcat_tools/importers/shadow.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 261cf888..1a76299e 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -152,10 +152,20 @@ class ShadowLibraryImporter(EntityImporter): u = existing.urls[i] if u.rel == 'repository' and '://archive.org/download/' in u.url: existing.urls[i].rel = 'archive' + if u.rel == 'social': + u.rel = 'academicsocial' + + # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug) + new_wb_urls = [u.url for u in fe.urls] + new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format( + u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls] + existing.urls = [u for u in existing.urls if not u.url in new_short_wb_urls] # merge the existing into this one and update - existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + merged_urls = {} + for u in fe.urls + existing.urls: + merged_urls[u.url] = u + existing.urls = list(merged_urls.values()) if not existing.extra.get('shadows'): existing.extra['shadows'] = fe.extra['shadows'] else: -- cgit v1.2.3