diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-31 17:01:01 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-02-13 22:24:20 -0800 |
commit | 3011c6a088498ba566672d35aeee805c762808ba (patch) | |
tree | c0eb4a043645c7b08b6a73dc050c00f1d67ee3f5 | |
parent | 00754db377df53af18f9c4dddacdeb2e2c559206 (diff) | |
download | fatcat-3011c6a088498ba566672d35aeee805c762808ba.tar.gz fatcat-3011c6a088498ba566672d35aeee805c762808ba.zip |
improve shadow import file url cleanup path
Should probably be refactored out into shared cleanup code.
-rw-r--r-- | python/fatcat_tools/importers/shadow.py | 14 |
1 files changed, 12 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 261cf888..1a76299e 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -152,10 +152,20 @@ class ShadowLibraryImporter(EntityImporter): u = existing.urls[i] if u.rel == 'repository' and '://archive.org/download/' in u.url: existing.urls[i].rel = 'archive' + if u.rel == 'social': + u.rel = 'academicsocial' + + # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug) + new_wb_urls = [u.url for u in fe.urls] + new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format( + u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls] + existing.urls = [u for u in existing.urls if not u.url in new_short_wb_urls] # merge the existing into this one and update - existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + merged_urls = {} + for u in fe.urls + existing.urls: + merged_urls[u.url] = u + existing.urls = list(merged_urls.values()) if not existing.extra.get('shadows'): existing.extra['shadows'] = fe.extra['shadows'] else: |