author    Bryan Newbold <bnewbold@robocracy.org>  2020-01-31 17:01:01 -0800
committer Bryan Newbold <bnewbold@robocracy.org>  2020-02-13 22:24:20 -0800
commit    3011c6a088498ba566672d35aeee805c762808ba (patch)
tree      c0eb4a043645c7b08b6a73dc050c00f1d67ee3f5
parent    00754db377df53af18f9c4dddacdeb2e2c559206 (diff)
improve shadow import file url cleanup path
Should probably be refactored out into shared cleanup code.
-rw-r--r--  python/fatcat_tools/importers/shadow.py  14
1 file changed, 12 insertions, 2 deletions
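
For context, here is a minimal standalone sketch of the cleanup path this commit introduces, assuming simple objects with .rel and .url attributes (the FileUrl stand-in and the helper names below are hypothetical, not the importer's actual API): it rebuilds the truncated "short" wayback form of each incoming URL so stale short URLs left over from the arabesque bug can be dropped, then de-duplicates the merged URL list keyed on the URL string.

# Hypothetical sketch of the cleanup logic in the diff below; not the real
# importer code. FileUrl here is a stand-in for fatcat_openapi_client.FileUrl.
class FileUrl:
    def __init__(self, rel, url):
        self.rel = rel
        self.url = url

def short_wayback_form(url):
    # 'https://web.archive.org/web/20180203123456/http://example.com/a.pdf'
    # -> 'https://web.archive.org/web/201802031234/http://example.com/a.pdf'
    # (timestamp truncated to 12 characters, the form produced by the arabesque bug)
    parts = url.split('/')
    return 'https://web.archive.org/web/{}/{}'.format(
        parts[4][:12], '/'.join(parts[5:]))

def merge_urls(new_urls, existing_urls):
    # drop existing entries whose URL is the short form of an incoming wayback
    # URL; the guard on '://web.archive.org/web/' is an extra safety check not
    # present in the diff, which assumes all incoming URLs are wayback URLs
    short_forms = [short_wayback_form(u.url) for u in new_urls
                   if '://web.archive.org/web/' in u.url]
    kept = [u for u in existing_urls if u.url not in short_forms]
    # de-duplicate by URL string; later entries (the existing ones) win,
    # mirroring the iteration order in the diff
    merged = {}
    for u in new_urls + kept:
        merged[u.url] = u
    return list(merged.values())

Compared with the old approach of building a set of (rel, url) tuples, keying on the URL string means each URL can appear at most once even when two entries disagree on rel.
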
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 261cf888..1a76299e 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -152,10 +152,20 @@ class ShadowLibraryImporter(EntityImporter):
             u = existing.urls[i]
             if u.rel == 'repository' and '://archive.org/download/' in u.url:
                 existing.urls[i].rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug)
+        new_wb_urls = [u.url for u in fe.urls]
+        new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format(
+            u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls]
+        existing.urls = [u for u in existing.urls if not u.url in new_short_wb_urls]
         # merge the existing into this one and update
-        existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
-        existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
         if not existing.extra.get('shadows'):
             existing.extra['shadows'] = fe.extra['shadows']
         else: