about summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-06-24 15:04:50 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-06-24 15:04:50 -0700
commit 5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e (patch)
tree 9dd3b3cf585dc54f83b4b161e235ea6248f26675 /python/fatcat_tools/importers
parent d98c3b95a7bb81fd239fd658d83a4d458673ca36 (diff)
download fatcat-5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e.tar.gz
fatcat-5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e.zip
fix typo; do arxiv-specific match import hack
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/matched.py 17
1 files changed, 14 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2426c481..ed3cfb2f 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -150,8 +150,8 @@ class MatchedImporter(EntityImporter):
if not existing:
return True
- fe.release_ids = list(set(fe.release_ids + existing.release_ids))
- if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
+ combined_release_ids = list(set(fe.release_ids + existing.release_ids))
+ if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
# no new release matches *and* there are already existing URLs
self.counts['exists'] += 1
return False
@@ -162,7 +162,18 @@ class MatchedImporter(EntityImporter):
for i in range(len(existing.urls)):
u = existing.urls[i]
if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel == 'archive'
+ existing.urls[i].rel = 'archive'
+
+ # special case: if importing *new* from archive.org arxiv collections,
+ # blow away any existing release_id mappings; this is a direct arxiv_id
+ # map. This *should* be safe to run in all matched imports.
+ is_arxiv = False
+ for u in fe.urls:
+ if 'archive.org/download/arxiv' in u.url.lower():
+ is_arxiv = True
+ break
+ if is_arxiv and fe.release_ids:
+ existing.release_ids = fe.release_ids
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))