diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-24 15:04:50 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-24 15:04:50 -0700 |
commit | 5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e (patch) | |
tree | 9dd3b3cf585dc54f83b4b161e235ea6248f26675 /python/fatcat_tools | |
parent | d98c3b95a7bb81fd239fd658d83a4d458673ca36 (diff) | |
download | fatcat-5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e.tar.gz fatcat-5285d4d1e2cea5dc8a0c57cc5bbf81a65d19163e.zip |
fix typo; do arxiv-specific match import hack
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 17 |
1 file changed, 14 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2426c481..ed3cfb2f 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -150,8 +150,8 @@ class MatchedImporter(EntityImporter):
         if not existing:
             return True
 
-        fe.release_ids = list(set(fe.release_ids + existing.release_ids))
-        if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
+        combined_release_ids = list(set(fe.release_ids + existing.release_ids))
+        if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
             # no new release matches *and* there are already existing URLs
             self.counts['exists'] += 1
             return False
@@ -162,7 +162,18 @@ class MatchedImporter(EntityImporter):
         for i in range(len(existing.urls)):
             u = existing.urls[i]
             if u.rel == 'repository' and '://archive.org/download/' in u.url:
-                existing.urls[i].rel == 'archive'
+                existing.urls[i].rel = 'archive'
+
+        # special case: if importing *new* from archive.org arxiv collections,
+        # blow away any existing release_id mappings; this is a direct arxiv_id
+        # map. This *should* be safe to run in all matched imports.
+        is_arxiv = False
+        for u in fe.urls:
+            if 'archive.org/download/arxiv' in u.url.lower():
+                is_arxiv = True
+                break
+        if is_arxiv and fe.release_ids:
+            existing.release_ids = fe.release_ids
 
         # merge the existing into this one and update
         existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))