diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 17 | 
1 files changed, 14 insertions, 3 deletions
| diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 2426c481..ed3cfb2f 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -150,8 +150,8 @@ class MatchedImporter(EntityImporter):          if not existing:              return True -        fe.release_ids = list(set(fe.release_ids + existing.release_ids)) -        if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: +        combined_release_ids = list(set(fe.release_ids + existing.release_ids)) +        if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:              # no new release matches *and* there are already existing URLs              self.counts['exists'] += 1              return False @@ -162,7 +162,18 @@ class MatchedImporter(EntityImporter):          for i in range(len(existing.urls)):              u = existing.urls[i]              if u.rel == 'repository' and '://archive.org/download/' in u.url: -                existing.urls[i].rel == 'archive' +                existing.urls[i].rel = 'archive' + +        # special case: if importing *new* from archive.org arxiv collections, +        # blow away any existing release_id mappings; this is a direct arxiv_id +        # map. This *should* be safe to run in all matched imports. +        is_arxiv = False +        for u in fe.urls: +            if 'archive.org/download/arxiv' in u.url.lower(): +                is_arxiv = True +                break +        if is_arxiv and fe.release_ids: +            existing.release_ids = fe.release_ids          # merge the existing into this one and update          existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) | 
