diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 12 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 14 |
3 files changed, 27 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 4353795a..7838a7ff 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -5,7 +5,7 @@ import base64 import sqlite3 import itertools import fatcat_client -from .common import EntityImporter, clean, make_rel_url +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS def b32_hex(s): @@ -130,6 +130,10 @@ class ArabesqueMatchImporter(EntityImporter): urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls] + if len(urls) > SANE_MAX_URLS: + self.counts['skip-too-many-url'] += 1 + return None + fe = fatcat_client.FileEntity( sha1=b32_hex(row['final_sha1']), mimetype=row['final_mimetype'], @@ -174,7 +178,13 @@ class ArabesqueMatchImporter(EntityImporter): # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + if len(existing.urls) > SANE_MAX_URLS: + self.counts['skip-update-too-many-url'] += 1 + return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + if len(existing.release_ids) > SANE_MAX_RELEASES: + self.counts['skip-update-too-many-url'] += 1 + return None existing.mimetype = existing.mimetype or fe.mimetype edit = self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id()) self._edits_inflight.append(edit) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index beec99df..d291aeec 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -14,6 +14,9 @@ import fatcat_client from fatcat_client.rest import ApiException +SANE_MAX_RELEASES = 200 +SANE_MAX_URLS = 100 + def clean(thing, force_xml=False): """ This function is appropriate to be called on any random, non-markup string, diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 5096cc97..0afd00e3 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json import sqlite3 import itertools import fatcat_client -from .common import EntityImporter, clean, make_rel_url +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS class MatchedImporter(EntityImporter): @@ -68,6 +68,9 @@ class MatchedImporter(EntityImporter): if len(release_ids) == 0: self.counts['skip-no-doi'] += 1 return None + if len(release_ids) > SANE_MAX_RELEASES: + self.counts['skip-too-many-dois'] += 1 + return None # parse URLs and CDX urls = set() @@ -89,6 +92,9 @@ class MatchedImporter(EntityImporter): if len(urls) == 0: self.counts['skip-no-urls'] += 1 return None + if len(urls) > SANE_MAX_URLS: + self.counts['skip-too-many-urls'] += 1 + return None size = obj.get('size') if size: @@ -126,7 +132,13 @@ class MatchedImporter(EntityImporter): # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + if len(existing.urls) > SANE_MAX_URLS: + self.counts['skip-update-too-many-url'] += 1 + return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + if len(existing.release_ids) > SANE_MAX_RELEASES: + self.counts['skip-update-too-many-url'] += 1 + return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size existing.md5 = existing.md5 or fe.md5 |