diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-23 18:38:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-23 18:38:32 -0700 |
commit | 09c584774b242374625d481ae043e8162d94ab52 (patch) | |
tree | 8d7444f03f940eea66ddd81871ed9da37478381a /python/fatcat_tools/importers/matched.py | |
parent | 0bac91e6b124aff9e722f206d58e72c7c4ad861b (diff) | |
download | fatcat-09c584774b242374625d481ae043e8162d94ab52.tar.gz fatcat-09c584774b242374625d481ae043e8162d94ab52.zip |
add limits to match importers
Diffstat (limited to 'python/fatcat_tools/importers/matched.py')
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 14 |
1 file changed, 13 insertions, 1 deletion
```diff
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5096cc97..0afd00e3 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
 import sqlite3
 import itertools
 import fatcat_client
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
 
 
 class MatchedImporter(EntityImporter):
@@ -68,6 +68,9 @@ class MatchedImporter(EntityImporter):
         if len(release_ids) == 0:
             self.counts['skip-no-doi'] += 1
             return None
+        if len(release_ids) > SANE_MAX_RELEASES:
+            self.counts['skip-too-many-dois'] += 1
+            return None
 
         # parse URLs and CDX
         urls = set()
@@ -89,6 +92,9 @@ class MatchedImporter(EntityImporter):
         if len(urls) == 0:
             self.counts['skip-no-urls'] += 1
             return None
+        if len(urls) > SANE_MAX_URLS:
+            self.counts['skip-too-many-urls'] += 1
+            return None
 
         size = obj.get('size')
         if size:
@@ -126,7 +132,13 @@ class MatchedImporter(EntityImporter):
         # merge the existing into this one and update
         existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
         existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+        if len(existing.urls) > SANE_MAX_URLS:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        if len(existing.release_ids) > SANE_MAX_RELEASES:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
         existing.mimetype = existing.mimetype or fe.mimetype
         existing.size = existing.size or fe.size
         existing.md5 = existing.md5 or fe.md5
```