summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/matched.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-04-23 18:38:32 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-04-23 18:38:32 -0700
commit: 09c584774b242374625d481ae043e8162d94ab52 (patch)
tree: 8d7444f03f940eea66ddd81871ed9da37478381a /python/fatcat_tools/importers/matched.py
parent: 0bac91e6b124aff9e722f206d58e72c7c4ad861b (diff)
download: fatcat-09c584774b242374625d481ae043e8162d94ab52.tar.gz
          fatcat-09c584774b242374625d481ae043e8162d94ab52.zip
add limits to match importers
Diffstat (limited to 'python/fatcat_tools/importers/matched.py')
-rw-r--r-- python/fatcat_tools/importers/matched.py | 14
1 file changed, 13 insertions, 1 deletion
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5096cc97..0afd00e3 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
class MatchedImporter(EntityImporter):
@@ -68,6 +68,9 @@ class MatchedImporter(EntityImporter):
if len(release_ids) == 0:
self.counts['skip-no-doi'] += 1
return None
+ if len(release_ids) > SANE_MAX_RELEASES:
+ self.counts['skip-too-many-dois'] += 1
+ return None
# parse URLs and CDX
urls = set()
@@ -89,6 +92,9 @@ class MatchedImporter(EntityImporter):
if len(urls) == 0:
self.counts['skip-no-urls'] += 1
return None
+ if len(urls) > SANE_MAX_URLS:
+ self.counts['skip-too-many-urls'] += 1
+ return None
size = obj.get('size')
if size:
@@ -126,7 +132,13 @@ class MatchedImporter(EntityImporter):
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+ if len(existing.urls) > SANE_MAX_URLS:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
existing.md5 = existing.md5 or fe.md5