aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-04-23 18:38:32 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-04-23 18:38:32 -0700
commit: 09c584774b242374625d481ae043e8162d94ab52 (patch)
tree: 8d7444f03f940eea66ddd81871ed9da37478381a /python/fatcat_tools/importers
parent: 0bac91e6b124aff9e722f206d58e72c7c4ad861b (diff)
download: fatcat-09c584774b242374625d481ae043e8162d94ab52.tar.gz
download: fatcat-09c584774b242374625d481ae043e8162d94ab52.zip
add limits to match importers
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 12
-rw-r--r-- python/fatcat_tools/importers/common.py 3
-rw-r--r-- python/fatcat_tools/importers/matched.py 14
3 files changed, 27 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 4353795a..7838a7ff 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -5,7 +5,7 @@ import base64
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
def b32_hex(s):
@@ -130,6 +130,10 @@ class ArabesqueMatchImporter(EntityImporter):
urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
+ if len(urls) > SANE_MAX_URLS:
+ self.counts['skip-too-many-url'] += 1
+ return None
+
fe = fatcat_client.FileEntity(
sha1=b32_hex(row['final_sha1']),
mimetype=row['final_mimetype'],
@@ -174,7 +178,13 @@ class ArabesqueMatchImporter(EntityImporter):
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+ if len(existing.urls) > SANE_MAX_URLS:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.mimetype = existing.mimetype or fe.mimetype
edit = self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id())
self._edits_inflight.append(edit)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index beec99df..d291aeec 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -14,6 +14,9 @@ import fatcat_client
from fatcat_client.rest import ApiException
+SANE_MAX_RELEASES = 200
+SANE_MAX_URLS = 100
+
def clean(thing, force_xml=False):
"""
This function is appropriate to be called on any random, non-markup string,
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5096cc97..0afd00e3 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
class MatchedImporter(EntityImporter):
@@ -68,6 +68,9 @@ class MatchedImporter(EntityImporter):
if len(release_ids) == 0:
self.counts['skip-no-doi'] += 1
return None
+ if len(release_ids) > SANE_MAX_RELEASES:
+ self.counts['skip-too-many-dois'] += 1
+ return None
# parse URLs and CDX
urls = set()
@@ -89,6 +92,9 @@ class MatchedImporter(EntityImporter):
if len(urls) == 0:
self.counts['skip-no-urls'] += 1
return None
+ if len(urls) > SANE_MAX_URLS:
+ self.counts['skip-too-many-urls'] += 1
+ return None
size = obj.get('size')
if size:
@@ -126,7 +132,13 @@ class MatchedImporter(EntityImporter):
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+ if len(existing.urls) > SANE_MAX_URLS:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
existing.md5 = existing.md5 or fe.md5