diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 12 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 3 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 14 | 
3 files changed, 27 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 4353795a..7838a7ff 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -5,7 +5,7 @@ import base64  import sqlite3  import itertools  import fatcat_client -from .common import EntityImporter, clean, make_rel_url +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS  def b32_hex(s): @@ -130,6 +130,10 @@ class ArabesqueMatchImporter(EntityImporter):          urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls] +        if len(urls) > SANE_MAX_URLS: +            self.counts['skip-too-many-url'] += 1 +            return None +          fe = fatcat_client.FileEntity(              sha1=b32_hex(row['final_sha1']),              mimetype=row['final_mimetype'], @@ -174,7 +178,13 @@ class ArabesqueMatchImporter(EntityImporter):          # merge the existing into this one and update          existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))          existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] +        if len(existing.urls) > SANE_MAX_URLS: +            self.counts['skip-update-too-many-url'] += 1 +            return None          existing.release_ids = list(set(fe.release_ids + existing.release_ids)) +        if len(existing.release_ids) > SANE_MAX_RELEASES: +            self.counts['skip-update-too-many-url'] += 1 +            return None          existing.mimetype = existing.mimetype or fe.mimetype          edit = self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id())          self._edits_inflight.append(edit) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index beec99df..d291aeec 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -14,6 +14,9 @@ import fatcat_client  from fatcat_client.rest import ApiException +SANE_MAX_RELEASES = 200 +SANE_MAX_URLS = 100 +  def clean(thing, force_xml=False):      """      This function is appropriate to be called on any random, non-markup string, diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 5096cc97..0afd00e3 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json  import sqlite3  import itertools  import fatcat_client -from .common import EntityImporter, clean, make_rel_url +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS  class MatchedImporter(EntityImporter): @@ -68,6 +68,9 @@ class MatchedImporter(EntityImporter):          if len(release_ids) == 0:              self.counts['skip-no-doi'] += 1              return None +        if len(release_ids) > SANE_MAX_RELEASES: +            self.counts['skip-too-many-dois'] += 1 +            return None          # parse URLs and CDX          urls = set() @@ -89,6 +92,9 @@ class MatchedImporter(EntityImporter):          if len(urls) == 0:              self.counts['skip-no-urls'] += 1              return None +        if len(urls) > SANE_MAX_URLS: +            self.counts['skip-too-many-urls'] += 1 +            return None          size = obj.get('size')          if size: @@ -126,7 +132,13 @@ class MatchedImporter(EntityImporter):          # merge the existing into this one and update          existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))          existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] +        if len(existing.urls) > SANE_MAX_URLS: +            self.counts['skip-update-too-many-url'] += 1 +            return None          existing.release_ids = list(set(fe.release_ids + existing.release_ids)) +        if len(existing.release_ids) > SANE_MAX_RELEASES: +            self.counts['skip-update-too-many-url'] += 1 +            return None          existing.mimetype = existing.mimetype or fe.mimetype          existing.size = existing.size or fe.size          existing.md5 = existing.md5 or fe.md5  | 
