diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
commit | 31d1a6a713d177990609767d508209ced19ca396 (patch) | |
tree | a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/cleanups | |
parent | 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff) | |
download | fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip |
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/cleanups')
-rw-r--r-- | python/fatcat_tools/cleanups/__init__.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/cleanups/common.py | 45 | ||||
-rw-r--r-- | python/fatcat_tools/cleanups/files.py | 38 |
3 files changed, 45 insertions, 39 deletions
diff --git a/python/fatcat_tools/cleanups/__init__.py b/python/fatcat_tools/cleanups/__init__.py index 587c7b9b..0aeec977 100644 --- a/python/fatcat_tools/cleanups/__init__.py +++ b/python/fatcat_tools/cleanups/__init__.py @@ -1,3 +1,2 @@ - from .common import EntityCleaner from .files import FileCleaner diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py index d0fcc761..26ca7bd6 100644 --- a/python/fatcat_tools/cleanups/common.py +++ b/python/fatcat_tools/cleanups/common.py @@ -1,4 +1,3 @@ - import copy import json import subprocess @@ -30,16 +29,19 @@ class EntityCleaner: def __init__(self, api, entity_type, **kwargs): - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['git_rev'] = eg_extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["git_rev"] = eg_extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner") self.api = api self.entity_type = entity_type - self.dry_run_mode = kwargs.get('dry_run_mode', True) - self.edit_batch_size = kwargs.get('edit_batch_size', 50) - self.editgroup_description = kwargs.get('editgroup_description', "Generic Entity Cleaner Bot") + self.dry_run_mode = kwargs.get("dry_run_mode", True) + self.edit_batch_size = kwargs.get("edit_batch_size", 50) + self.editgroup_description = kwargs.get( + "editgroup_description", "Generic Entity Cleaner Bot" + ) self.editgroup_extra = eg_extra self.reset() self.ac = ApiClient() @@ -48,7 +50,7 @@ class EntityCleaner: print("Running in dry-run mode!") def reset(self): - self.counts = Counter({'lines': 0, 'cleaned': 0, 'updated': 0}) + self.counts = Counter({"lines": 0, "cleaned": 0, "updated": 0}) self._edit_count = 0 self._editgroup_id = None self._entity_queue = [] @@ -63,23 +65,23 @@ class EntityCleaner: Returns nothing. """ - self.counts['lines'] += 1 - if (not record): - self.counts['skip-null'] += 1 + self.counts["lines"] += 1 + if not record: + self.counts["skip-null"] += 1 return entity = entity_from_dict(record, self.entity_type, api_client=self.ac) - if entity.state != 'active': - self.counts['skip-inactive'] += 1 + if entity.state != "active": + self.counts["skip-inactive"] += 1 return cleaned = self.clean_entity(copy.deepcopy(entity)) if entity == cleaned: - self.counts['skip-clean'] += 1 + self.counts["skip-clean"] += 1 return else: - self.counts['cleaned'] += 1 + self.counts["cleaned"] += 1 if self.dry_run_mode: entity_dict = entity_to_dict(entity, api_client=self.ac) @@ -87,11 +89,13 @@ class EntityCleaner: return if entity.ident in self._idents_inflight: - raise ValueError("Entity already part of in-process update: {}".format(entity.ident)) + raise ValueError( + "Entity already part of in-process update: {}".format(entity.ident) + ) updated = self.try_update(cleaned) if updated: - self.counts['updated'] += updated + self.counts["updated"] += updated self._edit_count += updated self._idents_inflight.append(entity.ident) @@ -132,9 +136,8 @@ class EntityCleaner: if not self._editgroup_id: eg = self.api.create_editgroup( - Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + Editgroup(description=self.editgroup_description, extra=self.editgroup_extra) + ) self._editgroup_id = eg.editgroup_id return self._editgroup_id diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index 0d275ba6..d378a91f 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,4 +1,3 @@ - from fatcat_openapi_client.models import FileEntity from fatcat_openapi_client.rest import ApiException @@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner') - super().__init__(api, + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Automated cleanup of file entities (eg, remove bad URLs)" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner") + super().__init__( + api, entity_type=FileEntity, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) def clean_entity(self, entity): """ @@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner): """ # URL has ://web.archive.org/web/None/ link => delete URL - entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url] + entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url] # URL has ://archive.org/ link with rel=repository => rel=archive for u in entity.urls: - if '://archive.org/' in u.url and u.rel == 'repository': - u.rel = 'archive' + if "://archive.org/" in u.url and u.rel == "repository": + u.rel = "archive" # URL has short wayback date ("2017") and another url with that as prefix => delete URL stub_wayback_urls = [] full_wayback_urls = [] for u in entity.urls: - if '://web.archive.org/web/' in u.url: - if len(u.url.split('/')[4]) <= 8: + if "://web.archive.org/web/" in u.url: + if len(u.url.split("/")[4]) <= 8: stub_wayback_urls.append(u.url) else: - full_wayback_urls.append('/'.join(u.url.split('/')[5:])) + full_wayback_urls.append("/".join(u.url.split("/")[5:])) for stub in stub_wayback_urls: - target = '/'.join(stub.split('/')[5:]) + target = "/".join(stub.split("/")[5:]) if target in full_wayback_urls: entity.urls = [u for u in entity.urls if u.url != stub] @@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner): except ApiException as err: if err.status != 404: raise err - self.counts['skip-not-found'] += 1 + self.counts["skip-not-found"] += 1 return 0 - if existing.state != 'active': - self.counts['skip-existing-inactive'] += 1 + if existing.state != "active": + self.counts["skip-existing-inactive"] += 1 return 0 if existing.revision != entity.revision: - self.counts['skip-revision'] += 1 + self.counts["skip-revision"] += 1 return 0 self.api.update_file(self.get_editgroup_id(), entity.ident, entity) |