aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/cleanups
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
commit 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/cleanups
parent 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
download fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/cleanups')
-rw-r--r-- python/fatcat_tools/cleanups/__init__.py 1
-rw-r--r-- python/fatcat_tools/cleanups/common.py 45
-rw-r--r-- python/fatcat_tools/cleanups/files.py 38
3 files changed, 45 insertions, 39 deletions
diff --git a/python/fatcat_tools/cleanups/__init__.py b/python/fatcat_tools/cleanups/__init__.py
index 587c7b9b..0aeec977 100644
--- a/python/fatcat_tools/cleanups/__init__.py
+++ b/python/fatcat_tools/cleanups/__init__.py
@@ -1,3 +1,2 @@
-
from .common import EntityCleaner
from .files import FileCleaner
diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py
index d0fcc761..26ca7bd6 100644
--- a/python/fatcat_tools/cleanups/common.py
+++ b/python/fatcat_tools/cleanups/common.py
@@ -1,4 +1,3 @@
-
import copy
import json
import subprocess
@@ -30,16 +29,19 @@ class EntityCleaner:
def __init__(self, api, entity_type, **kwargs):
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['git_rev'] = eg_extra.get('git_rev',
- subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["git_rev"] = eg_extra.get(
+ "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+ ).decode("utf-8")
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner")
self.api = api
self.entity_type = entity_type
- self.dry_run_mode = kwargs.get('dry_run_mode', True)
- self.edit_batch_size = kwargs.get('edit_batch_size', 50)
- self.editgroup_description = kwargs.get('editgroup_description', "Generic Entity Cleaner Bot")
+ self.dry_run_mode = kwargs.get("dry_run_mode", True)
+ self.edit_batch_size = kwargs.get("edit_batch_size", 50)
+ self.editgroup_description = kwargs.get(
+ "editgroup_description", "Generic Entity Cleaner Bot"
+ )
self.editgroup_extra = eg_extra
self.reset()
self.ac = ApiClient()
@@ -48,7 +50,7 @@ class EntityCleaner:
print("Running in dry-run mode!")
def reset(self):
- self.counts = Counter({'lines': 0, 'cleaned': 0, 'updated': 0})
+ self.counts = Counter({"lines": 0, "cleaned": 0, "updated": 0})
self._edit_count = 0
self._editgroup_id = None
self._entity_queue = []
@@ -63,23 +65,23 @@ class EntityCleaner:
Returns nothing.
"""
- self.counts['lines'] += 1
- if (not record):
- self.counts['skip-null'] += 1
+ self.counts["lines"] += 1
+ if not record:
+ self.counts["skip-null"] += 1
return
entity = entity_from_dict(record, self.entity_type, api_client=self.ac)
- if entity.state != 'active':
- self.counts['skip-inactive'] += 1
+ if entity.state != "active":
+ self.counts["skip-inactive"] += 1
return
cleaned = self.clean_entity(copy.deepcopy(entity))
if entity == cleaned:
- self.counts['skip-clean'] += 1
+ self.counts["skip-clean"] += 1
return
else:
- self.counts['cleaned'] += 1
+ self.counts["cleaned"] += 1
if self.dry_run_mode:
entity_dict = entity_to_dict(entity, api_client=self.ac)
@@ -87,11 +89,13 @@ class EntityCleaner:
return
if entity.ident in self._idents_inflight:
- raise ValueError("Entity already part of in-process update: {}".format(entity.ident))
+ raise ValueError(
+ "Entity already part of in-process update: {}".format(entity.ident)
+ )
updated = self.try_update(cleaned)
if updated:
- self.counts['updated'] += updated
+ self.counts["updated"] += updated
self._edit_count += updated
self._idents_inflight.append(entity.ident)
@@ -132,9 +136,8 @@ class EntityCleaner:
if not self._editgroup_id:
eg = self.api.create_editgroup(
- Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ Editgroup(description=self.editgroup_description, extra=self.editgroup_extra)
+ )
self._editgroup_id = eg.editgroup_id
return self._editgroup_id
diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py
index 0d275ba6..d378a91f 100644
--- a/python/fatcat_tools/cleanups/files.py
+++ b/python/fatcat_tools/cleanups/files.py
@@ -1,4 +1,3 @@
-
from fatcat_openapi_client.models import FileEntity
from fatcat_openapi_client.rest import ApiException
@@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner')
- super().__init__(api,
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Automated cleanup of file entities (eg, remove bad URLs)"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner")
+ super().__init__(
+ api,
entity_type=FileEntity,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
def clean_entity(self, entity):
"""
@@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner):
"""
# URL has ://web.archive.org/web/None/ link => delete URL
- entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url]
+ entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]
# URL has ://archive.org/ link with rel=repository => rel=archive
for u in entity.urls:
- if '://archive.org/' in u.url and u.rel == 'repository':
- u.rel = 'archive'
+ if "://archive.org/" in u.url and u.rel == "repository":
+ u.rel = "archive"
# URL has short wayback date ("2017") and another url with that as prefix => delete URL
stub_wayback_urls = []
full_wayback_urls = []
for u in entity.urls:
- if '://web.archive.org/web/' in u.url:
- if len(u.url.split('/')[4]) <= 8:
+ if "://web.archive.org/web/" in u.url:
+ if len(u.url.split("/")[4]) <= 8:
stub_wayback_urls.append(u.url)
else:
- full_wayback_urls.append('/'.join(u.url.split('/')[5:]))
+ full_wayback_urls.append("/".join(u.url.split("/")[5:]))
for stub in stub_wayback_urls:
- target = '/'.join(stub.split('/')[5:])
+ target = "/".join(stub.split("/")[5:])
if target in full_wayback_urls:
entity.urls = [u for u in entity.urls if u.url != stub]
@@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner):
except ApiException as err:
if err.status != 404:
raise err
- self.counts['skip-not-found'] += 1
+ self.counts["skip-not-found"] += 1
return 0
- if existing.state != 'active':
- self.counts['skip-existing-inactive'] += 1
+ if existing.state != "active":
+ self.counts["skip-existing-inactive"] += 1
return 0
if existing.revision != entity.revision:
- self.counts['skip-revision'] += 1
+ self.counts["skip-revision"] += 1
return 0
self.api.update_file(self.get_editgroup_id(), entity.ident, entity)