Diffstat (limited to 'python/fatcat_tools/cleanups/files.py')
-rw-r--r--  python/fatcat_tools/cleanups/files.py  38
1 file changed, 21 insertions, 17 deletions
diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py
index 0d275ba6..d378a91f 100644
--- a/python/fatcat_tools/cleanups/files.py
+++ b/python/fatcat_tools/cleanups/files.py
@@ -1,4 +1,3 @@
-
 from fatcat_openapi_client.models import FileEntity
 from fatcat_openapi_client.rest import ApiException
 
@@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner):
 
     def __init__(self, api, **kwargs):
 
-        eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)"
-        eg_extra = kwargs.pop('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner')
-        super().__init__(api,
+        eg_desc = (
+            kwargs.pop("editgroup_description", None)
+            or "Automated cleanup of file entities (eg, remove bad URLs)"
+        )
+        eg_extra = kwargs.pop("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner")
+        super().__init__(
+            api,
             entity_type=FileEntity,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
-            **kwargs)
+            **kwargs
+        )
 
     def clean_entity(self, entity):
         """
@@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner):
         """
 
         # URL has ://web.archive.org/web/None/ link => delete URL
-        entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url]
+        entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]
 
         # URL has ://archive.org/ link with rel=repository => rel=archive
         for u in entity.urls:
-            if '://archive.org/' in u.url and u.rel == 'repository':
-                u.rel = 'archive'
+            if "://archive.org/" in u.url and u.rel == "repository":
+                u.rel = "archive"
 
         # URL has short wayback date ("2017") and another url with that as prefix => delete URL
         stub_wayback_urls = []
         full_wayback_urls = []
         for u in entity.urls:
-            if '://web.archive.org/web/' in u.url:
-                if len(u.url.split('/')[4]) <= 8:
+            if "://web.archive.org/web/" in u.url:
+                if len(u.url.split("/")[4]) <= 8:
                     stub_wayback_urls.append(u.url)
                 else:
-                    full_wayback_urls.append('/'.join(u.url.split('/')[5:]))
+                    full_wayback_urls.append("/".join(u.url.split("/")[5:]))
         for stub in stub_wayback_urls:
-            target = '/'.join(stub.split('/')[5:])
+            target = "/".join(stub.split("/")[5:])
             if target in full_wayback_urls:
                 entity.urls = [u for u in entity.urls if u.url != stub]
 
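
The clean_entity() hunk is again a quote-style change only. The three rules it carries: drop URLs containing the broken ://web.archive.org/web/None/ wayback path; re-label archive.org links from rel=repository to rel=archive; and drop "stub" wayback URLs (a date segment of 8 characters or fewer, e.g. "2017") when a full-timestamp capture of the same target URL is also present. Here is a self-contained sketch of those rules, using a small dataclass in place of the fatcat URL model; the example URLs are illustrative.

# Sketch of the URL cleanup rules in clean_entity() above, with a simple
# dataclass instead of the fatcat FileUrl model.
from dataclasses import dataclass
from typing import List


@dataclass
class Url:
    url: str
    rel: str


def clean_urls(urls: List[Url]) -> List[Url]:
    # drop broken ://web.archive.org/web/None/ links
    urls = [u for u in urls if "://web.archive.org/web/None/" not in u.url]
    # archive.org links marked rel=repository become rel=archive
    for u in urls:
        if "://archive.org/" in u.url and u.rel == "repository":
            u.rel = "archive"
    # drop short-date ("2017") wayback stubs when a full-timestamp capture
    # of the same target URL is also present
    stub_wayback_urls = []
    full_wayback_urls = []
    for u in urls:
        if "://web.archive.org/web/" in u.url:
            if len(u.url.split("/")[4]) <= 8:
                stub_wayback_urls.append(u.url)
            else:
                full_wayback_urls.append("/".join(u.url.split("/")[5:]))
    for stub in stub_wayback_urls:
        target = "/".join(stub.split("/")[5:])
        if target in full_wayback_urls:
            urls = [u for u in urls if u.url != stub]
    return urls


cleaned = clean_urls([
    Url("https://web.archive.org/web/2017/http://example.com/paper.pdf", "webarchive"),
    Url("https://web.archive.org/web/20170615010101/http://example.com/paper.pdf", "webarchive"),
    Url("https://archive.org/download/foo/paper.pdf", "repository"),
])
assert len(cleaned) == 2 and cleaned[1].rel == "archive"

In a wayback URL, split("/")[4] picks out the timestamp segment and [5:] re-joins the original target URL that follows it, which is why the stub comparison works on that joined suffix.
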
@@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner):
         except ApiException as err:
             if err.status != 404:
                 raise err
-            self.counts['skip-not-found'] += 1
+            self.counts["skip-not-found"] += 1
             return 0
 
-        if existing.state != 'active':
-            self.counts['skip-existing-inactive'] += 1
+        if existing.state != "active":
+            self.counts["skip-existing-inactive"] += 1
             return 0
         if existing.revision != entity.revision:
-            self.counts['skip-revision'] += 1
+            self.counts["skip-revision"] += 1
             return 0
 
         self.api.update_file(self.get_editgroup_id(), entity.ident, entity)
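
try_update() is likewise untouched apart from quoting: it fetches the current file entity, tallies a skip reason in self.counts when the entity is missing, inactive, or at a different revision, and only then calls update_file within the current editgroup. A rough sketch of that guard-clause flow follows, with a hypothetical api stand-in; the real method re-raises any ApiException whose status is not 404, which the None check below merely approximates.

# Sketch only: get_file() returning None stands in for the 404 branch that
# the real code handles via ApiException; counts uses a Counter for tallies.
from collections import Counter


class DemoUpdater:
    def __init__(self, api):
        self.api = api
        self.counts = Counter()

    def try_update(self, entity, editgroup_id):
        existing = self.api.get_file(entity.ident)
        if existing is None:  # stand-in for the 404 / not-found case
            self.counts["skip-not-found"] += 1
            return 0
        if existing.state != "active":
            self.counts["skip-existing-inactive"] += 1
            return 0
        if existing.revision != entity.revision:
            self.counts["skip-revision"] += 1
            return 0
        self.api.update_file(editgroup_id, entity.ident, entity)
        return 1
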