diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
commit | 31d1a6a713d177990609767d508209ced19ca396 (patch) | |
tree | a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/cleanups/files.py | |
parent | 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff) | |
download | fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip |
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/cleanups/files.py')
-rw-r--r-- | python/fatcat_tools/cleanups/files.py | 38 |
1 files changed, 21 insertions, 17 deletions
diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py
index 0d275ba6..d378a91f 100644
--- a/python/fatcat_tools/cleanups/files.py
+++ b/python/fatcat_tools/cleanups/files.py
@@ -1,4 +1,3 @@
-
 from fatcat_openapi_client.models import FileEntity
 from fatcat_openapi_client.rest import ApiException
 
@@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner):
 
     def __init__(self, api, **kwargs):
 
-        eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)"
-        eg_extra = kwargs.pop('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner')
-        super().__init__(api,
+        eg_desc = (
+            kwargs.pop("editgroup_description", None)
+            or "Automated cleanup of file entities (eg, remove bad URLs)"
+        )
+        eg_extra = kwargs.pop("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner")
+        super().__init__(
+            api,
             entity_type=FileEntity,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
-            **kwargs)
+            **kwargs
+        )
 
     def clean_entity(self, entity):
         """
@@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner):
         """
 
         # URL has ://web.archive.org/web/None/ link => delete URL
-        entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url]
+        entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]
 
         # URL has ://archive.org/ link with rel=repository => rel=archive
         for u in entity.urls:
-            if '://archive.org/' in u.url and u.rel == 'repository':
-                u.rel = 'archive'
+            if "://archive.org/" in u.url and u.rel == "repository":
+                u.rel = "archive"
 
         # URL has short wayback date ("2017") and another url with that as prefix => delete URL
         stub_wayback_urls = []
         full_wayback_urls = []
         for u in entity.urls:
-            if '://web.archive.org/web/' in u.url:
-                if len(u.url.split('/')[4]) <= 8:
+            if "://web.archive.org/web/" in u.url:
+                if len(u.url.split("/")[4]) <= 8:
                     stub_wayback_urls.append(u.url)
                 else:
-                    full_wayback_urls.append('/'.join(u.url.split('/')[5:]))
+                    full_wayback_urls.append("/".join(u.url.split("/")[5:]))
         for stub in stub_wayback_urls:
-            target = '/'.join(stub.split('/')[5:])
+            target = "/".join(stub.split("/")[5:])
             if target in full_wayback_urls:
                 entity.urls = [u for u in entity.urls if u.url != stub]
 
@@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner):
         except ApiException as err:
             if err.status != 404:
                 raise err
-            self.counts['skip-not-found'] += 1
+            self.counts["skip-not-found"] += 1
             return 0
 
-        if existing.state != 'active':
-            self.counts['skip-existing-inactive'] += 1
+        if existing.state != "active":
+            self.counts["skip-existing-inactive"] += 1
             return 0
         if existing.revision != entity.revision:
-            self.counts['skip-revision'] += 1
+            self.counts["skip-revision"] += 1
             return 0
 
         self.api.update_file(self.get_editgroup_id(), entity.ident, entity)