from fatcat_openapi_client.rest import ApiException
from fatcat_openapi_client.models import FileEntity
from fatcat_tools.transforms import entity_to_dict, entity_from_json

from .common import EntityCleaner


class FileCleaner(EntityCleaner):
    """
    File fixups!

    Cleans up the URL list of file entities (see `clean_entity`) and submits
    the cleaned entity as an update via the API client (see `try_update`).
    """

    def __init__(self, api, **kwargs):
        """
        Set up the cleaner with file-specific editgroup defaults.

        `editgroup_description` and `editgroup_extra` may be supplied as
        keyword arguments; otherwise defaults are used. An 'agent' key is
        added to the editgroup extra metadata unless the caller already set
        one. All remaining keyword arguments are forwarded to the parent
        class unchanged.
        """
        eg_desc = kwargs.pop(
            'editgroup_description',
            "Automated cleanup of file entities (eg, remove bad URLs)",
        )
        eg_extra = kwargs.pop('editgroup_extra', dict())
        # Only fill in the agent when the caller did not provide one
        eg_extra.setdefault('agent', 'fatcat_tools.FileCleaner')
        super().__init__(
            api,
            entity_type=FileEntity,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs
        )

    def clean_entity(self, entity):
        """
        Apply URL fixups to a file entity, in place, and return the entity.

        Fixups, in order:

        - drop URLs containing '://web.archive.org/web/None/'
        - change rel 'repository' to 'archive' for '://archive.org/' URLs
        - drop wayback URLs with a short timestamp (8 characters or fewer)
          when another wayback URL with a longer timestamp points at the
          same target

        Entities with no URLs (None or empty list) are returned untouched.

        TODO: mimetype is bogus like (???) => clean mimetype
        """
        # Nothing to fix; also avoids iterating over None
        if not entity.urls:
            return entity

        # URL has ://web.archive.org/web/None/ link => delete URL
        entity.urls = [
            u for u in entity.urls
            if '://web.archive.org/web/None/' not in u.url
        ]

        # URL has ://archive.org/ link with rel=repository => rel=archive
        for u in entity.urls:
            if '://archive.org/' in u.url and u.rel == 'repository':
                u.rel = 'archive'

        # URL has short wayback date ("2017") and another url with that as prefix => delete URL
        stub_wayback_urls = []
        full_wayback_targets = set()
        for u in entity.urls:
            if '://web.archive.org/web/' not in u.url:
                continue
            # For "scheme://web.archive.org/web/<timestamp>/<target...>",
            # segment 4 is the timestamp and segments 5+ are the target URL
            # (assumes the wayback prefix is at the start of the URL)
            segments = u.url.split('/')
            if len(segments[4]) <= 8:
                stub_wayback_urls.append(u.url)
            else:
                full_wayback_targets.add('/'.join(segments[5:]))

        # A stub is redundant when a full-timestamp capture of the same
        # target exists
        redundant_stubs = {
            stub for stub in stub_wayback_urls
            if '/'.join(stub.split('/')[5:]) in full_wayback_targets
        }
        if redundant_stubs:
            entity.urls = [
                u for u in entity.urls if u.url not in redundant_stubs
            ]
        return entity

    def try_update(self, entity):
        """
        Submit `entity` as an update, unless the current state in the
        catalog makes that unsafe.

        Returns 1 if an update was submitted, 0 if it was skipped. Skips are
        tallied in `self.counts` under 'skip-not-found' (API returned 404),
        'skip-existing-inactive' (current state is not 'active'), and
        'skip-revision' (current revision differs from the entity's).

        Raises ApiException for any lookup failure other than a 404.
        """
        try:
            existing = self.api.get_file(entity.ident)
        except ApiException as err:
            if err.status != 404:
                raise
            self.counts['skip-not-found'] += 1
            return 0

        if existing.state != 'active':
            self.counts['skip-existing-inactive'] += 1
            return 0
        # Entity changed since the cleaned copy was derived; don't clobber it
        if existing.revision != entity.revision:
            self.counts['skip-revision'] += 1
            return 0

        # Submit through the API client, like the get_file() lookup above
        self.api.update_file(self.get_editgroup_id(), entity.ident, entity)
        return 1