# Origin: commit 76db7f4048116a23c82bdd70bb11dd004e347e8e
# ("new cleanup python tool/framework", Bryan Newbold, 2019-10-08),
# file python/fatcat_tools/cleanups/files.py

from fatcat_openapi_client.rest import ApiException
from fatcat_openapi_client.models import FileEntity
from fatcat_tools.transforms import entity_to_dict, entity_from_json

from .common import EntityCleaner


class FileCleaner(EntityCleaner):
    """
    File fixups!

    Cleans up the URL list of file entities (see clean_entity()) and submits
    the cleaned entity through the API client (see try_update()).
    """

    def __init__(self, api, **kwargs):
        """
        api: API client object; passed through to the parent class.
        kwargs: 'editgroup_description' and 'editgroup_extra' are given
            defaults here; everything else is forwarded unchanged.
        """

        eg_desc = kwargs.pop('editgroup_description',
            "Automated cleanup of file entities (eg, remove bad URLs)")
        # work on a copy so the caller's dict is not mutated when the default
        # 'agent' is filled in; also tolerates an explicit None
        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
        eg_extra.setdefault('agent', 'fatcat_tools.FileCleaner')
        super().__init__(api,
            entity_type=FileEntity,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

    def clean_entity(self, entity):
        """
        Applies URL fixups to a file entity and returns the same entity
        object (entity.urls is replaced, and URL objects may have their 'rel'
        changed in place).

        Fixups, in order:

        - URL has ://web.archive.org/web/None/ link => delete URL
        - URL has ://archive.org/ link with rel=repository => rel=archive
        - URL has short wayback date ("2017") and another wayback URL with a
          longer date points at the same target => delete the short one

        An entity with no URLs (None or empty list) is returned untouched.

        TODO: mimetype is bogus like (???) => clean mimetype
        """

        if not entity.urls:
            return entity

        # URL has ://web.archive.org/web/None/ link => delete URL
        urls = [u for u in entity.urls
                if '://web.archive.org/web/None/' not in u.url]

        # URL has ://archive.org/ link with rel=repository => rel=archive
        for u in urls:
            if '://archive.org/' in u.url and u.rel == 'repository':
                u.rel = 'archive'

        # URL has short wayback date ("2017") and another url with that as
        # prefix => delete URL
        stub_wayback_urls = []      # (full URL, target) for short-date links
        full_wayback_targets = set()
        for u in urls:
            if '://web.archive.org/web/' not in u.url:
                continue
            # for "scheme://web.archive.org/web/<date>/<target...>" the
            # 5th '/'-separated segment is the date; the rest is the target
            segments = u.url.split('/')
            target = '/'.join(segments[5:])
            if len(segments[4]) <= 8:
                stub_wayback_urls.append((u.url, target))
            else:
                full_wayback_targets.add(target)

        redundant = {stub for (stub, target) in stub_wayback_urls
                     if target in full_wayback_targets}
        entity.urls = [u for u in urls if u.url not in redundant]

        return entity

    def try_update(self, entity):
        """
        Submits the (already cleaned) entity as an update, unless the current
        version fetched from the API makes that unsafe.

        Returns 1 if an update was submitted, 0 if it was skipped. Skips are
        tallied in self.counts under 'skip-not-found' (API returned 404),
        'skip-existing-inactive' (state is not 'active') and 'skip-revision'
        (the entity was changed since the version we cleaned).

        Raises ApiException for any API error other than 404 on the lookup.
        """

        try:
            existing = self.api.get_file(entity.ident)
        except ApiException as err:
            if err.status != 404:
                # bare raise keeps the original traceback
                raise
            self.counts['skip-not-found'] += 1
            return 0

        if existing.state != 'active':
            self.counts['skip-existing-inactive'] += 1
            return 0
        if existing.revision != entity.revision:
            self.counts['skip-revision'] += 1
            return 0

        # Go through the API client, same as the get_file() call above; this
        # class defines no update_file() of its own.
        # NOTE(review): confirm the parent class does not provide (and expect
        # use of) its own update_file() wrapper.
        self.api.update_file(self.get_editgroup_id(), entity.ident, entity)
        return 1