path: root/python/fatcat_tools/cleanups/files.py

from fatcat_openapi_client.rest import ApiException
from fatcat_openapi_client.models import FileEntity
from fatcat_tools.transforms import entity_to_dict, entity_from_json

from .common import EntityCleaner


class FileCleaner(EntityCleaner):
    """
    File fixups!
    """

    def __init__(self, api, **kwargs):

        eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)"
        eg_extra = kwargs.pop('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner')
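        # with no overrides, the resulting editgroup metadata is roughly:
        #   description: "Automated cleanup of file entities (eg, remove bad URLs)"
        #   extra: {"agent": "fatcat_tools.FileCleaner"}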
        super().__init__(api,
            entity_type=FileEntity,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

    def clean_entity(self, entity):
        """
        TODO: mimetype is bogus like (???) => clean mimetype
        """

        # URL has ://web.archive.org/web/None/ link => delete URL
        entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url]

        # URL has ://archive.org/ link with rel=repository => rel=archive
        for u in entity.urls:
            if '://archive.org/' in u.url and u.rel == 'repository':
                u.rel = 'archive'

        # URL has short wayback date ("2017") and another url with that as prefix => delete URL
        stub_wayback_urls = []
        full_wayback_urls = []
        for u in entity.urls:
            if '://web.archive.org/web/' in u.url:
                # the wayback timestamp is the fifth '/'-separated component;
                # full timestamps are 14 digits, so 8 or fewer is a stub date
                if len(u.url.split('/')[4]) <= 8:
                    stub_wayback_urls.append(u.url)
                else:
                    full_wayback_urls.append('/'.join(u.url.split('/')[5:]))
        for stub in stub_wayback_urls:
            target = '/'.join(stub.split('/')[5:])
            if target in full_wayback_urls:
                entity.urls = [u for u in entity.urls if u.url != stub]

        return entity
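
    # Illustrative walk-through of the stub-wayback rule (hypothetical URLs,
    # not from the original code): given these two URLs on one file entity,
    #   https://web.archive.org/web/2017/http://example.com/paper.pdf
    #   https://web.archive.org/web/20170705005133/http://example.com/paper.pdf
    # both reduce to the target 'http://example.com/paper.pdf', so the
    # stub-dated ("2017") URL is dropped and the full capture is kept.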

    def try_update(self, entity):
        """
        Re-fetches the current version of the entity and only submits the
        update if it is still active and at the expected revision. Returns
        the number of entities updated (0 or 1).
        """

        try:
            existing = self.api.get_file(entity.ident)
        except ApiException as err:
            if err.status != 404:
                raise err
            self.counts['skip-not-found'] += 1
            return 0

        if existing.state != 'active':
            self.counts['skip-existing-inactive'] += 1
            return 0
        if existing.revision != entity.revision:
            self.counts['skip-revision'] += 1
            return 0

        self.api.update_file(self.get_editgroup_id(), entity.ident, entity)
        return 1
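

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: exercises
    # clean_entity() on a hand-built FileEntity. The FileUrl model name,
    # the public API host, and constructing FileCleaner with only an API
    # client are assumptions, not confirmed by this file.
    from fatcat_openapi_client import ApiClient, Configuration, DefaultApi, FileUrl

    conf = Configuration()
    conf.host = 'https://api.fatcat.wiki/v0'
    api = DefaultApi(ApiClient(conf))

    cleaner = FileCleaner(api)
    entity = FileEntity(urls=[
        FileUrl(url='https://web.archive.org/web/None/http://example.com/x.pdf', rel='webarchive'),
        FileUrl(url='https://archive.org/download/some-item/x.pdf', rel='repository'),
    ])
    cleaned = cleaner.clean_entity(entity)
    # expect the '/web/None/' URL dropped and the archive.org rel flipped to 'archive'
    print([(u.rel, u.url) for u in cleaned.urls])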