path: root/python/fatcat_tools/importers/fileset_generic.py

import fatcat_openapi_client

from fatcat_tools import entity_from_dict

from .common import EntityImporter


class FilesetImporter(EntityImporter):
    """
    General purpose importer for fileset entities. Simply takes fileset schema
    JSON and inserts it.

    By default requires release_ids to be non-empty, and will check each
    release_id to see if a fileset is already associated; if so, skips the
    import. This behavior may change in the future, and can be disabled.

    Currently only creates (inserts) new entities; updates are not supported.
    """

    def __init__(self, api, **kwargs):

        eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import"
        eg_extra = kwargs.pop('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter')
        kwargs['do_updates'] = bool(kwargs.get("do_updates", False))
        self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

    def want(self, row):
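        # require non-empty release_ids, urls, and manifest; every file in the
        # manifest must carry at least SHA-1 and MD5 checksums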
        if not row.get('release_ids'):
            self.counts['skip-no-release-ids'] += 1
            return False
        if not row.get('urls'):
            self.counts['skip-no-urls'] += 1
            return False
        if not row.get('manifest'):
            self.counts['skip-no-files'] += 1
            return False

        for f in row.get('manifest'):
            for k in ('sha1', 'md5'):
                if not f.get(k):
                    self.counts['skip-missing-file-field'] += 1
                    return False
        return True

    def parse_record(self, row):

        fse = entity_from_dict(
            row,
            fatcat_openapi_client.FilesetEntity,
            api_client=self.api.api_client,
        )
        fse = self.generic_fileset_cleanups(fse)
        return fse

    def try_update(self, fse):

        if not self.skip_release_fileset_check:
            for release_id in fse.release_ids:
                # don't catch 404, that would be an error
                release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs')
                assert release.state == 'active'
                if release.filesets:
                    self.counts['exists'] += 1
                    self.counts['exists-via-release-filesets'] += 1
                    return False

        # do the insert
        return True

    def insert_batch(self, batch):
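        # create all entities in the batch under a single new editgroup, via
        # the fileset auto-batch endpoint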
        self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))
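

# A minimal usage sketch (not part of the original module), assuming an
# authenticated `api` client and the `JsonLinePusher` helper from
# `fatcat_tools.importers`; one fileset entity (as JSON) per input line is
# pushed through the importer:
#
#     from fatcat_tools.importers import JsonLinePusher
#
#     fsi = FilesetImporter(api, edit_batch_size=100)
#     with open('fileset_entities.json') as f:
#         JsonLinePusher(fsi, f).run()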