path: root/python/fatcat_tools/importers/fileset_generic.py
from typing import Any, Dict, List, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FilesetEntity

from fatcat_tools import entity_from_dict

from .common import EntityImporter


class FilesetImporter(EntityImporter):
    """
    General purpose importer for fileset entities. Simply parses fileset
    schema JSON and inserts the entities.

    By default requires release_ids to be non-empty, and will check each
    release_id to see if a fileset is already associated; if so, skips the
    import. This behavior may change in the future, and can be disabled.

    Currently only creates (insert), no updates.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:

        eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import"
        eg_extra = kwargs.pop("editgroup_extra", dict())
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter")
        kwargs["do_updates"] = bool(kwargs.get("do_updates", False))
        self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

    def want(self, row: Dict[str, Any]) -> bool:
        if not row.get("release_ids"):
            self.counts["skip-no-release-ids"] += 1
            return False
        if not row.get("urls"):
            self.counts["skip-no-urls"] += 1
            return False
        if not row.get("manifest"):
            self.counts["skip-no-files"] += 1
            return False

        for f in row.get("manifest"):
            for k in ("sha1", "md5"):
                if not f.get(k):
                    self.counts["skip-missing-file-field"] += 1
                    return False
        return True
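
    # For reference, a hypothetical minimal row that would pass the checks
    # above (identifiers and hashes below are illustrative placeholders, not
    # real records):
    #
    #   {
    #     "release_ids": ["aaaaaaaaaaaaarceaaaaaaaaaaaa"],
    #     "urls": [{"url": "https://example.org/dataset/", "rel": "web"}],
    #     "manifest": [
    #       {"path": "data.csv", "size": 1234,
    #        "sha1": "0000000000000000000000000000000000000000",
    #        "md5": "00000000000000000000000000000000"}
    #     ]
    #   }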

    def parse_record(self, row: Dict[str, Any]) -> Optional[FilesetEntity]:

        fse = entity_from_dict(
            row,
            fatcat_openapi_client.FilesetEntity,
            api_client=self.api.api_client,
        )
        fse = self.generic_fileset_cleanups(fse)
        return fse

    def try_update(self, fse: FilesetEntity) -> bool:

        if not self.skip_release_fileset_check:
            for release_id in fse.release_ids:
                # don't catch 404, that would be an error
                release = self.api.get_release(
                    release_id, expand="filesets", hide="abstracts,refs"
                )
                assert release.state == "active"
                if release.filesets:
                    self.counts["exists"] += 1
                    self.counts["exists-via-release-filesets"] += 1
                    return False

        # do the insert
        return True

    def insert_batch(self, batch: List[FilesetEntity]) -> None:
        self.api.create_fileset_auto_batch(
            fatcat_openapi_client.FilesetAutoBatch(
                editgroup=fatcat_openapi_client.Editgroup(
                    description=self.editgroup_description, extra=self.editgroup_extra
                ),
                entity_list=batch,
            )
        )
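
# A minimal usage sketch (not part of this module): it assumes the
# `authenticated_api` helper and the `JsonLinePusher` from the shared importer
# infrastructure are available, with one fileset entity JSON object per input
# line; exact helper names and kwargs may differ in your setup.
#
#   from fatcat_tools import authenticated_api
#   from fatcat_tools.importers import FilesetImporter, JsonLinePusher
#
#   api = authenticated_api("https://api.fatcat.wiki/v0")
#   importer = FilesetImporter(api, edit_batch_size=100)
#   JsonLinePusher(importer, open("filesets.json")).run()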