1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FilesetEntity
from fatcat_tools import entity_from_dict
from .common import EntityImporter
class FilesetImporter(EntityImporter):
    """
    General purpose importer for fileset entities. Simply parses fileset
    schema JSON and inserts.

    By default requires release_ids to be non-empty, and will check each
    release_id to see if a fileset is already associated; if so, skips the
    import. This behavior may change in the future, and the per-release
    check can be disabled by passing ``skip_release_fileset_check=True``.

    Currently only creates (insert), no updates.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        """
        Set up editgroup metadata defaults and forward everything else to
        the base importer.

        Recognized keyword arguments (all optional):

        - ``editgroup_description``: falls back to a generic description
          when missing or falsy.
        - ``editgroup_extra``: mapping of extra editgroup metadata; an
          "agent" key is added unless one is already present.
        - ``do_updates``: coerced to bool, defaults to False.
        - ``skip_release_fileset_check``: coerced to bool, defaults to
          False; disables the lookup done in ``try_update``.

        Remaining keyword arguments are passed to the base class untouched.
        """
        eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import"

        # Work on a private copy so the caller's dict is never mutated, and
        # treat an explicit None the same way as a missing value (mirrors
        # how editgroup_description is handled above).
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.FilesetImporter")

        kwargs["do_updates"] = bool(kwargs.get("do_updates", False))

        # Read with get() rather than pop(): the option stays in kwargs and
        # is therefore still forwarded to the base class constructor.
        self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))

        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)

        # bezerk mode doesn't make sense for this importer
        # (bezerk_mode is not set in this class; presumably the base
        # constructor assigns it -- confirm against EntityImporter)
        assert self.bezerk_mode is False

    def want(self, row: Dict[str, Any]) -> bool:
        """
        Decide whether a raw JSON row is worth parsing.

        A row is rejected, and the matching ``self.counts`` key is
        incremented, when:

        - "release_ids" is missing or empty ("skip-no-release-ids")
        - "urls" is missing or empty ("skip-no-urls")
        - "manifest" is missing or empty ("skip-no-files")
        - any manifest entry lacks a truthy "sha1" or "md5"
          ("skip-missing-file-field")

        Returns True only if none of the above apply.
        """
        if not row.get("release_ids"):
            self.counts["skip-no-release-ids"] += 1
            return False
        if not row.get("urls"):
            self.counts["skip-no-urls"] += 1
            return False
        if not row.get("manifest"):
            self.counts["skip-no-files"] += 1
            return False

        # "manifest" is known to be present and non-empty at this point.
        for entry in row["manifest"]:
            for hash_field in ("sha1", "md5"):
                if not entry.get(hash_field):
                    # Counted once per row: bail out on the first gap found.
                    self.counts["skip-missing-file-field"] += 1
                    return False
        return True

    def parse_record(self, row: Dict[str, Any]) -> Optional[FilesetEntity]:
        """
        Convert a raw JSON row into a FilesetEntity.

        The row is deserialized with ``entity_from_dict`` and the result is
        passed through ``self.generic_fileset_cleanups`` (not defined in
        this class; presumably inherited from the base importer), whose
        return value is returned as-is.
        """
        fileset = entity_from_dict(
            row,
            fatcat_openapi_client.FilesetEntity,
            api_client=self.api.api_client,
        )
        return self.generic_fileset_cleanups(fileset)

    def try_update(self, fse: FilesetEntity) -> bool:
        """
        Decide whether ``fse`` should be inserted.

        Unless ``skip_release_fileset_check`` was set, every release in
        ``fse.release_ids`` is fetched; as soon as one of them already has
        any fileset attached, the "exists" and
        "exists-via-release-filesets" counters are incremented and False is
        returned (skip). Otherwise returns True (insert).

        Errors from the release lookup are deliberately not caught, and a
        release whose state is not "active" fails an assertion.
        """
        if self.skip_release_fileset_check:
            # do the insert
            return True

        for release_id in fse.release_ids:
            # don't catch 404, that would be an error
            release = self.api.get_release(
                release_id, expand="filesets", hide="abstracts,refs"
            )
            assert release.state == "active"
            if release.filesets:
                self.counts["exists"] += 1
                self.counts["exists-via-release-filesets"] += 1
                return False

        # do the insert
        return True

    def insert_batch(self, batch: List[FilesetEntity]) -> None:
        """
        Submit ``batch`` in a single auto-batch API call, wrapped in an
        editgroup built from this importer's description and extra
        metadata.
        """
        editgroup = fatcat_openapi_client.Editgroup(
            description=self.editgroup_description, extra=self.editgroup_extra
        )
        self.api.create_fileset_auto_batch(
            fatcat_openapi_client.FilesetAutoBatch(
                editgroup=editgroup,
                entity_list=batch,
            )
        )
|