1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
import fatcat_openapi_client
from fatcat_tools import entity_from_dict
from .common import EntityImporter
class FilesetImporter(EntityImporter):
    """
    General purpose importer for fileset entities. Simply parses fileset
    schema JSON and inserts.

    By default requires release_ids to be non-empty, and will check each
    release_id to see if a fileset is already associated; if so, skips the
    import. This behavior may change in the future, and can be disabled.

    Currently only creates (insert), no updates.
    """

    def __init__(self, api, **kwargs):
        """
        Configure the importer and hand everything else to the parent class.

        Keyword arguments inspected here:

        - ``editgroup_description``: falls back to a generic description
          when missing or falsy.
        - ``editgroup_extra``: mapping of extra editgroup metadata; an
          ``agent`` entry is added unless one is already present. Missing or
          ``None`` is treated as empty.
        - ``do_updates``: coerced to ``bool`` (default ``False``) and passed
          on to the parent constructor.
        - ``skip_release_fileset_check``: coerced to ``bool`` (default
          ``False``); when true, ``try_update`` skips the per-release lookup.
          It is read with ``get`` (not popped), so it is also passed on to
          the parent constructor along with the remaining kwargs.
        """
        eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import"
        # Work on a private copy: the 'agent' key must not leak into a dict
        # the caller may reuse, and an explicit None must not crash.
        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
        eg_extra.setdefault('agent', 'fatcat_tools.FilesetImporter')
        kwargs['do_updates'] = bool(kwargs.get("do_updates", False))
        self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
        super().__init__(
            api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs
        )

        # bezerk mode doesn't make sense for this importer
        # NOTE(review): bezerk_mode is presumably set by the parent
        # constructor; comparison kept as-is to preserve behavior.
        assert self.bezerk_mode == False  # noqa: E712

    def want(self, row):
        """
        Decide whether a raw row (dict-like) should be imported.

        Returns ``False`` (and bumps the matching ``self.counts`` entry) when
        ``release_ids``, ``urls`` or ``manifest`` is missing or empty, or when
        any manifest entry lacks a truthy ``sha1`` or ``md5``. Returns
        ``True`` otherwise.
        """
        if not row.get('release_ids'):
            self.counts['skip-no-release-ids'] += 1
            return False

        if not row.get('urls'):
            self.counts['skip-no-urls'] += 1
            return False

        manifest = row.get('manifest')
        if not manifest:
            self.counts['skip-no-files'] += 1
            return False

        # Every file in the manifest must carry both hashes; the first
        # missing one rejects the whole row (counted once).
        for f in manifest:
            for k in ('sha1', 'md5'):
                if not f.get(k):
                    self.counts['skip-missing-file-field'] += 1
                    return False
        return True

    def parse_record(self, row):
        """
        Convert a raw row into a ``FilesetEntity``.

        The row is deserialized with ``entity_from_dict`` and then passed
        through ``self.generic_fileset_cleanups`` (not defined in this
        class; presumably inherited from the parent importer).
        """
        fse = entity_from_dict(
            row,
            fatcat_openapi_client.FilesetEntity,
            api_client=self.api.api_client,
        )
        fse = self.generic_fileset_cleanups(fse)
        return fse

    def try_update(self, fse):
        """
        Decide whether ``fse`` should be inserted.

        Unless ``skip_release_fileset_check`` is set, each release in
        ``fse.release_ids`` is fetched with its filesets expanded. If any of
        them already has a fileset, the ``exists`` and
        ``exists-via-release-filesets`` counters are bumped and ``False`` is
        returned. Otherwise returns ``True``, meaning "do the insert". No
        update is ever performed here.

        Errors from the release lookup are not caught, and a release whose
        state is not ``'active'`` trips an assertion.
        """
        if not self.skip_release_fileset_check:
            for release_id in fse.release_ids:
                # don't catch 404, that would be an error
                release = self.api.get_release(
                    release_id,
                    expand='filesets',
                    hide='abstracts,refs',
                )
                assert release.state == 'active'
                if release.filesets:
                    self.counts['exists'] += 1
                    self.counts['exists-via-release-filesets'] += 1
                    return False

        # do the insert
        return True

    def insert_batch(self, batch):
        """
        Submit ``batch`` (a list of fileset entities) in a single
        ``create_fileset_auto_batch`` API call, wrapped in an editgroup built
        from ``self.editgroup_description`` and ``self.editgroup_extra``.
        """
        self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))
|