aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/file_meta.py
blob: 892c1dcde2079f29faf64e8e61d08498bcce2659 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from typing import Any, Dict

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity

from .common import EntityImporter


class FileMetaImporter(EntityImporter):
    """
    The purpose of this importer is to update file-level metadata for file
    entities that are missing some fields.

    It should *only* update entities, never create (insert) them.

    In particular, during early bootstrapping over 18 million file entities were
    imported which were missing file size, mimetype, md5, and/or sha256.

    Input rows are expected to be dict-like objects carrying the keys
    ``sha1hex``, ``sha256hex``, ``md5hex``, ``size_bytes`` and ``mimetype``;
    rows with any of these missing or falsy are skipped by ``want()``.
    """

    def __init__(self, api: ApiClient, require_grobid: bool = True, **kwargs: Any) -> None:
        """
        Set up editgroup defaults and enable updates, then defer to the parent
        importer.

        api: client handed through to the parent importer
        require_grobid: accepted for interface compatibility; not used by this
            importer
        kwargs: passed through to the parent importer. ``editgroup_description``
            and ``editgroup_extra`` are given defaults here, and ``do_updates``
            defaults to True (an explicitly passed value wins).
        """

        eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates"
        # Work on a copy so the caller's dict is never mutated, and tolerate an
        # explicit `editgroup_extra=None` the same way the description does.
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.FileMetaImporter")
        kwargs.setdefault("do_updates", True)
        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)

    def want(self, row: Any) -> bool:
        """
        Return True only if the row has a truthy value for every required
        field.

        On the first missing/falsy field the "skip-missing-field" counter is
        incremented (once per row) and False is returned. Note that falsy
        values such as a ``size_bytes`` of 0 count as missing.
        """
        for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"):
            if not row.get(k):
                self.counts["skip-missing-field"] += 1
                return False
        return True

    def parse_record(self, row: Dict[str, Any]) -> FileEntity:
        """
        Build a FileEntity holding only the hash, size, and mimetype fields
        taken from the row.

        The row must contain all of ``md5hex``, ``sha1hex``, ``sha256hex``,
        ``size_bytes`` and ``mimetype`` (KeyError otherwise).
        """

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        return fatcat_openapi_client.FileEntity(
            md5=row["md5hex"],
            sha1=row["sha1hex"],
            sha256=row["sha256hex"],
            size=row["size_bytes"],
            mimetype=row["mimetype"],
        )

    def try_update(self, fe: FileEntity) -> bool:
        """
        Fill in missing metadata on the existing file entity matching
        ``fe.sha1``.

        Fields already set on the existing entity are never overwritten; only
        empty md5/sha256/size/mimetype fields are taken from ``fe``. The update
        is sent to the API directly from this method.

        Always returns False (presumably signalling to the parent importer that
        nothing should be inserted; confirm against EntityImporter).

        Raises ApiException for any lookup failure other than a 404.
        """

        # look up the existing entity by sha1; this importer never creates one
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise
            # 404 simply means there is no entity with this sha1
            existing = None

        if not existing:
            self.counts["skip-no-match"] += 1
            return False

        if existing.md5 and existing.sha256 and existing.size and existing.mimetype:
            # nothing to add; avoid a no-op edit
            self.counts["skip-existing-complete"] += 1
            return False

        # existing values take precedence; only gaps are filled from the row
        existing.md5 = existing.md5 or fe.md5
        existing.sha256 = existing.sha256 or fe.sha256
        existing.size = existing.size or fe.size
        existing.mimetype = existing.mimetype or fe.mimetype

        # generic file entity cleanups
        existing = self.generic_file_cleanups(existing)

        self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        self.counts["update"] += 1
        return False