diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-20 14:17:39 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-21 15:35:06 -0700 | 
| commit | cc7ebbc9afa540cff04989db1edb0913f0d46a54 (patch) | |
| tree | 3832197bef70d3356f22a23153eee54063bf0d1d /python/fatcat_tools/importers | |
| parent | daf91b137483b7345448b597289c78f8fb3f9969 (diff) | |
| download | fatcat-cc7ebbc9afa540cff04989db1edb0913f0d46a54.tar.gz fatcat-cc7ebbc9afa540cff04989db1edb0913f0d46a54.zip  | |
initial implementation of file_meta importer
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/file_meta.py | 70 | 
2 files changed, 71 insertions, 0 deletions
# File: python/fatcat_tools/importers/file_meta.py (new in this commit).
# The same commit also adds `from .file_meta import FileMetaImporter` to
# python/fatcat_tools/importers/__init__.py.

import fatcat_openapi_client
from .common import EntityImporter


class FileMetaImporter(EntityImporter):
    """
    The purpose of this importer is to update file-level metadata for file
    entities that are missing some fields.

    It should *only* update entities, never create (insert) them.

    In particular, during early bootstrapping over 18 million file entities
    were imported which were missing file size, mimetype, md5, and/or sha256.

    Input rows are dicts with the keys 'sha1hex', 'sha256hex', 'md5hex',
    'size_bytes' and 'mimetype'; rows missing any of them are skipped.
    """

    def __init__(self, api, require_grobid=True, **kwargs):
        """
        Set up editgroup defaults and hand everything else to EntityImporter.

        api: API client, passed through to the base class.
        require_grobid: accepted for interface compatibility; this importer
            does not use it.
        kwargs: forwarded to EntityImporter. 'editgroup_description' and
            'editgroup_extra' get importer-specific defaults, and
            'do_updates' defaults to True unless the caller sets it.
        """
        eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates"
        eg_extra = kwargs.pop('editgroup_extra', dict())
        # only fill in the agent if the caller did not provide one
        eg_extra.setdefault('agent', 'fatcat_tools.FileMetaImporter')
        kwargs.setdefault('do_updates', True)
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

    def want(self, row):
        """
        Return True only if the row carries every metadata field, each with
        a truthy value.
        """
        required = ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype')
        return all(row.get(k) for k in required)

    def parse_record(self, row):
        """
        Build a FileEntity holding just the hashes, size and mimetype from
        the row. The result is only used as a source of values to merge into
        an existing entity (see try_update).
        """
        file_meta = row
        fe = fatcat_openapi_client.FileEntity(
            md5=file_meta['md5hex'],
            sha1=file_meta['sha1hex'],
            sha256=file_meta['sha256hex'],
            size=file_meta['size_bytes'],
            mimetype=file_meta['mimetype'],
        )
        return fe

    def try_update(self, fe):
        """
        Look up the existing file entity by SHA-1 and fill in any of md5,
        sha256, size and mimetype that it is missing.

        Fields already present on the existing entity are never overwritten.
        Increments one of the 'skip-no-match', 'skip-existing-complete' or
        'update' counters.

        Always returns False: the update (if any) is issued here directly.
        NOTE(review): presumably False tells the base class not to insert
        the entity -- confirm against EntityImporter.

        Raises fatcat_openapi_client.rest.ApiException for any lookup failure
        other than a 404.
        """
        # lookup by sha1; this importer never creates new entities
        existing = None
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_openapi_client.rest.ApiException as err:
            # 404 just means "no such file"; anything else is a real error
            if err.status != 404:
                raise

        if not existing:
            self.counts['skip-no-match'] += 1
            return False

        if (existing.md5 and existing.sha256 and existing.size and existing.mimetype):
            self.counts['skip-existing-complete'] += 1
            return False

        # keep whatever the existing entity already has; only fill the gaps
        existing.md5 = existing.md5 or fe.md5
        existing.sha256 = existing.sha256 or fe.sha256
        existing.size = existing.size or fe.size
        existing.mimetype = existing.mimetype or fe.mimetype

        self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        self.counts['update'] += 1
        return False
| 
