From cc7ebbc9afa540cff04989db1edb0913f0d46a54 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 20 Aug 2020 14:17:39 -0700 Subject: initial implementation of file_meta importer --- python/fatcat_import.py | 15 +++++++ python/fatcat_tools/importers/__init__.py | 1 + python/fatcat_tools/importers/file_meta.py | 70 ++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 python/fatcat_tools/importers/file_meta.py (limited to 'python') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 252ab3a5..498683e0 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -236,6 +236,11 @@ def run_datacite(args): else: JsonLinePusher(dci, args.json_file).run() +def run_file_meta(args): + fmi = FileMetaImporter(args.api, + edit_batch_size=100) + JsonLinePusher(fmi, args.json_file).run() + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -556,6 +561,16 @@ def main(): auth_var="FATCAT_AUTH_WORKER_DATACITE", ) + sub_file_meta = subparsers.add_parser('file-meta', + help="simple update-only importer for file metadata") + sub_file_meta.set_defaults( + func=run_file_meta, + auth_var="FATCAT_API_AUTH_TOKEN", + ) + sub_file_meta.add_argument('json_file', + help="File with jsonlines from file_meta schema to import from", + default=sys.stdin, type=argparse.FileType('r')) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index c26446fd..b82eb11a 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -29,3 +29,4 @@ from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat from .ingest import IngestFileResultImporter, SavePaperNowFileImporter from .shadow import ShadowLibraryImporter +from .file_meta import FileMetaImporter diff --git 
import fatcat_openapi_client
from .common import EntityImporter


class FileMetaImporter(EntityImporter):
    """
    The purpose of this importer is to update file-level metadata for file
    entities that are missing some fields.

    It should *only* update entities, never create (insert) them.

    In particular, during early bootstrapping over 18 million file entities
    were imported which were missing file size, mimetype, md5, and/or sha256.
    """

    # Keys every input row must carry (with truthy values) to be processed.
    REQUIRED_ROW_KEYS = ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype')

    def __init__(self, api, require_grobid=True, **kwargs):
        """
        api: API client handle, passed through to EntityImporter.
        require_grobid: accepted for signature compatibility; not used by
            this importer.
        kwargs: forwarded to EntityImporter. 'editgroup_description' and
            'editgroup_extra' get importer-specific defaults, and
            'do_updates' defaults to True because this importer only
            performs updates.
        """
        eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates"
        # Copy so that a dict supplied by the caller is not mutated here.
        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
        eg_extra.setdefault('agent', 'fatcat_tools.FileMetaImporter')
        kwargs.setdefault('do_updates', True)
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

    def want(self, row):
        """
        Returns True only if the row has a truthy value for every key in
        REQUIRED_ROW_KEYS; partial rows are skipped.
        """
        return all(row.get(k) for k in self.REQUIRED_ROW_KEYS)

    def parse_record(self, row):
        """
        Converts a file_meta row (dict) into a FileEntity carrying only the
        hash, size, and mimetype fields.
        """
        return fatcat_openapi_client.FileEntity(
            md5=row['md5hex'],
            sha1=row['sha1hex'],
            sha256=row['sha256hex'],
            size=row['size_bytes'],
            mimetype=row['mimetype'],
        )

    def try_update(self, fe):
        """
        Looks up the existing file entity by SHA-1 and fills in any of md5,
        sha256, size, and mimetype that it is missing. Values already present
        on the existing entity are never overwritten.

        Always returns False: this importer updates in place and must never
        fall through to creating a new entity.

        Raises ApiException for any lookup failure other than a 404.
        """
        # lookup by sha1; a missing entity is skipped, never created
        existing = None
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise

        if not existing:
            self.counts['skip-no-match'] += 1
            return False

        if existing.md5 and existing.sha256 and existing.size and existing.mimetype:
            # nothing left to fill in
            self.counts['skip-existing-complete'] += 1
            return False

        existing.md5 = existing.md5 or fe.md5
        existing.sha256 = existing.sha256 or fe.sha256
        existing.size = existing.size or fe.size
        existing.mimetype = existing.mimetype or fe.mimetype

        self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        self.counts['update'] += 1
        return False