From cc7ebbc9afa540cff04989db1edb0913f0d46a54 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 20 Aug 2020 14:17:39 -0700 Subject: initial implementation of file_meta importer --- python/fatcat_tools/importers/__init__.py | 1 + python/fatcat_tools/importers/file_meta.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 python/fatcat_tools/importers/file_meta.py (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index c26446fd..b82eb11a 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -29,3 +29,4 @@ from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat from .ingest import IngestFileResultImporter, SavePaperNowFileImporter from .shadow import ShadowLibraryImporter +from .file_meta import FileMetaImporter diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py new file mode 100644 index 00000000..c3728570 --- /dev/null +++ b/python/fatcat_tools/importers/file_meta.py @@ -0,0 +1,70 @@ + +import fatcat_openapi_client +from .common import EntityImporter + + +class FileMetaImporter(EntityImporter): + """ + The purpose of this importer is to update file-level metadata for file + entities that are missing some fields. + + It should *only* update entities, never create (insert) them. + + In particular, during early boostramping over 18 million file entities were + imported which were missing file size, mimetype, md5, and/or sha256. + """ + + def __init__(self, api, require_grobid=True, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter') + kwargs['do_updates'] = kwargs.get("do_updates", True) + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + def want(self, row): + for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): + if not row.get(k): + return False + return True + + def parse_record(self, row): + + file_meta = row + fe = fatcat_openapi_client.FileEntity( + md5=file_meta['md5hex'], + sha1=file_meta['sha1hex'], + sha256=file_meta['sha256hex'], + size=file_meta['size_bytes'], + mimetype=file_meta['mimetype'], + ) + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + self.counts['skip-no-match'] + return False + + if (existing.md5 and existing.sha256 and existing.size_bytes and existing.mimetype): + self.counts['skip-existing-complete'] + return False + + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + existing.file_bytes = existing.file_bytes or fe.file_bytes + existing.mimetype = existing.mimetype or fe.mimetype + + self.api.update_container(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + return False -- cgit v1.2.3 From cdecb18701587277ba75756b2401279770421ba3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 21 Aug 2020 16:08:20 -0700 Subject: fixes and test coverage for file_meta importer --- python/fatcat_import.py | 5 ++- python/fatcat_tools/importers/file_meta.py | 15 +++++--- python/tests/files/example_file_meta.json | 7 ++++ python/tests/import_file_meta.py | 61 ++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 python/tests/files/example_file_meta.json create mode 100644 python/tests/import_file_meta.py (limited to 'python/fatcat_tools') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 498683e0..e92b3106 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -237,8 +237,11 @@ def run_datacite(args): JsonLinePusher(dci, args.json_file).run() def run_file_meta(args): + # do_updates defaults to true for this importer fmi = FileMetaImporter(args.api, - edit_batch_size=100) + edit_batch_size=100, + editgroup_description=args.editgroup_description_override, + ) JsonLinePusher(fmi, args.json_file).run() def main(): diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index c3728570..1e9d2ab7 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -28,11 +28,15 @@ class FileMetaImporter(EntityImporter): def want(self, row): for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): if not row.get(k): + self.counts['skip-missing-field'] += 1 return False return True def parse_record(self, row): + # bezerk mode doesn't make sense for this importer + assert self.bezerk_mode == False + file_meta = row fe = fatcat_openapi_client.FileEntity( md5=file_meta['md5hex'], @@ -44,6 +48,7 @@ class FileMetaImporter(EntityImporter): return fe def try_update(self, fe): + # lookup sha1, or create new entity existing = None try: @@ -53,18 +58,18 @@ class FileMetaImporter(EntityImporter): raise err if not existing: - self.counts['skip-no-match'] + self.counts['skip-no-match'] += 1 return False - if (existing.md5 and existing.sha256 and existing.size_bytes and existing.mimetype): - self.counts['skip-existing-complete'] + if (existing.md5 and existing.sha256 and existing.size and existing.mimetype): + self.counts['skip-existing-complete'] += 1 return False existing.md5 = existing.md5 or fe.md5 existing.sha256 = existing.sha256 or fe.sha256 - existing.file_bytes = existing.file_bytes or fe.file_bytes + existing.size = existing.size or fe.size existing.mimetype = existing.mimetype or fe.mimetype - self.api.update_container(self.get_editgroup_id(), existing.ident, existing) + self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self.counts['update'] += 1 return False diff --git a/python/tests/files/example_file_meta.json b/python/tests/files/example_file_meta.json new file mode 100644 index 00000000..7cc42213 --- /dev/null +++ b/python/tests/files/example_file_meta.json @@ -0,0 +1,7 @@ +{"sha1hex":"00000088bbc15a03ab89d8da6c356bf25aea9519","sha256hex":"593f5a260129bb89ed316c9ddcb7b2f9c2e3da8adf87d29f212b423de32a2c59","md5hex":"f4308d58dc8806232c30edc56a896412","size_bytes":354118,"mimetype":"application/pdf"} +{"sha1hex":"0000020e28273e4ed93fb8cf7556dbff6aca94d1","sha256hex":"23722f8899c8d246d31c42b920c35c94ad2a4f3c55ca8444dd41649d9df21374","md5hex":"bb57fac6eaa09ffa93ebb16f01a33fd4","size_bytes":945918,"mimetype":"application/pdf"} +{"sha1hex":"000002d15f55047daafae1abf7e81c76b64e8062","sha256hex":"1d3f5440160359679f66c957061f51cf5c0a2f38c1fef34502aa1e020499aea8","md5hex":"4424c346ef22168ee63b76e9808d7788","size_bytes":130761,"mimetype":"application/pdf"} +{"sha1hex":"00000376ad49f56145721503f1eb5e6e49e779fd","sha256hex":"87b56ac0438bc8a6d6911f2ecb821395ef39c09899e51285dfb4cafd5a670f1c","md5hex":"c9f9056276fa33d08e309854700129db","size_bytes":462901,"mimetype":"application/pdf"} +{"sha1hex":"0000045687dad717ed6512e395b04ec9c00995b7","sha256hex":"51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0","md5hex":"e1fd97475c8aa102568f5d70a1bd0c07","size_bytes":372121,"mimetype":"application/pdf"} +{"sha1hex":"000005fcdf1fbdb06d978bb0e2b60d2b3fdfeade","sha256hex":"6bef5dc02fdf158da42540fe12021f7964587c6d3c34669874758a6a43a2b5e0","md5hex":"bcff3dfbb33c21b10dc2c4c46c1c6d03","size_bytes":112587,"mimetype":"application/pdf"} +{"sha1hex":"000005fcdf1fbdb06d978bb0e2b60d2b3fdfeade","sha256hex":"","md5hex":"bcff3dfbb33c21b10dc2c4c46c1c6d03","size_bytes":112587,"mimetype":"application/pdf"} diff --git a/python/tests/import_file_meta.py b/python/tests/import_file_meta.py new file mode 100644 index 00000000..b59356b6 --- /dev/null +++ b/python/tests/import_file_meta.py @@ -0,0 +1,61 @@ + +import json +import pytest + +from fatcat_tools.importers import FileMetaImporter, JsonLinePusher +from fatcat_openapi_client import FileEntity +from fixtures import * + + +@pytest.fixture(scope="function") +def file_meta_importer(api): + yield FileMetaImporter(api) + +def test_file_meta_importer_basic(file_meta_importer): + + # insert two file entities + api = file_meta_importer.api + eg = quick_eg(file_meta_importer.api) + # with full metadata + f1edit = api.create_file(eg.editgroup_id, FileEntity( + size=372121, + md5="e1fd97475c8aa102568f5d70a1bd0c07", + sha1="0000045687dad717ed6512e395b04ec9c00995b7", + sha256="51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0", + mimetype="application/pdf", + )) + # partial/stub metadata + f2edit = api.create_file(eg.editgroup_id, FileEntity( + sha1="00000376ad49f56145721503f1eb5e6e49e779fd", + mimetype="application/pdf", + )) + api.accept_editgroup(eg.editgroup_id) + + last_index = file_meta_importer.api.get_changelog(limit=1)[0].index + + with open('tests/files/example_file_meta.json', 'r') as f: + counts = JsonLinePusher(file_meta_importer, f).run() + + assert counts['insert'] == 0 + assert counts['exists'] == 0 + assert counts['update'] == 1 + assert counts['skip-no-match'] == 4 + assert counts['skip-missing-field'] == 1 + assert counts['skip-existing-complete'] == 1 + + # cleanup file entities + eg = quick_eg(file_meta_importer.api) + api.delete_file(eg.editgroup_id, f1edit.ident) + api.delete_file(eg.editgroup_id, f2edit.ident) + api.accept_editgroup(eg.editgroup_id) + +def test_file_meta_dict_parse(file_meta_importer): + with open('tests/files/example_file_meta.json', 'r') as f: + raw = json.loads(f.readline()) + f = file_meta_importer.parse_record(raw) + + assert f.sha1 == "00000088bbc15a03ab89d8da6c356bf25aea9519" + assert f.md5 == "f4308d58dc8806232c30edc56a896412" + assert f.sha256 == "593f5a260129bb89ed316c9ddcb7b2f9c2e3da8adf87d29f212b423de32a2c59" + assert f.mimetype == "application/pdf" + assert f.size == 354118 -- cgit v1.2.3 From 6eda9ff1a14f12531d99023fbcd7ff5a43b0c9aa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 27 Aug 2020 13:03:08 -0700 Subject: fix comment typo (thanks martin) --- python/fatcat_tools/importers/file_meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 1e9d2ab7..628ebde8 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -10,7 +10,7 @@ class FileMetaImporter(EntityImporter): It should *only* update entities, never create (insert) them. - In particular, during early boostramping over 18 million file entities were + In particular, during early bootstrapping over 18 million file entities were imported which were missing file size, mimetype, md5, and/or sha256. """ -- cgit v1.2.3