about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x python/fatcat_import.py 18
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
-rw-r--r-- python/fatcat_tools/importers/file_meta.py 75
-rw-r--r-- python/tests/files/example_file_meta.json 7
-rw-r--r-- python/tests/import_file_meta.py 61
5 files changed, 162 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 252ab3a5..e92b3106 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -236,6 +236,14 @@ def run_datacite(args):
else:
JsonLinePusher(dci, args.json_file).run()
+def run_file_meta(args):
+    """
+    Handler for the `file-meta` sub-command: feeds every JSON line of
+    `args.json_file` through a FileMetaImporter.
+    """
+    # do_updates is deliberately not passed: FileMetaImporter enables
+    # updates by default
+    importer = FileMetaImporter(
+        args.api,
+        editgroup_description=args.editgroup_description_override,
+        edit_batch_size=100,
+    )
+    pusher = JsonLinePusher(importer, args.json_file)
+    pusher.run()
+
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -556,6 +564,16 @@ def main():
auth_var="FATCAT_AUTH_WORKER_DATACITE",
)
+    # 'file-meta' sub-command: update-only importer, dispatched to
+    # run_file_meta()
+    sub_file_meta = subparsers.add_parser('file-meta',
+        help="simple update-only importer for file metadata")
+    sub_file_meta.set_defaults(
+        func=run_file_meta,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    # nargs='?' makes the positional optional; without it argparse always
+    # requires the argument and the sys.stdin default could never apply
+    sub_file_meta.add_argument('json_file',
+        help="File with jsonlines from file_meta schema to import from",
+        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index c26446fd..b82eb11a 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -29,3 +29,4 @@ from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
from .shadow import ShadowLibraryImporter
+from .file_meta import FileMetaImporter
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
new file mode 100644
index 00000000..628ebde8
--- /dev/null
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -0,0 +1,75 @@
+
+import fatcat_openapi_client
+from .common import EntityImporter
+
+
+class FileMetaImporter(EntityImporter):
+    """
+    The purpose of this importer is to update file-level metadata for file
+    entities that are missing some fields.
+
+    It should *only* update entities, never create (insert) them.
+
+    In particular, during early bootstrapping over 18 million file entities were
+    imported which were missing file size, mimetype, md5, and/or sha256.
+
+    Input rows are dicts with the keys 'sha1hex', 'sha256hex', 'md5hex',
+    'size_bytes' and 'mimetype'; rows missing any of them are skipped.
+    """
+
+    def __init__(self, api, require_grobid=True, **kwargs):
+        """
+        api: API client, handed on to EntityImporter
+        require_grobid: accepted but not used by this importer
+        kwargs: passed through to EntityImporter; 'editgroup_description',
+            'editgroup_extra' and 'do_updates' get importer-specific
+            defaults when not supplied
+        """
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates"
+        # work on a copy so the caller's dict is not modified when the
+        # default 'agent' is filled in (also tolerates an explicit None)
+        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
+        eg_extra.setdefault('agent', 'fatcat_tools.FileMetaImporter')
+        # updating existing entities is the whole point of this importer
+        kwargs.setdefault('do_updates', True)
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+    def want(self, row):
+        """
+        Returns True only when every metadata field is present and truthy in
+        `row`; otherwise bumps the 'skip-missing-field' counter once and
+        returns False.
+        """
+        for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'):
+            if not row.get(k):
+                self.counts['skip-missing-field'] += 1
+                return False
+        return True
+
+    def parse_record(self, row):
+        """
+        Builds a FileEntity holding only the hashes, size and mimetype taken
+        from `row`. Rows are expected to have passed want() already: a
+        missing key raises KeyError here.
+        """
+
+        # bezerk mode doesn't make sense for this importer
+        assert self.bezerk_mode == False
+
+        file_meta = row
+        fe = fatcat_openapi_client.FileEntity(
+            md5=file_meta['md5hex'],
+            sha1=file_meta['sha1hex'],
+            sha256=file_meta['sha256hex'],
+            size=file_meta['size_bytes'],
+            mimetype=file_meta['mimetype'],
+        )
+        return fe
+
+    def try_update(self, fe):
+        """
+        Fills in missing metadata on the existing file entity that has the
+        same SHA-1 as `fe`; fields which are already set are never
+        overwritten.
+
+        Always returns False (presumably telling EntityImporter that there
+        is nothing to insert -- confirm against EntityImporter); the update
+        itself is sent to the API from here.
+
+        Raises ApiException for any lookup failure other than a 404.
+        """
+
+        # lookup by sha1; this importer never creates a new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            # 404 only means "no such file"; anything else is a real error
+            if err.status != 404:
+                raise
+
+        if not existing:
+            self.counts['skip-no-match'] += 1
+            return False
+
+        # nothing to add when all four fields are already populated
+        if (existing.md5 and existing.sha256 and existing.size and existing.mimetype):
+            self.counts['skip-existing-complete'] += 1
+            return False
+
+        # existing values win; only gaps are filled from the imported row
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha256 = existing.sha256 or fe.sha256
+        existing.size = existing.size or fe.size
+        existing.mimetype = existing.mimetype or fe.mimetype
+
+        self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        self.counts['update'] += 1
+        return False
diff --git a/python/tests/files/example_file_meta.json b/python/tests/files/example_file_meta.json
new file mode 100644
index 00000000..7cc42213
--- /dev/null
+++ b/python/tests/files/example_file_meta.json
@@ -0,0 +1,7 @@
+{"sha1hex":"00000088bbc15a03ab89d8da6c356bf25aea9519","sha256hex":"593f5a260129bb89ed316c9ddcb7b2f9c2e3da8adf87d29f212b423de32a2c59","md5hex":"f4308d58dc8806232c30edc56a896412","size_bytes":354118,"mimetype":"application/pdf"}
+{"sha1hex":"0000020e28273e4ed93fb8cf7556dbff6aca94d1","sha256hex":"23722f8899c8d246d31c42b920c35c94ad2a4f3c55ca8444dd41649d9df21374","md5hex":"bb57fac6eaa09ffa93ebb16f01a33fd4","size_bytes":945918,"mimetype":"application/pdf"}
+{"sha1hex":"000002d15f55047daafae1abf7e81c76b64e8062","sha256hex":"1d3f5440160359679f66c957061f51cf5c0a2f38c1fef34502aa1e020499aea8","md5hex":"4424c346ef22168ee63b76e9808d7788","size_bytes":130761,"mimetype":"application/pdf"}
+{"sha1hex":"00000376ad49f56145721503f1eb5e6e49e779fd","sha256hex":"87b56ac0438bc8a6d6911f2ecb821395ef39c09899e51285dfb4cafd5a670f1c","md5hex":"c9f9056276fa33d08e309854700129db","size_bytes":462901,"mimetype":"application/pdf"}
+{"sha1hex":"0000045687dad717ed6512e395b04ec9c00995b7","sha256hex":"51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0","md5hex":"e1fd97475c8aa102568f5d70a1bd0c07","size_bytes":372121,"mimetype":"application/pdf"}
+{"sha1hex":"000005fcdf1fbdb06d978bb0e2b60d2b3fdfeade","sha256hex":"6bef5dc02fdf158da42540fe12021f7964587c6d3c34669874758a6a43a2b5e0","md5hex":"bcff3dfbb33c21b10dc2c4c46c1c6d03","size_bytes":112587,"mimetype":"application/pdf"}
+{"sha1hex":"000005fcdf1fbdb06d978bb0e2b60d2b3fdfeade","sha256hex":"","md5hex":"bcff3dfbb33c21b10dc2c4c46c1c6d03","size_bytes":112587,"mimetype":"application/pdf"}
diff --git a/python/tests/import_file_meta.py b/python/tests/import_file_meta.py
new file mode 100644
index 00000000..b59356b6
--- /dev/null
+++ b/python/tests/import_file_meta.py
@@ -0,0 +1,61 @@
+
+import json
+import pytest
+
+from fatcat_tools.importers import FileMetaImporter, JsonLinePusher
+from fatcat_openapi_client import FileEntity
+from fixtures import *
+
+
+@pytest.fixture(scope="function")
+def file_meta_importer(api):
+    """Yields a fresh FileMetaImporter with default settings for each test."""
+    importer = FileMetaImporter(api)
+    yield importer
+
+def test_file_meta_importer_basic(file_meta_importer):
+    """
+    Runs the importer over the 7-row example file against two pre-created
+    file entities: one complete (must be left alone) and one stub (must be
+    updated). Four rows match no entity and one row has an empty sha256hex.
+    """
+
+    api = file_meta_importer.api
+
+    # insert two file entities
+    eg = quick_eg(api)
+    # with full metadata
+    f1edit = api.create_file(eg.editgroup_id, FileEntity(
+        size=372121,
+        md5="e1fd97475c8aa102568f5d70a1bd0c07",
+        sha1="0000045687dad717ed6512e395b04ec9c00995b7",
+        sha256="51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0",
+        mimetype="application/pdf",
+    ))
+    # partial/stub metadata
+    f2edit = api.create_file(eg.editgroup_id, FileEntity(
+        sha1="00000376ad49f56145721503f1eb5e6e49e779fd",
+        mimetype="application/pdf",
+    ))
+    api.accept_editgroup(eg.editgroup_id)
+
+    try:
+        with open('tests/files/example_file_meta.json', 'r') as f:
+            counts = JsonLinePusher(file_meta_importer, f).run()
+
+        assert counts['insert'] == 0
+        assert counts['exists'] == 0
+        assert counts['update'] == 1
+        assert counts['skip-no-match'] == 4
+        assert counts['skip-missing-field'] == 1
+        assert counts['skip-existing-complete'] == 1
+    finally:
+        # cleanup file entities; done in `finally` so a failed import or
+        # assertion does not leave them behind for other tests
+        eg = quick_eg(api)
+        api.delete_file(eg.editgroup_id, f1edit.ident)
+        api.delete_file(eg.editgroup_id, f2edit.ident)
+        api.accept_editgroup(eg.editgroup_id)
+
+def test_file_meta_dict_parse(file_meta_importer):
+    """parse_record() maps the first example row onto the FileEntity fields."""
+    with open('tests/files/example_file_meta.json', 'r') as fixture:
+        first_line = fixture.readline()
+    entity = file_meta_importer.parse_record(json.loads(first_line))
+
+    # expected FileEntity attribute values for the first row of the fixture
+    expected = (
+        ('sha1', "00000088bbc15a03ab89d8da6c356bf25aea9519"),
+        ('md5', "f4308d58dc8806232c30edc56a896412"),
+        ('sha256', "593f5a260129bb89ed316c9ddcb7b2f9c2e3da8adf87d29f212b423de32a2c59"),
+        ('mimetype', "application/pdf"),
+        ('size', 354118),
+    )
+    for attr, value in expected:
+        assert getattr(entity, attr) == value