From 87029cb13d244381f915fe66e40760477edb5675 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 30 Jan 2020 12:15:09 -0800
Subject: shadow import: more filtering of file_meta fields

---
 python/fatcat_tools/importers/shadow.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 21a18837..cfe1b1cf 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -43,6 +43,16 @@ class ShadowLibraryImporter(EntityImporter):
         self.default_link_rel = kwargs.get("default_link_rel", "web")
 
     def want(self, raw_record):
+        """
+        Only want to import records with complete file-level metadata
+        """
+        fm = raw_record['file_meta']
+        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
         return True
 
     def parse_record(self, obj):
-- 
cgit v1.2.3