aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 12:15:09 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-02-13 22:24:20 -0800
commit: 87029cb13d244381f915fe66e40760477edb5675 (patch)
tree: 6d4a9e0f8c1ec7327df03e9c77dd451e3e453710 /python/fatcat_tools
parent: e59d1b617d4abd5f002d9e59b6bbaebc9ff30993 (diff)
download: fatcat-87029cb13d244381f915fe66e40760477edb5675.tar.gz
          fatcat-87029cb13d244381f915fe66e40760477edb5675.zip
shadow import: more filtering of file_meta fields
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/shadow.py 10
1 files changed, 10 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 21a18837..cfe1b1cf 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -43,6 +43,16 @@ class ShadowLibraryImporter(EntityImporter):
self.default_link_rel = kwargs.get("default_link_rel", "web")
def want(self, raw_record):
+ """
+ Only want to import records with complete file-level metadata
+ """
+ fm = raw_record['file_meta']
+ if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+ self.counts['skip-file-meta-incomplete'] += 1
+ return False
+ if fm['mimetype'] != 'application/pdf':
+ self.counts['skip-not-pdf'] += 1
+ return False
return True
def parse_record(self, obj):