From 4d56e71598457489e9f71ef4ce5c9b0254a4cce1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 May 2019 10:42:20 -0700 Subject: fix default mimetype (impacted pre-1923 files) --- python/fatcat_tools/importers/arabesque.py | 4 ++-- python/fatcat_tools/importers/matched.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index ea38ec2f..e9376d96 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -66,7 +66,7 @@ class ArabesqueMatchImporter(EntityImporter): self.extid_type = extid_type self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.default_mime = kwargs.get("default_mime", None) + self.default_mimetype = kwargs.get("default_mimetype", None) self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: @@ -136,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter): fe = fatcat_client.FileEntity( sha1=b32_hex(row['final_sha1']), - mimetype=row['final_mimetype'], + mimetype=row['final_mimetype'] or self.default_mimetype, release_ids=[re.ident], urls=urls, ) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 4d78fdc9..04ce4573 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -42,7 +42,7 @@ class MatchedImporter(EntityImporter): editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") - self.default_mime = kwargs.get("default_mime", None) + self.default_mimetype = kwargs.get("default_mimetype", None) def want(self, raw_record): return True @@ -100,12 +100,17 @@ class MatchedImporter(EntityImporter): if size: size = int(size) + mimetype = obj.get('mimetype', self.default_mimetype) + if not mimetype and urls: + if urls[0].url.endswith('.pdf'): + mimetype = 'application/pdf' + fe = fatcat_client.FileEntity( md5=obj.get('md5'), sha1=obj['sha1'], sha256=obj.get('sha256'), size=size, - mimetype=obj.get('mimetype'), + mimetype=mimetype, release_ids=release_ids, urls=urls, ) -- cgit v1.2.3