summary refs log tree commit diff stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 4
-rw-r--r-- python/fatcat_tools/importers/matched.py 9
2 files changed, 9 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index ea38ec2f..e9376d96 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -66,7 +66,7 @@ class ArabesqueMatchImporter(EntityImporter):
self.extid_type = extid_type
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
- self.default_mime = kwargs.get("default_mime", None)
+ self.default_mimetype = kwargs.get("default_mimetype", None)
self.do_updates = kwargs.get("do_updates", False)
self.require_grobid = require_grobid
if self.require_grobid:
@@ -136,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter):
fe = fatcat_client.FileEntity(
sha1=b32_hex(row['final_sha1']),
- mimetype=row['final_mimetype'],
+ mimetype=row['final_mimetype'] or self.default_mimetype,
release_ids=[re.ident],
urls=urls,
)
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 4d78fdc9..04ce4573 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -42,7 +42,7 @@ class MatchedImporter(EntityImporter):
editgroup_extra=eg_extra,
**kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
- self.default_mime = kwargs.get("default_mime", None)
+ self.default_mimetype = kwargs.get("default_mimetype", None)
def want(self, raw_record):
return True
@@ -100,12 +100,17 @@ class MatchedImporter(EntityImporter):
if size:
size = int(size)
+ mimetype = obj.get('mimetype', self.default_mimetype)
+ if not mimetype and urls:
+ if urls[0].url.endswith('.pdf'):
+ mimetype = 'application/pdf'
+
fe = fatcat_client.FileEntity(
md5=obj.get('md5'),
sha1=obj['sha1'],
sha256=obj.get('sha256'),
size=size,
- mimetype=obj.get('mimetype'),
+ mimetype=mimetype,
release_ids=release_ids,
urls=urls,
)