From 4d56e71598457489e9f71ef4ce5c9b0254a4cce1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 May 2019 10:42:20 -0700 Subject: fix default mimetype (impacted pre-1923 files) --- python/README_import.md | 2 +- python/fatcat_import.py | 6 +++++- python/fatcat_tools/importers/arabesque.py | 4 ++-- python/fatcat_tools/importers/matched.py | 9 +++++++-- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'python') diff --git a/python/README_import.md b/python/README_import.md index d4abe400..04b838f8 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -83,5 +83,5 @@ Run import in parallel: export FATCAT_EDITGROUP_DESCRIPTION="File/DOI matching to user-uploaded pre-1923 and pre-1909 paper corpus on archive.org" export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_ARCHIVE_ORG) - zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - + zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - --default-mime 'application/pdf' diff --git a/python/fatcat_import.py b/python/fatcat_import.py index c421fb43..6b1a10b1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -29,7 +29,8 @@ def run_journal_metadata(args): def run_matched(args): fmi = MatchedImporter(args.api, edit_batch_size=args.batch_size, - editgroup_description=args.editgroup_description_override) + editgroup_description=args.editgroup_description_override, + default_mimetype=args.default_mimetype) JsonLinePusher(fmi, args.json_file).run() def run_arabesque_match(args): @@ -168,6 +169,9 @@ def main(): sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) + sub_matched.add_argument('--default-mimetype', + default=None, + help="default mimetype for imported files (if not specified per-file)") sub_matched.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index ea38ec2f..e9376d96 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -66,7 +66,7 @@ class ArabesqueMatchImporter(EntityImporter): self.extid_type = extid_type self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.default_mime = kwargs.get("default_mime", None) + self.default_mimetype = kwargs.get("default_mimetype", None) self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: @@ -136,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter): fe = fatcat_client.FileEntity( sha1=b32_hex(row['final_sha1']), - mimetype=row['final_mimetype'], + mimetype=row['final_mimetype'] or self.default_mimetype, release_ids=[re.ident], urls=urls, ) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 4d78fdc9..04ce4573 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -42,7 +42,7 @@ class MatchedImporter(EntityImporter): editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") - self.default_mime = kwargs.get("default_mime", None) + self.default_mimetype = kwargs.get("default_mimetype", None) def want(self, raw_record): return True @@ -100,12 +100,17 @@ class MatchedImporter(EntityImporter): if size: size = int(size) + mimetype = obj.get('mimetype', self.default_mimetype) + if not mimetype and urls: + if urls[0].url.endswith('.pdf'): + mimetype = 'application/pdf' + fe = fatcat_client.FileEntity( md5=obj.get('md5'), sha1=obj['sha1'], sha256=obj.get('sha256'), size=size, - mimetype=obj.get('mimetype'), + mimetype=mimetype, release_ids=release_ids, urls=urls, ) -- cgit v1.2.3