summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/README_import.md 2
-rwxr-xr-x python/fatcat_import.py 6
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 4
-rw-r--r-- python/fatcat_tools/importers/matched.py 9
4 files changed, 15 insertions, 6 deletions
diff --git a/python/README_import.md b/python/README_import.md
index d4abe400..04b838f8 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -83,5 +83,5 @@ Run import in parallel:
export FATCAT_EDITGROUP_DESCRIPTION="File/DOI matching to user-uploaded pre-1923 and pre-1909 paper corpus on archive.org"
export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_ARCHIVE_ORG)
- zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
+ zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - --default-mimetype 'application/pdf'
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index c421fb43..6b1a10b1 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -29,7 +29,8 @@ def run_journal_metadata(args):
def run_matched(args):
fmi = MatchedImporter(args.api,
edit_batch_size=args.batch_size,
- editgroup_description=args.editgroup_description_override)
+ editgroup_description=args.editgroup_description_override,
+ default_mimetype=args.default_mimetype)
JsonLinePusher(fmi, args.json_file).run()
def run_arabesque_match(args):
@@ -168,6 +169,9 @@ def main():
sub_matched.add_argument('json_file',
help="JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
+ sub_matched.add_argument('--default-mimetype',
+ default=None,
+ help="default mimetype for imported files (if not specified per-file)")
sub_matched.add_argument('--bezerk-mode',
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index ea38ec2f..e9376d96 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -66,7 +66,7 @@ class ArabesqueMatchImporter(EntityImporter):
self.extid_type = extid_type
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
- self.default_mime = kwargs.get("default_mime", None)
+ self.default_mimetype = kwargs.get("default_mimetype", None)
self.do_updates = kwargs.get("do_updates", False)
self.require_grobid = require_grobid
if self.require_grobid:
@@ -136,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter):
fe = fatcat_client.FileEntity(
sha1=b32_hex(row['final_sha1']),
- mimetype=row['final_mimetype'],
+ mimetype=row['final_mimetype'] or self.default_mimetype,
release_ids=[re.ident],
urls=urls,
)
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 4d78fdc9..04ce4573 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -42,7 +42,7 @@ class MatchedImporter(EntityImporter):
editgroup_extra=eg_extra,
**kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
- self.default_mime = kwargs.get("default_mime", None)
+ self.default_mimetype = kwargs.get("default_mimetype", None)
def want(self, raw_record):
return True
@@ -100,12 +100,17 @@ class MatchedImporter(EntityImporter):
if size:
size = int(size)
+ mimetype = obj.get('mimetype', self.default_mimetype)
+ if not mimetype and urls:
+ if urls[0].url.endswith('.pdf'):
+ mimetype = 'application/pdf'
+
fe = fatcat_client.FileEntity(
md5=obj.get('md5'),
sha1=obj['sha1'],
sha256=obj.get('sha256'),
size=size,
- mimetype=obj.get('mimetype'),
+ mimetype=mimetype,
release_ids=release_ids,
urls=urls,
)