diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/README_import.md | 2 | ||||
| -rwxr-xr-x | python/fatcat_import.py | 6 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 9 | 
4 files changed, 15 insertions, 6 deletions
| diff --git a/python/README_import.md b/python/README_import.md index d4abe400..04b838f8 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -83,5 +83,5 @@ Run import in parallel:      export FATCAT_EDITGROUP_DESCRIPTION="File/DOI matching to user-uploaded pre-1923 and pre-1909 paper corpus on archive.org"      export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_ARCHIVE_ORG) -    zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - +    zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - --default-mime 'application/pdf' diff --git a/python/fatcat_import.py b/python/fatcat_import.py index c421fb43..6b1a10b1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -29,7 +29,8 @@ def run_journal_metadata(args):  def run_matched(args):      fmi = MatchedImporter(args.api,          edit_batch_size=args.batch_size, -        editgroup_description=args.editgroup_description_override) +        editgroup_description=args.editgroup_description_override, +        default_mimetype=args.default_mimetype)      JsonLinePusher(fmi, args.json_file).run()  def run_arabesque_match(args): @@ -168,6 +169,9 @@ def main():      sub_matched.add_argument('json_file',          help="JSON file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) +    sub_matched.add_argument('--default-mimetype', +        default=None, +        help="default mimetype for imported files (if not specified per-file)")      sub_matched.add_argument('--bezerk-mode',          action='store_true',          help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index ea38ec2f..e9376d96 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -66,7 +66,7 @@ class ArabesqueMatchImporter(EntityImporter):          self.extid_type = extid_type          self.default_link_rel = kwargs.get("default_link_rel", "web")          assert self.default_link_rel -        self.default_mime = kwargs.get("default_mime", None) +        self.default_mimetype = kwargs.get("default_mimetype", None)          self.do_updates = kwargs.get("do_updates", False)          self.require_grobid = require_grobid          if self.require_grobid: @@ -136,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter):          fe = fatcat_client.FileEntity(              sha1=b32_hex(row['final_sha1']), -            mimetype=row['final_mimetype'], +            mimetype=row['final_mimetype'] or self.default_mimetype,              release_ids=[re.ident],              urls=urls,          ) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 4d78fdc9..04ce4573 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -42,7 +42,7 @@ class MatchedImporter(EntityImporter):              editgroup_extra=eg_extra,              **kwargs)          self.default_link_rel = kwargs.get("default_link_rel", "web") -        self.default_mime = kwargs.get("default_mime", None) +        self.default_mimetype = kwargs.get("default_mimetype", None)      def want(self, raw_record):          return True @@ -100,12 +100,17 @@ class MatchedImporter(EntityImporter):          if size:              size = int(size) +        mimetype = obj.get('mimetype', self.default_mimetype) +        if not mimetype and urls: +            if urls[0].url.endswith('.pdf'): +                mimetype = 'application/pdf' +          fe = fatcat_client.FileEntity(              md5=obj.get('md5'),              sha1=obj['sha1'],              sha256=obj.get('sha256'),              size=size, -            mimetype=obj.get('mimetype'), +            mimetype=mimetype,              release_ids=release_ids,              urls=urls,          ) | 
