diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:59:03 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:59:03 -0700 |
commit | 9598a4c14800f8ec2543b26872565b1c3b9d2677 (patch) | |
tree | 09631ff75bfa665da987ee0fd26fc228041ff16b /bin | |
parent | 3de4f762f02f95d17b912dadf64a1c00effd7f12 (diff) | |
download | fatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.tar.gz fatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.zip |
document scripts and tools a bit
Diffstat (limited to 'bin')
-rwxr-xr-x | bin/cord19_fatcat_enrich.py | 2 | ||||
-rwxr-xr-x | bin/deliver_file2disk.py | 2 | ||||
-rwxr-xr-x | bin/fix_extensions.sh | 6 | ||||
-rwxr-xr-x | bin/grobid2json.py | 2 | ||||
-rwxr-xr-x | bin/parse_cord19_csv.py | 5 |
5 files changed, 17 insertions, 0 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py index a911007..2478227 100755 --- a/bin/cord19_fatcat_enrich.py +++ b/bin/cord19_fatcat_enrich.py @@ -3,6 +3,8 @@ """ Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat metadata. + +TODO: refactor into `fatcat_covid19` module and wrapper CLI script. """ import sys diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py index f54ecb3..49e0c73 100755 --- a/bin/deliver_file2disk.py +++ b/bin/deliver_file2disk.py @@ -10,6 +10,8 @@ Behavior: - try downloading from any archive.org or web.archive.org URLs - verify SHA-1 - write out to disk + +This file is copied from the fatcat repository. """ # XXX: some broken MRO thing going on in here due to python3 object wrangling diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh index e3ddd67..9f6113c 100755 --- a/bin/fix_extensions.sh +++ b/bin/fix_extensions.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Tiny helper to rename files based on their detected mimetype. +# +# Call with no trailing slash like: +# +# ./bin/fix_extensions.sh some_dir + for file in $1/*; do TYPE=$(file --mime-type -b "$file" | cut -f2 -d/); if [[ ! $file =~ \.$TYPE ]]; then diff --git a/bin/grobid2json.py b/bin/grobid2json.py index 39ab222..9c2ffad 100755 --- a/bin/grobid2json.py +++ b/bin/grobid2json.py @@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered): - tables, figures, equations Prints JSON to stdout, errors to stderr + +This file is copied from the sandcrawler repository. """ import io diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py index 536e5d3..55cd81b 100755 --- a/bin/parse_cord19_csv.py +++ b/bin/parse_cord19_csv.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +""" +Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple +of the column keys. +""" + import sys import csv import json |