From 9598a4c14800f8ec2543b26872565b1c3b9d2677 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 12:59:03 -0700 Subject: document scripts and tools a bit --- bin/cord19_fatcat_enrich.py | 2 ++ bin/deliver_file2disk.py | 2 ++ bin/fix_extensions.sh | 6 ++++++ bin/grobid2json.py | 2 ++ bin/parse_cord19_csv.py | 5 +++++ 5 files changed, 17 insertions(+) (limited to 'bin') diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py index a911007..2478227 100755 --- a/bin/cord19_fatcat_enrich.py +++ b/bin/cord19_fatcat_enrich.py @@ -3,6 +3,8 @@ """ Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat metadata. + +TODO: refactor into `fatcat_covid19` module and wrapper CLI script. """ import sys diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py index f54ecb3..49e0c73 100755 --- a/bin/deliver_file2disk.py +++ b/bin/deliver_file2disk.py @@ -10,6 +10,8 @@ Behavior: - try downloading from any archive.org or web.archive.org URLs - verify SHA-1 - write out to disk + +This file is copied from the fatcat repository. """ # XXX: some broken MRO thing going on in here due to python3 object wrangling diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh index e3ddd67..9f6113c 100755 --- a/bin/fix_extensions.sh +++ b/bin/fix_extensions.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Tiny helper to rename files based on their detected mimetype. +# +# Call with no trailing slash like: +# +# ./bin/fix_extensions.sh some_dir + for file in $1/*; do TYPE=$(file --mime-type -b "$file" | cut -f2 -d/); if [[ ! $file =~ \.$TYPE ]]; then diff --git a/bin/grobid2json.py b/bin/grobid2json.py index 39ab222..9c2ffad 100755 --- a/bin/grobid2json.py +++ b/bin/grobid2json.py @@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered): - tables, figures, equations Prints JSON to stdout, errors to stderr + +This file is copied from the sandcrawler repository. 
""" import io diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py index 536e5d3..55cd81b 100755 --- a/bin/parse_cord19_csv.py +++ b/bin/parse_cord19_csv.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +""" +Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple +of the column keys. +""" + import sys import csv import json -- cgit v1.2.3