aboutsummaryrefslogtreecommitdiffstats
path: root/bin
diff options
context:
space:
mode:
Diffstat (limited to 'bin')
-rwxr-xr-x bin/cord19_fatcat_enrich.py 2
-rwxr-xr-x bin/deliver_file2disk.py 2
-rwxr-xr-x bin/fix_extensions.sh 6
-rwxr-xr-x bin/grobid2json.py 2
-rwxr-xr-x bin/parse_cord19_csv.py 5
5 files changed, 17 insertions, 0 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py
index a911007..2478227 100755
--- a/bin/cord19_fatcat_enrich.py
+++ b/bin/cord19_fatcat_enrich.py
@@ -3,6 +3,8 @@
"""
Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
metadata.
+
+TODO: refactor into `fatcat_covid19` module and wrapper CLI script.
"""
import sys
diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py
index f54ecb3..49e0c73 100755
--- a/bin/deliver_file2disk.py
+++ b/bin/deliver_file2disk.py
@@ -10,6 +10,8 @@ Behavior:
- try downloading from any archive.org or web.archive.org URLs
- verify SHA-1
- write out to disk
+
+This file is copied from the fatcat repository.
"""
# XXX: some broken MRO thing going on in here due to python3 object wrangling
diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh
index e3ddd67..9f6113c 100755
--- a/bin/fix_extensions.sh
+++ b/bin/fix_extensions.sh
@@ -1,5 +1,11 @@
#!/bin/bash
+# Tiny helper to rename files based on their detected mimetype.
+#
+# Call with no trailing slash like:
+#
+# ./bin/fix_extensions.sh some_dir
+
for file in $1/*; do
TYPE=$(file --mime-type -b "$file" | cut -f2 -d/);
if [[ ! $file =~ \.$TYPE ]]; then
diff --git a/bin/grobid2json.py b/bin/grobid2json.py
index 39ab222..9c2ffad 100755
--- a/bin/grobid2json.py
+++ b/bin/grobid2json.py
@@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
- tables, figures, equations
Prints JSON to stdout, errors to stderr
+
+This file is copied from the sandcrawler repository.
"""
import io
diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py
index 536e5d3..55cd81b 100755
--- a/bin/parse_cord19_csv.py
+++ b/bin/parse_cord19_csv.py
@@ -1,5 +1,10 @@
#!/usr/bin/env python3
+"""
+Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple
+of the column keys.
+"""
+
import sys
import csv
import json