aboutsummaryrefslogtreecommitdiffstats
path: root/bin
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-03 12:59:03 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-03 12:59:03 -0700
commit9598a4c14800f8ec2543b26872565b1c3b9d2677 (patch)
tree09631ff75bfa665da987ee0fd26fc228041ff16b /bin
parent3de4f762f02f95d17b912dadf64a1c00effd7f12 (diff)
downloadfatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.tar.gz
fatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.zip
document scripts and tools a bit
Diffstat (limited to 'bin')
-rwxr-xr-xbin/cord19_fatcat_enrich.py2
-rwxr-xr-xbin/deliver_file2disk.py2
-rwxr-xr-xbin/fix_extensions.sh6
-rwxr-xr-xbin/grobid2json.py2
-rwxr-xr-xbin/parse_cord19_csv.py5
5 files changed, 17 insertions, 0 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py
index a911007..2478227 100755
--- a/bin/cord19_fatcat_enrich.py
+++ b/bin/cord19_fatcat_enrich.py
@@ -3,6 +3,8 @@
"""
Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
metadata.
+
+TODO: refactor into `fatcat_covid19` module and wrapper CLI script.
"""
import sys
diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py
index f54ecb3..49e0c73 100755
--- a/bin/deliver_file2disk.py
+++ b/bin/deliver_file2disk.py
@@ -10,6 +10,8 @@ Behavior:
- try downloading from any archive.org or web.archive.org URLs
- verify SHA-1
- write out to disk
+
+This file is copied from the fatcat repository.
"""
# XXX: some broken MRO thing going on in here due to python3 object wrangling
diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh
index e3ddd67..9f6113c 100755
--- a/bin/fix_extensions.sh
+++ b/bin/fix_extensions.sh
@@ -1,5 +1,11 @@
#!/bin/bash
+# Tiny helper to rename files based on their detected mimetype.
+#
+# Call with no trailing slash like:
+#
+# ./bin/fix_extensions.sh some_dir
+
for file in $1/*; do
TYPE=$(file --mime-type -b "$file" | cut -f2 -d/);
if [[ ! $file =~ \.$TYPE ]]; then
diff --git a/bin/grobid2json.py b/bin/grobid2json.py
index 39ab222..9c2ffad 100755
--- a/bin/grobid2json.py
+++ b/bin/grobid2json.py
@@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
- tables, figures, equations
Prints JSON to stdout, errors to stderr
+
+This file is copied from the sandcrawler repository.
"""
import io
diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py
index 536e5d3..55cd81b 100755
--- a/bin/parse_cord19_csv.py
+++ b/bin/parse_cord19_csv.py
@@ -1,5 +1,10 @@
#!/usr/bin/env python3
+"""
+Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple
+of the column keys.
+"""
+
import sys
import csv
import json