about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x bin/cord19_fatcat_enrich.py 2
-rwxr-xr-x bin/deliver_file2disk.py 2
-rwxr-xr-x bin/fix_extensions.sh 6
-rwxr-xr-x bin/grobid2json.py 2
-rwxr-xr-x bin/parse_cord19_csv.py 5
-rwxr-xr-x cord19_fatcat_derivatives.py 2
-rwxr-xr-x covid19_tool.py 6
-rw-r--r-- fatcat_covid19/search.py 7
8 files changed, 32 insertions, 0 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py
index a911007..2478227 100755
--- a/bin/cord19_fatcat_enrich.py
+++ b/bin/cord19_fatcat_enrich.py
@@ -3,6 +3,8 @@
"""
Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
metadata.
+
+TODO: refactor into `fatcat_covid19` module and wrapper CLI script.
"""
import sys
diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py
index f54ecb3..49e0c73 100755
--- a/bin/deliver_file2disk.py
+++ b/bin/deliver_file2disk.py
@@ -10,6 +10,8 @@ Behavior:
- try downloading from any archive.org or web.archive.org URLs
- verify SHA-1
- write out to disk
+
+This file is copied from the fatcat repository.
"""
# XXX: some broken MRO thing going on in here due to python3 object wrangling
diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh
index e3ddd67..9f6113c 100755
--- a/bin/fix_extensions.sh
+++ b/bin/fix_extensions.sh
@@ -1,5 +1,11 @@
#!/bin/bash
+# Tiny helper to rename files based on their detected MIME type.
+#
+# Call with no trailing slash like:
+#
+# ./bin/fix_extensions.sh some_dir
+
for file in $1/*; do
TYPE=$(file --mime-type -b "$file" | cut -f2 -d/);
if [[ ! $file =~ \.$TYPE ]]; then
diff --git a/bin/grobid2json.py b/bin/grobid2json.py
index 39ab222..9c2ffad 100755
--- a/bin/grobid2json.py
+++ b/bin/grobid2json.py
@@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-encumbered):
- tables, figures, equations
Prints JSON to stdout, errors to stderr
+
+This file is copied from the sandcrawler repository.
"""
import io
diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py
index 536e5d3..55cd81b 100755
--- a/bin/parse_cord19_csv.py
+++ b/bin/parse_cord19_csv.py
@@ -1,5 +1,10 @@
#!/usr/bin/env python3
+"""
+Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple
+of the column keys.
+"""
+
import sys
import csv
import json
diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py
index aa0382b..8b5b679 100755
--- a/cord19_fatcat_derivatives.py
+++ b/cord19_fatcat_derivatives.py
@@ -27,6 +27,8 @@ Keys added:
- glutton_fatcat_release (renamed from fatcat_release)
- fulltext_pdftotext: only if fulltext_grobid not set
- body
+
+TODO: refactor into fatcat_covid19 module and CLI wrapper
"""
import sys
diff --git a/covid19_tool.py b/covid19_tool.py
index 5be70b4..7a565b8 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -1,5 +1,11 @@
#!/usr/bin/env python3
+"""
+Wrapper CLI tool for invoking code in the `fatcat_covid19` module.
+
+Licensed under the same terms as the code under fatcat_covid19/
+"""
+
import sys
import argparse
diff --git a/fatcat_covid19/search.py b/fatcat_covid19/search.py
index 921520c..8b90a4a 100644
--- a/fatcat_covid19/search.py
+++ b/fatcat_covid19/search.py
@@ -1,4 +1,11 @@
+"""
+Helpers to make elasticsearch queries.
+
+TODO: switch to using the elasticsearch-dsl library instead of requests+json.
+There is already a WIP branch for this in the fatcat repo.
+"""
+
import json
import datetime
import requests