From 9598a4c14800f8ec2543b26872565b1c3b9d2677 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 12:59:03 -0700 Subject: document scripts and tools a bit --- bin/cord19_fatcat_enrich.py | 2 ++ bin/deliver_file2disk.py | 2 ++ bin/fix_extensions.sh | 6 ++++++ bin/grobid2json.py | 2 ++ bin/parse_cord19_csv.py | 5 +++++ cord19_fatcat_derivatives.py | 2 ++ covid19_tool.py | 6 ++++++ fatcat_covid19/search.py | 7 +++++++ 8 files changed, 32 insertions(+) diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py index a911007..2478227 100755 --- a/bin/cord19_fatcat_enrich.py +++ b/bin/cord19_fatcat_enrich.py @@ -3,6 +3,8 @@ """ Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat metadata. + +TODO: refactor into `fatcat_covid19` module and wrapper CLI script. """ import sys diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py index f54ecb3..49e0c73 100755 --- a/bin/deliver_file2disk.py +++ b/bin/deliver_file2disk.py @@ -10,6 +10,8 @@ Behavior: - try downloading from any archive.org or web.archive.org URLs - verify SHA-1 - write out to disk + +This file is copied from the fatcat repository. """ # XXX: some broken MRO thing going on in here due to python3 object wrangling diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh index e3ddd67..9f6113c 100755 --- a/bin/fix_extensions.sh +++ b/bin/fix_extensions.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Tiny helper to rename files based on their detected mimetype. +# +# Call with no trailing slash like: +# +# ./bin/fix_extensions.sh some_dir + for file in $1/*; do TYPE=$(file --mime-type -b "$file" | cut -f2 -d/); if [[ ! 
$file =~ \.$TYPE ]]; then diff --git a/bin/grobid2json.py b/bin/grobid2json.py index 39ab222..9c2ffad 100755 --- a/bin/grobid2json.py +++ b/bin/grobid2json.py @@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered): - tables, figures, equations Prints JSON to stdout, errors to stderr + +This file is copied from the sandcrawler repository. """ import io diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py index 536e5d3..55cd81b 100755 --- a/bin/parse_cord19_csv.py +++ b/bin/parse_cord19_csv.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +""" +Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple +of the column keys. +""" + import sys import csv import json diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py index aa0382b..8b5b679 100755 --- a/cord19_fatcat_derivatives.py +++ b/cord19_fatcat_derivatives.py @@ -27,6 +27,8 @@ Keys added: - glutton_fatcat_release (renamed from fatcat_release) - fulltext_pdftotext: only if fulltext_grobid not set - body + +TODO: refactor into fatcat_covid19 module and CLI wrapper """ import sys diff --git a/covid19_tool.py b/covid19_tool.py index 5be70b4..7a565b8 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -1,5 +1,11 @@ #!/usr/bin/env python3 +""" +Wrapper CLI tool for invoking code in the `fatcat_covid19` module. + +Licensed the same as code under fatcat_covid19/ +""" + import sys import argparse diff --git a/fatcat_covid19/search.py b/fatcat_covid19/search.py index 921520c..8b90a4a 100644 --- a/fatcat_covid19/search.py +++ b/fatcat_covid19/search.py @@ -1,4 +1,11 @@ +""" +Helpers to make elasticsearch queries. + +TODO: switch to using elasticsearch-dsl library instead of requests+json. +already have a WIP branch for this in fatcat repo. +""" + import json import datetime import requests -- cgit v1.2.3