diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:59:03 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:59:03 -0700 | 
| commit | 9598a4c14800f8ec2543b26872565b1c3b9d2677 (patch) | |
| tree | 09631ff75bfa665da987ee0fd26fc228041ff16b | |
| parent | 3de4f762f02f95d17b912dadf64a1c00effd7f12 (diff) | |
| download | fatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.tar.gz fatcat-covid19-9598a4c14800f8ec2543b26872565b1c3b9d2677.zip | |
document scripts and tools a bit
| -rwxr-xr-x | bin/cord19_fatcat_enrich.py | 2 | ||||
| -rwxr-xr-x | bin/deliver_file2disk.py | 2 | ||||
| -rwxr-xr-x | bin/fix_extensions.sh | 6 | ||||
| -rwxr-xr-x | bin/grobid2json.py | 2 | ||||
| -rwxr-xr-x | bin/parse_cord19_csv.py | 5 | ||||
| -rwxr-xr-x | cord19_fatcat_derivatives.py | 2 | ||||
| -rwxr-xr-x | covid19_tool.py | 6 | ||||
| -rw-r--r-- | fatcat_covid19/search.py | 7 | 
8 files changed, 32 insertions, 0 deletions
| diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py index a911007..2478227 100755 --- a/bin/cord19_fatcat_enrich.py +++ b/bin/cord19_fatcat_enrich.py @@ -3,6 +3,8 @@  """  Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat  metadata. + +TODO: refactor into `fatcat_covid19` module and wrapper CLI script.  """  import sys diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py index f54ecb3..49e0c73 100755 --- a/bin/deliver_file2disk.py +++ b/bin/deliver_file2disk.py @@ -10,6 +10,8 @@ Behavior:      - try downloading from any archive.org or web.archive.org URLs      - verify SHA-1      - write out to disk + +This file is copied from the fatcat repository.  """  # XXX: some broken MRO thing going on in here due to python3 object wrangling diff --git a/bin/fix_extensions.sh b/bin/fix_extensions.sh index e3ddd67..9f6113c 100755 --- a/bin/fix_extensions.sh +++ b/bin/fix_extensions.sh @@ -1,5 +1,11 @@  #!/bin/bash +# Tiny helper to rename files based on their detected mimetype. +# +# Call with no trailing slash like: +# +#   ./bin/fix_extensions.sh some_dir +  for file in $1/*; do      TYPE=$(file --mime-type -b "$file" | cut -f2 -d/);      if [[ ! $file =~ \.$TYPE ]]; then diff --git a/bin/grobid2json.py b/bin/grobid2json.py index 39ab222..9c2ffad 100755 --- a/bin/grobid2json.py +++ b/bin/grobid2json.py @@ -21,6 +21,8 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):  - tables, figures, equations  Prints JSON to stdout, errors to stderr + +This file is copied from the sandcrawler repository.  """  import io diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py index 536e5d3..55cd81b 100755 --- a/bin/parse_cord19_csv.py +++ b/bin/parse_cord19_csv.py @@ -1,5 +1,10 @@  #!/usr/bin/env python3 +""" +Trivial helper to transform the CORD-19 CSV file to JSON, and rename a couple +of the column keys. 
+""" +  import sys  import csv  import json diff --git a/cord19_fatcat_derivatives.py b/cord19_fatcat_derivatives.py index aa0382b..8b5b679 100755 --- a/cord19_fatcat_derivatives.py +++ b/cord19_fatcat_derivatives.py @@ -27,6 +27,8 @@ Keys added:      - glutton_fatcat_release (renamed from fatcat_release)  - fulltext_pdftotext: only if fulltext_grobid not set      - body + +TODO: refactor into fatcat_covid19 module and CLI wrapper  """  import sys diff --git a/covid19_tool.py b/covid19_tool.py index 5be70b4..7a565b8 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -1,5 +1,11 @@  #!/usr/bin/env python3 +""" +Wrapper CLI tool for invoking code in the `fatcat_covid19` module. + +Licensed the same as code under fatcat_covid19/ +""" +  import sys  import argparse diff --git a/fatcat_covid19/search.py b/fatcat_covid19/search.py index 921520c..8b90a4a 100644 --- a/fatcat_covid19/search.py +++ b/fatcat_covid19/search.py @@ -1,4 +1,11 @@ +""" +Helpers to make elasticsearch queries. + +TODO: switch to using elasticsearch-dsl library instead of requests+json. +already have a WIP branch for this in fatcat repo. +""" +  import json  import datetime  import requests | 
