diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:13:43 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:14:26 -0700 |
commit | a0be9706997182b18e48000375c462856aafc5ef (patch) | |
tree | 30a7ea934c041275adfeaae1d8f9d6349994e335 /mapreduce | |
parent | 0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de (diff) | |
download | sandcrawler-a0be9706997182b18e48000375c462856aafc5ef.tar.gz sandcrawler-a0be9706997182b18e48000375c462856aafc5ef.zip |
TODO updates
Diffstat (limited to 'mapreduce')
-rw-r--r-- | mapreduce/TODO | 6 | ||||
-rwxr-xr-x | mapreduce/backfill_hbase_from_cdx.py | 7 |
2 files changed, 2 insertions, 11 deletions
diff --git a/mapreduce/TODO b/mapreduce/TODO
index 3459752..4f4db16 100644
--- a/mapreduce/TODO
+++ b/mapreduce/TODO
@@ -1,6 +1,4 @@
-- better test coverage (actually check coverage!)
-- use pre-mapper command to filter down, eg, by status type?
+- quality scoring (of JSON output)
+- use pre-mapper `grep` command to filter down, eg, by status?
 - automation/docs for bundling virtualenv along
 - think about speedups
-- abstract CDX line reading and HBase stuff out into a common library
-- actual GROBID_SERVER="http://wbgrp-svc096.us.archive.org:8070"
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 72331b0..6b2ec0b 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -7,13 +7,6 @@ formats.
 Requires:
 - happybase
 - mrjob
-
-TODO:
-- argparse
-- refactor into an object
-- tests in separate file
-- nose tests
-- sentry integration for error reporting
 """
 
 import json