index
:
sandcrawler
bnewbold-args
bnewbold-backfill
bnewbold-persist-grobid-errors
bnewbold-refactor-loggging
master
trawler
[no description]
about
summary
refs
log
tree
commit
diff
stats
log msg
author
committer
range
path:
root
/
python
Mode
Name
Size
-rw-r--r--
.coveragerc
32
log
stats
plain
-rw-r--r--
.gitignore
29
log
stats
plain
-rw-r--r--
.pylintrc
409
log
stats
plain
-rw-r--r--
Pipfile
554
log
stats
plain
-rw-r--r--
Pipfile.lock
51124
log
stats
plain
-rw-r--r--
README.md
3563
log
stats
plain
-rw-r--r--
TODO
52
log
stats
plain
-rwxr-xr-x
backfill_hbase_from_cdx.py
2896
log
stats
plain
-rw-r--r--
common.py
2618
log
stats
plain
-rwxr-xr-x
deliver_dumpgrobid_to_s3.py
4092
log
stats
plain
-rwxr-xr-x
deliver_gwb_to_disk.py
7109
log
stats
plain
-rwxr-xr-x
deliver_gwb_to_s3.py
7663
log
stats
plain
-rwxr-xr-x
enrich_scored_matches.py
938
log
stats
plain
-rwxr-xr-x
extraction_cdx_grobid.py
11769
log
stats
plain
-rwxr-xr-x
extraction_ungrobided.py
11383
log
stats
plain
-rwxr-xr-x
filter_grobid_metadata.py
4621
log
stats
plain
-rwxr-xr-x
filter_scored_matches.py
3432
log
stats
plain
-rwxr-xr-x
grobid2json.py
5273
log
stats
plain
-rwxr-xr-x
ia_pdf_match.py
2889
log
stats
plain
-rwxr-xr-x
import_grobid_metadata.py
2426
log
stats
plain
-rwxr-xr-x
kafka_grobid.py
13599
log
stats
plain
-rwxr-xr-x
kafka_grobid_hbase.py
7413
log
stats
plain
-rwxr-xr-x
manifest_converter.py
1594
log
stats
plain
-rw-r--r--
mrjob.conf
466
log
stats
plain
-rw-r--r--
pytest.ini
171
log
stats
plain
d---------
tests
294
log
stats
plain
l---------
title_slug_blacklist.txt
->
../scalding/src/main/resources/slug-denylist.txt
48
log
stats
plain
-rw-r--r--
xml2json.py
199
log
stats
plain