index
:
sandcrawler
bnewbold-args
bnewbold-backfill
bnewbold-persist-grobid-errors
bnewbold-refactor-loggging
master
trawler
[no description]
about
summary
refs
log
tree
commit
diff
stats
log msg
author
committer
range
path:
root
/
python
Mode
Name
Size
-rw-r--r--
.coveragerc
32
log
stats
plain
-rw-r--r--
.gitignore
29
log
stats
plain
-rw-r--r--
.pylintrc
409
log
stats
plain
-rw-r--r--
Pipfile
522
log
stats
plain
-rw-r--r--
Pipfile.lock
52174
log
stats
plain
-rw-r--r--
README.md
3563
log
stats
plain
-rw-r--r--
TODO
52
log
stats
plain
-rwxr-xr-x
backfill_hbase_from_cdx.py
2896
log
stats
plain
-rw-r--r--
common.py
2618
log
stats
plain
-rwxr-xr-x
enrich_scored_matches.py
938
log
stats
plain
-rwxr-xr-x
extraction_cdx_grobid.py
11023
log
stats
plain
-rwxr-xr-x
extraction_ungrobided.py
10653
log
stats
plain
-rwxr-xr-x
filter_grobid_metadata.py
4621
log
stats
plain
-rwxr-xr-x
filter_scored_matches.py
3432
log
stats
plain
-rwxr-xr-x
grobid2json.py
5122
log
stats
plain
-rwxr-xr-x
import_grobid_metadata.py
2426
log
stats
plain
-rwxr-xr-x
kafka_grobid.py
11322
log
stats
plain
-rwxr-xr-x
kafka_grobided_hbase.py
6220
log
stats
plain
-rwxr-xr-x
manifest_converter.py
1594
log
stats
plain
-rw-r--r--
mrjob.conf
466
log
stats
plain
-rw-r--r--
pytest.ini
171
log
stats
plain
d---------
tests
294
log
stats
plain
l---------
title_slug_blacklist.txt
->
../scalding/src/main/resources/slug-denylist.txt
48
log
stats
plain
-rw-r--r--
xml2json.py
199
log
stats
plain