about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce/extraction_cdx_grobid.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
commit: 114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree: df929050d3aa9484f78e5c1807bc951ce1e85512 /mapreduce/extraction_cdx_grobid.py
parent: e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download: sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz
sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip
small grobid2json test
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index a4a13f8..63f290a 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -61,6 +61,7 @@ class MRExtractCdxGrobid(MRJob):
r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
files={'input': content})
if r.status_code is not 200:
+ # if invalid file, get a 400 with JSON body with 'description' key (and others)
# XXX:
return None
return r