aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:05:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:05:59 -0700
commit 70350899dda973cdf7a5cfdd941ae80319254587 (patch)
tree 19e444d4037da3124ed9c09ffeb5e8ac1ff6769a /scalding/src
parent c3c2760fb388059a9942a61965b79c42bc03f11b (diff)
download sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.tar.gz
sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.zip
handle null status_code lines
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 1
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 10
2 files changed, 8 insertions, 3 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index f484fad..9a09e05 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -25,6 +25,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
.read
// Can't just "fromBytesWritable" because we have multiple types?
.toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "tei_json", "status_code"))
+ .filter { case (_, tei_json, status_code) => tei_json != null && status_code != null }
.map { case (key, tei_json, status_code) =>
(Bytes.toString(key.copyBytes()), Bytes.toString(tei_json.copyBytes()), Bytes.toLong(status_code.copyBytes()))
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index f68ee1d..54ae801 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -155,10 +155,15 @@ class ScoreJobTest extends FlatSpec with Matchers {
val Bad : Long = 400
val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
- val SampleData : List[List[Array[Byte]]] = (Sha1Strings, JsonStrings, StatusCodes)
+ val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
.zipped
.toList
.map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+ .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+ // Add example of lines without GROBID data
+ val SampleData = SampleDataHead :+ new Tuple(
+ new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
JobTest("sandcrawler.ScoreJob")
.arg("test", "")
@@ -168,8 +173,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
.arg("zookeeper-hosts", testHost)
.arg("crossref-input", input)
.arg("debug", "true")
- .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost),
- SampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
.source(TextLine(input), List(
0 -> CrossrefStrings(0),
1 -> CrossrefStrings(1),