diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:05:59 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:05:59 -0700 | 
| commit | 70350899dda973cdf7a5cfdd941ae80319254587 (patch) | |
| tree | 19e444d4037da3124ed9c09ffeb5e8ac1ff6769a | |
| parent | c3c2760fb388059a9942a61965b79c42bc03f11b (diff) | |
| download | sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.tar.gz sandcrawler-70350899dda973cdf7a5cfdd941ae80319254587.zip  | |
handle null status_code lines
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 1 |
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/ScoreJobTest.scala | 10 |

2 files changed, 8 insertions, 3 deletions

```diff
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index f484fad..9a09e05 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -25,6 +25,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       .read
       // Can't just "fromBytesWritable" because we have multiple types?
       .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "tei_json", "status_code"))
+      .filter { case (_, tei_json, status_code) => tei_json != null && status_code != null }
       .map { case (key, tei_json, status_code) =>
         (Bytes.toString(key.copyBytes()), Bytes.toString(tei_json.copyBytes()), Bytes.toLong(status_code.copyBytes()))
       }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index f68ee1d..54ae801 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -155,10 +155,15 @@ class ScoreJobTest extends FlatSpec with Matchers {
   val Bad : Long = 400
   val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
 
-  val SampleData : List[List[Array[Byte]]] = (Sha1Strings, JsonStrings, StatusCodes)
+  val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
     .zipped
     .toList
     .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  // Add example of lines without GROBID data
+  val SampleData = SampleDataHead :+ new Tuple(
+    new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
 
   JobTest("sandcrawler.ScoreJob")
     .arg("test", "")
@@ -168,8 +173,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
     .arg("zookeeper-hosts", testHost)
     .arg("crossref-input", input)
     .arg("debug", "true")
-    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost),
-      SampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
     .source(TextLine(input), List(
       0 -> CrossrefStrings(0),
       1 -> CrossrefStrings(1),
```
