diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 12:52:03 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 12:52:03 -0700 | 
| commit | f54e47ace6dd041e78e10ee25573c6ad3de808eb (patch) | |
| tree | 21931c4efd9997d69227623aba0cff224907dd44 | |
| parent | b628b7026ab8e7abf4beeaaad99d831b49578483 (diff) | |
| download | sandcrawler-f54e47ace6dd041e78e10ee25573c6ad3de808eb.tar.gz sandcrawler-f54e47ace6dd041e78e10ee25573c6ad3de808eb.zip | |
Added title length filtering to GrobidScorable
3 files changed, 46 insertions, 2 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index e510f75..76f4f22 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -31,11 +31,27 @@ class GrobidScorable extends Scorable with HBasePipeConversions {        }        // TODO: Should I combine next two stages for efficiency?        .collect { case (key, json, StatusOK) => (key, json) } +      .filter { case (key, json) => GrobidScorable.keepRecord(json) }        .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }    }  }  object GrobidScorable { +  def keepRecord(json : String) : Boolean = { +    Scorable.jsonToMap(json) match { +      case None => false +      case Some(map) => { +        if (map contains "title") { +          val title = Scorable.getString(map, "title") +          title != null && title.length <= Scorable.MaxTitleLength +        } else { +          false +        } +      } +    } +  } + +    def getHBaseSource(table : String, host : String) : HBaseSource = {      HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)    } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 9b9c633..c704ed9 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -30,6 +30,7 @@ abstract class Scorable {  }  object Scorable { +  val MaxTitleLength = 255    val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable    def isValidSlug(slug : String) : Boolean = { diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 661824b..620998e 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -57,7 +57,9 @@ class GrobidScorableTest extends FlatSpec with Matchers {    "annex": null  }  """ -  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") +  val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") +  val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") +  val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")    val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")    val MalformedGrobidString = GrobidString.replace("}", "")    val Key = "Dummy Key" @@ -69,13 +71,18 @@ class GrobidScorableTest extends FlatSpec with Matchers {      result.slug shouldBe Scorable.NoSlug    } +  it should "handle null title" in { +    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) +    result.slug shouldBe Scorable.NoSlug +  } +    it should "handle missing title" in {      val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)      result.slug shouldBe Scorable.NoSlug    }    it should "handle valid input" in { -    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle) +    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)      result.slug shouldBe "dummyexamplefile"      Scorable.jsonToMap(result.json) match {        case None => fail() @@ -85,4 +92,24 @@ class GrobidScorableTest extends FlatSpec with Matchers {        }      }    } + +  "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in { +    GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true +  } + +  it should "return false for valid JSON with excessively long title" in { +    GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false +  } + +  it should "return false for valid JSON with null title" in { +    GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false +  } + +  it should "return false for valid JSON with no title" in { +    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false +  } + +  it should "return false for invalid JSON" in { +    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false +  }  } | 
