diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-25 11:18:15 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-25 11:18:15 -0700 | 
| commit | 773d5c28e2ac6085172aaebf86031358261a7014 (patch) | |
| tree | 55f3eb6e8323f206f7b8fe597cdb507381760a17 /scalding | |
| parent | 4c5dbdf964da9ca29246b0f8eadec6daae1d3ffb (diff) | |
| download | sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.tar.gz sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.zip | |
Grobid entries without legal slugs are removed from the pipe.
Diffstat (limited to 'scalding')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 14 |
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 18 |
2 files changed, 22 insertions, 10 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 56eb91e..7b7deec 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -15,7 +15,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode  import parallelai.spyglass.hbase.HBasePipeConversions  import parallelai.spyglass.hbase.HBaseSource -class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions { +class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with +    HBasePipeConversions { +  val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable    // key is SHA1    val grobidSource = HBaseCrossrefScore.getHBaseSource( @@ -30,9 +32,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv        val (key, json) = (entry._1, entry._2)        HBaseCrossrefScore.grobidToSlug(json) match {            case Some(slug) => (key, json, slug) -          case None => (key, json, "none") +          case None => (key, json, NoTitle)        }      } +    .filter { entry => +      val (_, _, slug) = entry +      slug != NoTitle && slug.length > 0 +    }      .write(TypedTsv[(String, String, String)](args("output")))  /* @@ -79,7 +85,7 @@ object HBaseCrossrefScore {      if (map contains "title") {        titleToSlug(map("title").asInstanceOf[String])      } else { -      Some("grobidToSlug None: " + map("foo")) +      None      }    } @@ -89,7 +95,7 @@ object HBaseCrossrefScore {        // TODO: Don't ignore titles after the first.        
titleToSlug(map("title").asInstanceOf[List[String]](0))      } else { -      Some("crossRefToSlug None") +      None      }    } diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index 0d681b9..d70c8f2 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -163,7 +163,7 @@ class HBaseCrossrefScoreTest extends FunSpec with TupleConversions {      List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))),      List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))),      List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))), -    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title4")))) +    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString)))    JobTest("sandcrawler.HBaseCrossrefScoreJob")      .arg("test", "") @@ -180,13 +180,19 @@ class HBaseCrossrefScoreTest extends FunSpec with TupleConversions {        "1" -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))))      .sink[(String, String, String)](TypedTsv[(String, String, String)](output)) {        outputBuffer => -      it("should return a 4-element list.") { -        assert(outputBuffer.size === 4) +      it("should return a 3-element list.") { +        assert(outputBuffer.size === 3)        } -      it("should return the right slugs.") { -        val (sha1, json, slug) = outputBuffer(0) -        assert(slug == "title1") +      it("should return the right first slug.") { +        val (_, _, slug0) = outputBuffer(0) +        assert(slug0 == "title1")        } +      /* 
+      it("should return the right last slug.") { +        val (_, _, slug3) = outputBuffer(3) +        assert(slug3 == "foo") +      } +       */      }      .run      .finish | 
