diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-25 11:18:15 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-25 11:18:15 -0700 |
commit | 773d5c28e2ac6085172aaebf86031358261a7014 (patch) | |
tree | 55f3eb6e8323f206f7b8fe597cdb507381760a17 /scalding/src/main | |
parent | 4c5dbdf964da9ca29246b0f8eadec6daae1d3ffb (diff) | |
download | sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.tar.gz sandcrawler-773d5c28e2ac6085172aaebf86031358261a7014.zip |
Grobid entries without legal slugs are removed from the pipe.
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 14 |
1 files changed, 10 insertions, 4 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 56eb91e..7b7deec 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -15,7 +15,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
+    HBasePipeConversions {
+  val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
 
   // key is SHA1
   val grobidSource = HBaseCrossrefScore.getHBaseSource(
@@ -30,9 +32,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
       val (key, json) = (entry._1, entry._2)
       HBaseCrossrefScore.grobidToSlug(json) match {
         case Some(slug) => (key, json, slug)
-        case None => (key, json, "none")
+        case None => (key, json, NoTitle)
       }
     }
+    .filter { entry =>
+      val (_, _, slug) = entry
+      slug != NoTitle && slug.length > 0
+    }
     .write(TypedTsv[(String, String, String)](args("output")))
 
   /*
@@ -79,7 +85,7 @@ object HBaseCrossrefScore {
     if (map contains "title") {
       titleToSlug(map("title").asInstanceOf[String])
     } else {
-      Some("grobidToSlug None: " + map("foo"))
+      None
     }
   }
 
@@ -89,7 +95,7 @@ object HBaseCrossrefScore {
       // TODO: Don't ignore titles after the first.
       titleToSlug(map("title").asInstanceOf[List[String]](0))
     } else {
-      Some("crossRefToSlug None")
+      None
     }
   }
 
```