aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
authorEllen Spertus <ellen.spertus@gmail.com>2018-07-26 04:36:43 -0700
committerEllen Spertus <ellen.spertus@gmail.com>2018-07-26 04:36:43 -0700
commit6d2bb4787150682236f4c349f8e469026fe3d490 (patch)
tree9c86515c4280c87c2f382d92213a8ef3cd8e18eb /scalding/src/main/scala
parent15ae7006cd8238bb9453f27be6aa5388a6002ce8 (diff)
downloadsandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.tar.gz
sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.zip
Computes and outputs (score, sha1, doi, grobidTitle, crossrefTitle).
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala73
1 files changed, 49 insertions, 24 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7e10c43..714af36 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -3,6 +3,7 @@ package sandcrawler
import java.util.Arrays
import java.util.Properties
+import scala.math
import scala.util.parsing.json.JSON
import cascading.tuple.Fields
@@ -17,11 +18,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
- HBasePipeConversions {
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
- /*
// key is SHA1
val grobidSource = HBaseCrossrefScore.getHBaseSource(
args("hbase-table"),
@@ -29,13 +28,14 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
.read
.fromBytesWritable(new Fields("key", "tei_json"))
- .debug // Should be 4 tuples for mocked data
+ // .debug // Should be 4 tuples for mocked data
.toTypedPipe[(String, String)]('key, 'tei_json)
.map { entry =>
val (key, json) = (entry._1, entry._2)
+ // TODO: Consider passing forward only a subset of JSON.
HBaseCrossrefScore.grobidToSlug(json) match {
- case Some(slug) => (slug, key, json)
- case None => (NoTitle, key, json)
+ case Some(slug) => (slug, key, json)
+ case None => (NoTitle, key, json)
}
}
.filter { entry =>
@@ -46,15 +46,12 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val grobidGroup = grobidPipe
.groupBy { case (slug, key, json) => slug }
- */
val crossrefSource = TextLine(args("crossref-input"))
- val crossrefPipe : TypedPipe[String] = crossrefSource
+ val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
.read
- .debug // Should be 4 tuples for mocked data
+ // .debug // Should be 4 tuples for mocked data
.toTypedPipe[String]('line)
- /*
- .map{line : String => (line, "foo")}
.map{ json : String =>
HBaseCrossrefScore.crossrefToSlug(json) match {
case Some(slug) => (slug, json)
@@ -65,26 +62,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val (slug, json) = entry
slug != NoTitle
}
- */
- .write(TypedTsv[String](args("output")))
-
- /*
val crossrefGroup = crossrefPipe
.groupBy { case (slug, json) => slug }
- // TODO: Figure out which is smaller.
- val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
+ val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
grobidGroup.join(crossrefGroup)
theJoin.map{ entry =>
- val (slug : String,
- ((slug0: String, sha1 : String, grobidJson : String),
- (slug1 : String, crossrefJson : String))) = entry
- // TODO: For now, output it all.
- (slug, slug0, slug1, sha1, grobidJson, crossrefJson)}
- .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
- */
+ val (slug : String,
+ ((slug0: String, sha1 : String, grobidJson : String),
+ (slug1 : String, crossrefJson : String))) = entry
+ HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
+ .debug
+ // Output: score, sha1, doi, grobid title, crossref title
+ .write(TypedTsv[(Int, String, String, String, String)](args("output")))
}
@@ -137,4 +129,37 @@ object HBaseCrossrefScore {
Some(slug)
}
}
+
+ val FullTitleMatch = 100
+ val TitleLeftMatchBase = 50
+ val MaxTitleLeftMatch = 80
+ val SlugMatch = 25
+
+ def computeSimilarity(gTitle : String, cTitle : String) : Int = {
+ assert(titleToSlug(gTitle) == titleToSlug(cTitle))
+ if (gTitle == cTitle) {
+ FullTitleMatch
+ } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) {
+ math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length),
+ MaxTitleLeftMatch)
+ } else {
+ SlugMatch
+ }
+ }
+
+ def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
+ // (score, sha1, doi, grobidTitle, crossrefTitle)
+ (Int, String, String, String, String) = {
+ // JSON has already been validated in previous stages.
+ val grobid = jsonToMap(grobidJson)
+ val crossref = jsonToMap(crossrefJson)
+
+ val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase()
+ val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
+ (computeSimilarity(grobidTitle, crossrefTitle),
+ sha1,
+ crossref("DOI").asInstanceOf[String],
+ "'" + grobidTitle + "'",
+ "'" + crossrefTitle + "'")
+ }
}