aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 04:36:43 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 04:36:43 -0700
commit: 6d2bb4787150682236f4c349f8e469026fe3d490 (patch)
tree: 9c86515c4280c87c2f382d92213a8ef3cd8e18eb /scalding
parent: 15ae7006cd8238bb9453f27be6aa5388a6002ce8 (diff)
download: sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.tar.gz
sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.zip
Computes and outputs (score, sha1, doi, grobidTitle, crossrefTitle).
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 73
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala 31
2 files changed, 71 insertions, 33 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7e10c43..714af36 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -3,6 +3,7 @@ package sandcrawler
import java.util.Arrays
import java.util.Properties
+import scala.math
import scala.util.parsing.json.JSON
import cascading.tuple.Fields
@@ -17,11 +18,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
- HBasePipeConversions {
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
- /*
// key is SHA1
val grobidSource = HBaseCrossrefScore.getHBaseSource(
args("hbase-table"),
@@ -29,13 +28,14 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
.read
.fromBytesWritable(new Fields("key", "tei_json"))
- .debug // Should be 4 tuples for mocked data
+ // .debug // Should be 4 tuples for mocked data
.toTypedPipe[(String, String)]('key, 'tei_json)
.map { entry =>
val (key, json) = (entry._1, entry._2)
+ // TODO: Consider passing forward only a subset of JSON.
HBaseCrossrefScore.grobidToSlug(json) match {
- case Some(slug) => (slug, key, json)
- case None => (NoTitle, key, json)
+ case Some(slug) => (slug, key, json)
+ case None => (NoTitle, key, json)
}
}
.filter { entry =>
@@ -46,15 +46,12 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val grobidGroup = grobidPipe
.groupBy { case (slug, key, json) => slug }
- */
val crossrefSource = TextLine(args("crossref-input"))
- val crossrefPipe : TypedPipe[String] = crossrefSource
+ val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
.read
- .debug // Should be 4 tuples for mocked data
+ // .debug // Should be 4 tuples for mocked data
.toTypedPipe[String]('line)
- /*
- .map{line : String => (line, "foo")}
.map{ json : String =>
HBaseCrossrefScore.crossrefToSlug(json) match {
case Some(slug) => (slug, json)
@@ -65,26 +62,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
val (slug, json) = entry
slug != NoTitle
}
- */
- .write(TypedTsv[String](args("output")))
-
- /*
val crossrefGroup = crossrefPipe
.groupBy { case (slug, json) => slug }
- // TODO: Figure out which is smaller.
- val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
+ val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
grobidGroup.join(crossrefGroup)
theJoin.map{ entry =>
- val (slug : String,
- ((slug0: String, sha1 : String, grobidJson : String),
- (slug1 : String, crossrefJson : String))) = entry
- // TODO: For now, output it all.
- (slug, slug0, slug1, sha1, grobidJson, crossrefJson)}
- .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
- */
+ val (slug : String,
+ ((slug0: String, sha1 : String, grobidJson : String),
+ (slug1 : String, crossrefJson : String))) = entry
+ HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
+ .debug
+ // Output: score, sha1, doi, grobid title, crossref title
+ .write(TypedTsv[(Int, String, String, String, String)](args("output")))
}
@@ -137,4 +129,37 @@ object HBaseCrossrefScore {
Some(slug)
}
}
+
+ val FullTitleMatch = 100 // Score returned when the two titles are equal
+ val TitleLeftMatchBase = 50 // Starting score when one title is a prefix of the other
+ val MaxTitleLeftMatch = 80 // Upper bound on the prefix-match score
+ val SlugMatch = 25 // Score returned when the titles differ and neither is a prefix of the other
+
+ def computeSimilarity(gTitle : String, cTitle : String) : Int = { // Scores a (grobid title, crossref title) pair using the constants above
+ assert(titleToSlug(gTitle) == titleToSlug(cTitle)) // Precondition: both titles must produce the same slug
+ if (gTitle == cTitle) {
+ FullTitleMatch
+ } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) {
+ math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length), // NOTE(review): score grows with the length difference between the titles - confirm this is intended
+ MaxTitleLeftMatch)
+ } else {
+ SlugMatch
+ }
+ }
+
+ def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
+ // Builds one output row: (score, sha1, doi, grobidTitle, crossrefTitle)
+ (Int, String, String, String, String) = {
+ // JSON has already been validated in previous stages, so the lookups and casts below are unguarded.
+ val grobid = jsonToMap(grobidJson)
+ val crossref = jsonToMap(crossrefJson)
+
+ val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase() // Grobid "title" is read as a single string
+ val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase() // Crossref "title" is read as a list; only its first entry is used
+ (computeSimilarity(grobidTitle, crossrefTitle),
+ sha1,
+ crossref("DOI").asInstanceOf[String],
+ "'" + grobidTitle + "'", // Both titles are emitted lowercased and wrapped in single quotes
+ "'" + crossrefTitle + "'")
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index bd9dcd3..e6211a2 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -163,10 +163,14 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
val (testTable, testHost) = ("test-table", "dummy-host:2181")
val grobidSampleData = List(
- List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))),
- List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))),
- List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString)))
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+ List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
+ Bytes.toBytes(MalformedGrobidString)))
JobTest("sandcrawler.HBaseCrossrefScoreJob")
.arg("test", "")
@@ -180,18 +184,27 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
.source(TextLine(input), List(
0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
- 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.5"),
- 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.75"),
+ 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+ 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
- .sink[String](TypedTsv[String](output)) {
+ .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+ String, String, String, String)](output)) {
+ // Grobid titles:
+ // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+ // Crossref titles:
+ // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
+ // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
outputBuffer =>
it should "return a 4-element list" in {
- outputBuffer should have length 3
+ outputBuffer should have length 4
}
+
/*
it should "return the right first entry" in {
val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
- slug shouldBe "title1"
+ slug shouldBe "title 1"
+ slug shouldBe slug0
+ slug shouldBe slug1
sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
}