From 728e50a33cec921c9a624439f2e1c8561a6e12ce Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Sat, 11 Aug 2018 21:03:53 -0700 Subject: It compiles. --- .../main/scala/sandcrawler/CrossrefScorable.scala | 54 ++++++++++++++-------- .../main/scala/sandcrawler/GrobidScorable.scala | 21 ++++----- scalding/src/main/scala/sandcrawler/Scorable.scala | 40 +++++++++++----- .../scala/sandcrawler/CrossrefScorableTest.scala | 26 ++++++----- .../scala/sandcrawler/GrobidScorableTest.scala | 19 ++++---- 5 files changed, 96 insertions(+), 64 deletions(-) (limited to 'scalding/src') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index b2f6537..5113b0c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -18,6 +18,7 @@ import java.util.regex.Pattern import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject import cascading.tuple.Fields import com.twitter.scalding._ @@ -40,33 +41,48 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) - .map{ json : String => - CrossrefScorable.simplifyJson(json) match { - case None => new MapFeatures(Scorable.NoSlug, json) - case Some(map) => new MapFeatures( - Scorable.titleToSlug(map("title").asInstanceOf[String]), - JSONObject(map).toString) + .map{ json : String => + Scorable.jsonToMap(json) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(map) => { + if ((map contains "title") && (map contains "DOI")) { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty) { + new MapFeatures(Scorable.NoSlug, json) + } else { + val title = titles(0) + val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String]) + new MapFeatures( + Scorable.mapToSlug(map2), + JSONObject(map2).toString) + } + } else { + new MapFeatures(Scorable.NoSlug, json) + } + } } } } +} - object CrossrefScorable { - def simplifyJson(json : String) : Option[Map[String, Any]] = { - Scorable.jsonToMap(json) match { - case None => None - case Some(map) => { - if (map contains "title") { - val titles = map("title").asInstanceOf[List[String]] - if (titles.isEmpty) { - None - } else { - Some(Map("title" -> titles(0))) - } - } else { +/* +object CrossrefScorable { + def simplifyJson(json : String) : Option[Map[String, Any]] = { + Scorable.jsonToMap(json) match { + case None => None + case Some(map) => { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty) { None + } else { + Some(Map("title" -> titles(0))) } + } else { + None } } } } } + */ diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 61055f2..de9f51a 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -1,5 +1,6 @@ package sandcrawler +import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields @@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions { .read .fromBytesWritable(new Fields("key", "tei_json")) .toTypedPipe[(String, String)](new Fields("key", "tei_json")) - .map { entry => - val (key : String, json : String) = (entry._1, entry._2) - GrobidScorable.grobidToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) - case None => new MapFeatures(Scorable.NoSlug, json) - } - } + .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } } } @@ -36,14 +31,18 @@ object GrobidScorable { HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL) } - def grobidToSlug(json : String) : Option[String] = { + def jsonToMapFeatures(key : String, json : String) : MapFeatures = { Scorable.jsonToMap(json) match { - case None => None + case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) + val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), + sha1=key) + new MapFeatures( + Scorable.mapToSlug(map2), + JSONObject(map2).toString) } else { - None + MapFeatures(Scorable.NoSlug, json) } } } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 0ec8e46..9c8da69 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -2,6 +2,7 @@ package sandcrawler import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef import com.twitter.scalding._ @@ -36,6 +37,21 @@ object Scorable { slug != NoSlug } + // NOTE: I could go all out and make ScorableMap a type. + // TODO: Require year. Other features will get added here. + def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + } + + def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { + JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString + } + + // TODO: Score on more fields than "title". + def isScorableMap(map : Map[String, Any]) : Boolean = { + map.contains("title") + } + def jsonToMap(json : String) : Option[Map[String, Any]] = { // https://stackoverflow.com/a/32717262/631051 val jsonObject = JSON.parseFull(json) @@ -46,18 +62,17 @@ object Scorable { } } - def titleToSlug(title : String) : String = { - if (title == null || title.isEmpty) { + // Map should have been produced by toScorableMap. + // This guarantees it will have all of the fields needed to compute + // the ultimate score, which are a superset of those needed for a slug. + def mapToSlug(map : Map[String, Any]) : String = { + val unaccented = StringUtilities.removeAccents(getString(map, "title")) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) + if (slug.isEmpty || slug == null) { NoSlug } else { - val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) - if (slug.isEmpty || slug == null) { - NoSlug - } else { - slug - } + slug } } @@ -68,8 +83,9 @@ object Scorable { } } - // Caller is responsible for ensuring that key is in map. - def getString(map : Map[String, String], key : String) : String = { + // Caller is responsible for ensuring that key is a String in map. + // TODO: Add and handle ClassCastException + def getString(map : Map[String, Any], key : String) : String = { assert(map contains key) map(key).asInstanceOf[String] } diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 67a8bfe..1c35d66 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -66,20 +66,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers { val MalformedCrossrefString = CrossrefString.replace("}", "") // Unit tests -/* - "crossrefToSlug()" should "get the right slug for a crossref json string" in { - val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle) - slug should contain ("sometitle") + "simplifyJson()" should "return None for bad JSON" in { + CrossrefScorable.simplifyJson("") shouldBe None + CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None } - it should "return None if given json string without title" in { - val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle) - slug shouldBe None + it should "return None for JSON lacking title" in { + CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None } - it should "return None if given a malformed json string" in { - val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString) - slug shouldBe None + it should "return appropriate result for valid JSON" in { + CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match { + case None => fail("None unexpectedly returned by simplifyJson") + case Some(map) => { + Scorable.isScorableMap(map) shouldBe true + map.size shouldBe 1 + map.keys should contain ("title") + map("title") shouldBe "SomeTitle" + } + } } - */ } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 7777610..5bb955a 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -60,18 +60,15 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests - "grobidToSlug()" should "get the right slug for a grobid json string" in { - val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle) - slug should contain ("dummy example file") + "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { + val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None + result.slug shouldBe Scorable.NoSlug + result.json shouldBe MalformedGrobidString } - it should "return None if given json string without title" in { - val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle) - slug shouldBe None - } - - it should "return None if given a malformed json string" in { - val slug = GrobidScorable.grobidToSlug(MalformedGrobidString) - slug shouldBe None + "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in { + val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None + result.slug shouldBe Scorable.NoSlug + result.json shouldBe GrobidStringWithoutTitle } } -- cgit v1.2.3