diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-11 21:03:53 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-11 21:03:53 -0700 | 
| commit | 728e50a33cec921c9a624439f2e1c8561a6e12ce (patch) | |
| tree | 671548fe0e4bd38badb76453c0a1a90dea5e0ce7 /scalding/src/main | |
| parent | 768e7ef0d127cf55119543be6e656751704ca5b2 (diff) | |
| download | sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.tar.gz sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.zip | |
It compiles.
Diffstat (limited to 'scalding/src/main')
3 files changed, 73 insertions, 42 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index b2f6537..5113b0c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -18,6 +18,7 @@ import java.util.regex.Pattern  import scala.math  import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject  import cascading.tuple.Fields  import com.twitter.scalding._ @@ -40,33 +41,48 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {    def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {      getSource(args).read        .toTypedPipe[String](new Fields("line")) -      .map{ json : String =>  -        CrossrefScorable.simplifyJson(json) match { -          case None => new MapFeatures(Scorable.NoSlug, json) -          case Some(map) => new MapFeatures( -            Scorable.titleToSlug(map("title").asInstanceOf[String]),  -            JSONObject(map).toString) +      .map{ json : String => +        Scorable.jsonToMap(json) match { +          case None => MapFeatures(Scorable.NoSlug, json) +          case Some(map) => { +            if ((map contains "title") && (map contains "DOI")) { +              val titles = map("title").asInstanceOf[List[String]] +              if (titles.isEmpty) { +                new MapFeatures(Scorable.NoSlug, json) +              } else { +                val title = titles(0) +                val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String]) +                new MapFeatures( +                  Scorable.mapToSlug(map2), +                  JSONObject(map2).toString) +              } +            } else { +              new MapFeatures(Scorable.NoSlug, json) +            } +          }          }        }    } +} -  object CrossrefScorable { -    def simplifyJson(json : String) : Option[Map[String, Any]] = { -      Scorable.jsonToMap(json) match { -        case None => None -        case Some(map) => { -          if (map contains "title") { -            val titles = map("title").asInstanceOf[List[String]] -            if (titles.isEmpty) { -              None -            } else { -              Some(Map("title" -> titles(0))) -            } -          } else { +/* +object CrossrefScorable { +  def simplifyJson(json : String) : Option[Map[String, Any]] = { +    Scorable.jsonToMap(json) match { +      case None => None +      case Some(map) => { +        if (map contains "title") { +          val titles = map("title").asInstanceOf[List[String]] +          if (titles.isEmpty) {              None +          } else { +            Some(Map("title" -> titles(0)))            } +        } else { +          None          }        }      }    }  } + */ diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 61055f2..de9f51a 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -1,5 +1,6 @@  package sandcrawler +import scala.util.parsing.json.JSONObject  import cascading.flow.FlowDef  import cascading.pipe.Pipe  import cascading.tuple.Fields @@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {        .read        .fromBytesWritable(new Fields("key", "tei_json"))        .toTypedPipe[(String, String)](new Fields("key", "tei_json")) -      .map { entry => -        val (key : String, json : String) = (entry._1, entry._2) -        GrobidScorable.grobidToSlug(json) match { -          case Some(slug) => new MapFeatures(slug, json) -          case None => new MapFeatures(Scorable.NoSlug, json) -        } -      } +      .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }    }  } @@ -36,14 +31,18 @@ object GrobidScorable {      HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL)    } -  def grobidToSlug(json : String) : Option[String] = { +  def jsonToMapFeatures(key : String, json : String) : MapFeatures = {      Scorable.jsonToMap(json) match { -      case None => None +      case None => MapFeatures(Scorable.NoSlug, json)        case Some(map) => {          if (map contains "title") { -          Some(Scorable.titleToSlug(map("title").asInstanceOf[String])) +          val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), +            sha1=key) +          new MapFeatures( +            Scorable.mapToSlug(map2), +            JSONObject(map2).toString)          } else { -          None +          MapFeatures(Scorable.NoSlug, json)          }        }      } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 0ec8e46..9c8da69 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -2,6 +2,7 @@ package sandcrawler  import scala.math  import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject  import cascading.flow.FlowDef  import com.twitter.scalding._ @@ -36,6 +37,21 @@ object Scorable {      slug != NoSlug    } +  // NOTE: I could go all out and make ScorableMap a type. +  // TODO: Require year. Other features will get added here. +  def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { +   Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) +  } + +  def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { +    JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString +  } + +  // TODO: Score on more fields than "title". +  def isScorableMap(map : Map[String, Any]) : Boolean = { +    map.contains("title") +  } +    def jsonToMap(json : String) : Option[Map[String, Any]] = {      // https://stackoverflow.com/a/32717262/631051      val jsonObject = JSON.parseFull(json) @@ -46,18 +62,17 @@ object Scorable {      }    } -  def titleToSlug(title : String) : String = { -    if (title == null || title.isEmpty) { +  // Map should have been produced by toScorableMap. +  // This guarantees it will have all of the fields needed to compute +  // the ultimate score, which are a superset of those needed for a slug. +  def mapToSlug(map : Map[String, Any]) : String = { +    val unaccented = StringUtilities.removeAccents(getString(map, "title")) +    // Remove punctuation after splitting on colon. +    val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) +    if (slug.isEmpty || slug == null) {        NoSlug      } else { -      val unaccented = StringUtilities.removeAccents(title) -      // Remove punctuation after splitting on colon. -      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())) -      if (slug.isEmpty || slug == null) { -        NoSlug -      } else { -        slug -      } +      slug      }    } @@ -68,8 +83,9 @@ object Scorable {      }    } -  // Caller is responsible for ensuring that key is in map. -  def getString(map : Map[String, String], key : String) : String = { +  // Caller is responsible for ensuring that key is a String in map. +  // TODO: Add and handle ClassCastException +  def getString(map : Map[String, Any], key : String) : String = {      assert(map contains key)      map(key).asInstanceOf[String]    } | 
