Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r--  scalding/src/main/scala/sandcrawler/BibjsonScorable.scala             |  50
-rw-r--r--  scalding/src/main/scala/sandcrawler/CrossrefScorable.scala            | 153
-rw-r--r--  scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala             |  36
-rw-r--r--  scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala |  38
-rw-r--r--  scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala     |  34
-rw-r--r--  scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala            |  41
-rw-r--r--  scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala           |  67
-rw-r--r--  scalding/src/main/scala/sandcrawler/FatcatScorable.scala              | 146
-rw-r--r--  scalding/src/main/scala/sandcrawler/GrobidScorable.scala              |  83
-rw-r--r--  scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala       |  59
-rw-r--r--  scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala         |  43
-rw-r--r--  scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala   |  52
-rw-r--r--  scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala            |  37
-rw-r--r--  scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala            |   2
-rw-r--r--  scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala     |  32
-rw-r--r--  scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala         |  10
-rw-r--r--  scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala           |  30
-rw-r--r--  scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala        |  67
-rw-r--r--  scalding/src/main/scala/sandcrawler/Scorable.scala                    |  96
-rw-r--r--  scalding/src/main/scala/sandcrawler/ScorableFeatures.scala            |  63
-rw-r--r--  scalding/src/main/scala/sandcrawler/ScoreInsertable.scala             |  86
-rw-r--r--  scalding/src/main/scala/sandcrawler/ScoreJob.scala                    |  48
-rw-r--r--  scalding/src/main/scala/sandcrawler/StringUtilities.scala             |  76
23 files changed, 1343 insertions(+), 6 deletions(-)
diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
new file mode 100644
index 0000000..abf9220
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -0,0 +1,50 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+class BibjsonScorable extends Scorable {
+
+ def getSource(args : Args) : Source = {
+ TextLine(args("bibjson-input"))
+ }
+
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+ getSource(args).read
+ .toTypedPipe[String](new Fields("line"))
+ .map { BibjsonScorable.bibjsonToMapFeatures(_) }
+ }
+}
+
+object BibjsonScorable {
+ def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ val title = Scorable.getString(map, "title")
+ val doi = Scorable.getString(map, "doi")
+ val sha1 = Scorable.getString(map, "sha")
+ // TODO: year, authors (if available)
+ if (title == null || title.isEmpty) {
+ None
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
+ }
+ } else {
+ None
+ }
+ }
+ }
+ }
+}
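
For reference, a minimal sketch (not part of this commit) of what the map phase produces, assuming the sandcrawler classes above are on the classpath; the bibjson line is invented:

```scala
import sandcrawler.BibjsonScorable

object BibjsonScorableExample extends App {
  // Invented minimal record; real bibjson lines carry more fields.
  val line = """{"title": "A Dummy Example Paper", "doi": "10.123/abc", "sha": "da39a3ee5e6b4b0d3255bfef95601890afd80709"}"""
  // Prints Some(MapFeatures("adummyexamplepaper", ...)) when the title
  // slugifies cleanly; None for missing/empty titles or denylisted/short slugs.
  println(BibjsonScorable.bibjsonToMapFeatures(line))
}
```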
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
new file mode 100644
index 0000000..bb6413f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -0,0 +1,153 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+class CrossrefScorable extends Scorable with HBasePipeConversions {
+ // TODO: Generalize args so there can be multiple Crossref pipes in one job.
+ def getSource(args : Args) : Source = {
+ TextLine(args("crossref-input"))
+ }
+
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+ getSource(args).read
+ .toTypedPipe[String](new Fields("line"))
+ .filter { CrossrefScorable.keepRecord(_) }
+ .map { CrossrefScorable.jsonToMapFeatures(_) }
+ }
+}
+
+object CrossrefScorable {
+
+ val ContentTypeWhitelist: Set[String] = Set(
+ "book",
+ "book-chapter",
+ "dataset",
+ "dissertation",
+ "journal-article",
+ "letter",
+ "monograph",
+ "posted-content",
+ "pre-print",
+ "proceedings-article",
+ "report",
+ "working-paper")
+
+ def keepRecord(json : String) : Boolean = {
+ Scorable.jsonToMap(json) match {
+ case None => false
+ case Some(map) => {
+ mapToTitle(map) match {
+ case None => false
+ case Some(title) => title.length <= Scorable.MaxTitleLength
+ }
+ }
+ }
+ }
+
+ // Returns None if title is null, empty, or too long.
+ def mapToTitle(map : Map[String, Any]) : Option[String] = {
+ def getTitle : Option[String] = {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+        if (titles == null || titles.isEmpty) None else Some(titles(0))
+ } else {
+ None
+ }
+ }
+
+ def getSubtitle : Option[String] = {
+ if (map contains "subtitle") {
+ val subtitles = map("subtitle").asInstanceOf[List[String]]
+        if (subtitles == null || subtitles.isEmpty) {
+ None
+ } else {
+ val sub = subtitles(0)
+ if (sub == null || sub.isEmpty) {
+ None
+ } else {
+ Some(sub)
+ }
+ }
+ } else {
+ None
+ }
+ }
+
+ getTitle match {
+ case None => None
+ case Some(baseTitle) => {
+ if (baseTitle == null) {
+ None
+ } else {
+ getSubtitle match {
+ case None => Some(baseTitle)
+ case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle)))
+ }
+ }
+ }
+ }
+ }
+
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+ if (map contains "author") {
+ val objArray = map("author").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]])
+ // TODO(bnewbold): combine given and family names?
+ objArray
+ .filter(e => e contains "family")
+ .map(e => e.get("family").get.asInstanceOf[String])
+ } else {
+ List()
+ }
+ }
+
+ def mapToYear(map : Map[String, Any]) : Option[Int] = {
+ map.get("created") match {
+ case None => None
+ case Some(created) => {
+ Some(created.asInstanceOf[Map[String,Any]]
+ .get("date-parts")
+ .get
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[Double]
+ .toInt)
+ }
+ }
+ }
+
+ def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+ def makeMapFeatures(title : String, doi : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = {
+      if (doi == null || doi.isEmpty || authors.isEmpty || !(ContentTypeWhitelist contains contentType)) {
+ None
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
+ }
+ }
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => None
+ case Some(title) => makeMapFeatures(
+ title=title,
+ doi=Scorable.getString(map, "DOI"),
+ authors=mapToAuthorList(map),
+ year=mapToYear(map).getOrElse(0),
+ contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE"))
+ }
+ }
+ }
+}
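
A REPL sketch of the field extraction above, on an invented Crossref record; note that scala.util.parsing.json parses every JSON number as a Double, which is why mapToYear casts through Double:

```scala
import sandcrawler.{CrossrefScorable, Scorable}

val record = """{"title": ["Some Article"], "subtitle": ["A Subtitle"],
                 "DOI": "10.123/ABC", "type": "journal-article",
                 "author": [{"family": "Doe", "given": "Jane"}],
                 "created": {"date-parts": [[2002, 3, 14]]}}"""
val map = Scorable.jsonToMap(record).get
CrossrefScorable.mapToTitle(map)      // Some("Some Article:A Subtitle")
CrossrefScorable.mapToAuthorList(map) // List("Doe")
CrossrefScorable.mapToYear(map)       // Some(2002)
```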
diff --git a/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala b/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala
new file mode 100644
index 0000000..b3734f0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala
@@ -0,0 +1,36 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps all the info needed to insert a file entity in Fatcat. Useful for
+// joining.
+class DumpFileMetaJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val metaPipe : TypedPipe[(String, String, String, Long)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("file:cdx", "file:mime", "file:size"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size"))
+ .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null }
+ .map { case (key, cdx, mime, size) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toString(cdx.copyBytes()),
+ Bytes.toString(mime.copyBytes()),
+ Bytes.toLong(size.copyBytes()))
+ };
+
+ metaPipe.write(TypedTsv[(String,String,String,Long)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala
new file mode 100644
index 0000000..ee2b7c2
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala
@@ -0,0 +1,38 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps the SHA1 key and grobid0:metadata columns, plus file metadata needed
+// to insert into fatcat. Used, eg, as part of the long-tail Mellon pipeline.
+class DumpGrobidMetaInsertableJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val metaPipe : TypedPipe[(String, String, String, Long, String)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("file:cdx", "file:mime", "file:size", "grobid0:metadata"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size", "metadata"))
+ .filter { case (_, cdx, mime, size, metadata) => cdx != null && mime != null && size != null && metadata != null }
+ .map { case (key, cdx, mime, size, metadata) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toString(cdx.copyBytes()),
+ Bytes.toString(mime.copyBytes()),
+ Bytes.toLong(size.copyBytes()),
+ Bytes.toString(metadata.copyBytes())
+ )
+ };
+
+ metaPipe.write(TypedTsv[(String,String,String,Long,String)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala
new file mode 100644
index 0000000..42b3464
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala
@@ -0,0 +1,34 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps status code for each GROBID-processed file. Good for crawl/corpus
+// analytics, if we consider GROBID status a rough "is this a paper" metric.
+class DumpGrobidStatusCodeJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val metaPipe : TypedPipe[(String, Long)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("grobid0:status_code"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "status_code"))
+ .filter { case (_, status_code) => status_code != null }
+ .map { case (key, status_code) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toLong(status_code.copyBytes()))
+ };
+
+ metaPipe.write(TypedTsv[(String,Long)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
new file mode 100644
index 0000000..953610d
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
@@ -0,0 +1,41 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+import scala.util.parsing.json.JSONObject
+
+// Dumps the SHA1 key and grobid0:tei_xml columns as TSV/JSON (two TSV
+// columns: the first is the key, the second is JSON). Used for partner delivery/sharing.
+class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val metaPipe : TypedPipe[(String, String)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("file:cdx", "grobid0:tei_xml"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml"))
+ .filter { case (_, cdx, tei_xml) => cdx != null && tei_xml != null }
+ .map { case (key, cdx, tei_xml) =>
+ (Bytes.toString(key.copyBytes()),
+ JSONObject(
+ Map(
+ "pdf_hash" -> Bytes.toString(key.copyBytes()),
+ "cdx_metadata" -> Bytes.toString(cdx.copyBytes()),
+ "tei_xml" -> Bytes.toString(tei_xml.copyBytes())
+ )).toString
+ )
+ };
+
+ metaPipe.write(TypedTsv[(String,String)](args("output")))
+
+}
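
The second TSV column is a JSON object shaped roughly like this (values invented):

```scala
import scala.util.parsing.json.JSONObject

JSONObject(Map(
  "pdf_hash" -> "sha1:ABCDEF...",
  "cdx_metadata" -> "{...}",
  "tei_xml" -> "<TEI>...</TEI>"
)).toString
// => {"pdf_hash" : "sha1:ABCDEF...", "cdx_metadata" : "{...}", "tei_xml" : "<TEI>...</TEI>"}
```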
diff --git a/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
new file mode 100644
index 0000000..7fd3ce0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
@@ -0,0 +1,67 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Filters for HBase rows which have not had GROBID run on them, but do have
+// full CDX metadata, and dumps to a TSV for later extraction by the
+// "extraction-ungrobided" job.
+//
+// Does the same horrible join thing that MissingColumnDumpJob does.
+class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val output = args("output")
+
+ val allKeys : TypedPipe[(String,String,String,String)] = DumpUnGrobidedJob.getHBaseKeySource(
+ args("hbase-table"),
+ args("zookeeper-hosts"))
+ .read
+ .fromBytesWritable('key, 'c, 'mime, 'cdx)
+ .toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx)
+
+ val existingKeys : TypedPipe[(String,Boolean)] = DumpUnGrobidedJob.getHBaseColSource(
+ args("hbase-table"),
+ args("zookeeper-hosts"))
+ .read
+ .fromBytesWritable('key)
+ .toTypedPipe[String]('key)
+ .map{ key => (key, true) }
+
+ val missingKeys : TypedPipe[(String,String,String,String)] = allKeys
+ .groupBy(_._1)
+ .leftJoin(existingKeys.groupBy(_._1))
+ .toTypedPipe
+ .collect { case (key, ((_, c, mime, cdx), None)) => (key, c, mime, cdx) }
+
+ missingKeys
+ .write(TypedTsv[(String,String,String,String)](output))
+
+}
+
+object DumpUnGrobidedJob {
+
+  // eg, "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+ def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List("grobid0:status_code"),
+ SourceMode.SCAN_ALL)
+ }
+
+ def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List("f:c", "file:mime", "file:cdx"),
+ SourceMode.SCAN_ALL)
+ }
+}
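
The "horrible join" is a left join used as an anti-join: rows whose key finds no match on the right survive. A minimal in-memory sketch of the same pattern (invented keys, runs inside any Scalding job):

```scala
import com.twitter.scalding.typed.TypedPipe

val all  = TypedPipe.from(List(("k1", "c1", "pdf", "cdx1"), ("k2", "c2", "pdf", "cdx2")))
val done = TypedPipe.from(List("k1")).map { key => (key, true) }

val missing = all
  .groupBy(_._1)
  .leftJoin(done.groupBy(_._1))
  .toTypedPipe
  // None on the right means no grobid0:status_code row exists for this key
  .collect { case (key, ((_, c, mime, cdx), None)) => (key, c, mime, cdx) }
// missing contains only the "k2" row
```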
diff --git a/scalding/src/main/scala/sandcrawler/FatcatScorable.scala b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
new file mode 100644
index 0000000..2090e84
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
@@ -0,0 +1,146 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+
+class FatcatScorableRight extends Scorable {
+
+ def getSource(args : Args) : Source = {
+ TextLine(args("fatcat-release-input-right"))
+ }
+
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+ getSource(args).read
+ .toTypedPipe[String](new Fields("line"))
+ .filter { FatcatScorable.keepRecord(_) }
+ .map { FatcatScorable.jsonToMapFeatures(_) }
+ }
+}
+
+class FatcatScorable extends Scorable with HBasePipeConversions {
+
+ def getSource(args : Args) : Source = {
+ TextLine(args("fatcat-release-input"))
+ }
+
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+ getSource(args).read
+ .toTypedPipe[String](new Fields("line"))
+ .filter { FatcatScorable.keepRecord(_) }
+ .map { FatcatScorable.jsonToMapFeatures(_) }
+ }
+}
+
+object FatcatScorable {
+
+  // Note: removed ReleaseType filtering
+
+ def keepRecord(json : String) : Boolean = {
+ Scorable.jsonToMap(json) match {
+ case None => false
+ case Some(map) => {
+ mapToTitle(map) match {
+ case None => false
+ case Some(title) => title.length <= Scorable.MaxTitleLength
+ }
+ }
+ }
+ }
+
+ // Returns None if title is null, empty, or too long.
+ def mapToTitle(map : Map[String, Any]) : Option[String] = {
+ def getTitle : Option[String] = {
+ if (map contains "title") {
+ val title = map("title").asInstanceOf[String]
+ if (title == null || title.isEmpty) None else Some(title)
+ } else {
+ None
+ }
+ }
+
+ def getSubtitle : Option[String] = {
+ if (map contains "subtitle") {
+ val subtitle = map("subtitle").asInstanceOf[String]
+ if (subtitle == null || subtitle.isEmpty) {
+ None
+ } else {
+ Some(subtitle)
+ }
+ } else {
+ None
+ }
+ }
+
+ getTitle match {
+ case None => None
+ case Some(baseTitle) => {
+ if (baseTitle == null) {
+ None
+ } else {
+ getSubtitle match {
+ case None => Some(baseTitle)
+ case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle)))
+ }
+ }
+ }
+ }
+ }
+
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+ if (map contains "contribs") {
+ val objArray = map("contribs").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]])
+ // TODO(bnewbold): better name stuff... contrib.surname, creator.surname,
+ // or raw_name split to last
+ objArray
+ .filter(e => e contains "raw_name")
+ .map(e => e.get("raw_name").get.asInstanceOf[String])
+ } else {
+ List()
+ }
+ }
+
+ def mapToYear(map : Map[String, Any]) : Option[Int] = {
+ map.get("release_year") match {
+ case None => None
+ case Some(year) => {
+ Some(year.asInstanceOf[Double].toInt)
+ }
+ }
+ }
+
+ def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+ def makeMapFeatures(title : String, doi : String, fatcat_release: String, fatcat_work : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = {
+ // NOTE: not doing any filtering here!
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi, fatcat_release=fatcat_release, fatcat_work=fatcat_work, year=year)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
+ }
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => None
+ case Some(title) => makeMapFeatures(
+ title=title,
+ // TODO: doi=Scorable.getString(map, "doi"),
+ doi=null,
+ fatcat_release=Scorable.getString(map, "ident"),
+ fatcat_work=Scorable.getString(map, "work_id"),
+ authors=mapToAuthorList(map),
+ year=mapToYear(map).getOrElse(0),
+ contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE"))
+ }
+ }
+ }
+}
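
One subtlety worth noting: scala.util.parsing.json.JSON.parseFull represents every JSON number as a Double, which is why mapToYear casts through Double rather than Int. A one-line REPL sketch (invented record):

```scala
import sandcrawler.{FatcatScorable, Scorable}

val map = Scorable.jsonToMap("""{"title": "T", "release_year": 2010}""").get
FatcatScorable.mapToYear(map) // Some(2010); the year arrives as 2010.0
```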
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
new file mode 100644
index 0000000..f4ed129
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -0,0 +1,83 @@
+package sandcrawler
+
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+class GrobidScorable extends Scorable with HBasePipeConversions {
+ val StatusOK = 200
+
+ def getSource(args : Args) : Source = {
+ // TODO: Generalize args so there can be multiple grobid pipes in one job
+ GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+ }
+
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+ getSource(args)
+ .read
+ // Can't just "fromBytesWritable" because we have multiple types
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code"))
+ .filter { case (_, metadata, status_code) => metadata != null && status_code != null }
+ .map { case (key, metadata, status_code) =>
+ (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes()))
+ }
+ .collect { case (key, json, StatusOK) => (key, json) }
+ .filter { case (key, json) => GrobidScorable.keepRecord(json) }
+ .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
+ }
+}
+
+object GrobidScorable {
+ def keepRecord(json : String) : Boolean = {
+ Scorable.jsonToMap(json) match {
+ case None => false
+ case Some(map) => {
+ if (map contains "title") {
+ val title = Scorable.getString(map, "title")
+ title != null && title.length <= Scorable.MaxTitleLength
+ } else {
+ false
+ }
+ }
+ }
+ }
+
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+ if (map contains "authors") {
+ val objArray = map("authors").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]])
+ objArray
+ .filter(e => e contains "name")
+ .map(e => e.get("name").get.asInstanceOf[String])
+ } else {
+ List()
+ }
+ }
+
+ def getHBaseSource(table : String, host : String) : HBaseSource = {
+ HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)
+ }
+
+ def jsonToMapFeatures(key : String, json : String) : Option[MapFeatures] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ val authors: List[String] = mapToAuthorList(map)
+ val title = Scorable.getString(map, "title")
+ ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures
+ } else {
+ None
+ }
+ }
+ }
+ }
+}
+
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
new file mode 100644
index 0000000..3146a6c
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -0,0 +1,59 @@
+
+package sandcrawler
+
+import cascading.flow.FlowDef
+import cascading.pipe.Pipe
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
+
+ val grobidHbaseRows = Stat("hbase-rows-scanned", "hbase-grobid-dump")
+ val filteredGrobidRows = Stat("grobid-rows-filtered", "hbase-grobid-dump")
+ val parsedGrobidRows = Stat("grobid-rows-parsed", "hbase-grobid-dump")
+ val validGrobidRows = Stat("grobid-rows-valid-slug", "hbase-grobid-dump")
+
+ val pipe = GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+ .read
+ // Can't just "fromBytesWritable" because we have multiple types?
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code"))
+ .filter { case (_, metadata, status_code) =>
+ grobidHbaseRows.inc
+ metadata != null && status_code != null
+ }
+ .map { case (key, metadata, status_code) =>
+ (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes()))
+ }
+ // TODO: Should I combine next two stages for efficiency?
+ .collect { case (key, json, 200) =>
+ filteredGrobidRows.inc
+ (key, json)
+ }
+ .map { entry : (String, String) =>
+ parsedGrobidRows.inc
+ GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
+ }
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => {
+ validGrobidRows.inc
+ entry.get
+ }}
+ .groupBy { case MapFeatures(slug, json) => slug }
+ .map { tuple =>
+ val (slug : String, features : MapFeatures) = tuple
+ (slug, ReduceFeatures(features.json))
+ }
+
+ pipe
+ .map { case (slug, features) =>
+ (slug, features.json)
+ }
+ .write(TypedTsv[(String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala
new file mode 100644
index 0000000..46d2038
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala
@@ -0,0 +1,43 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class GroupFatcatWorksJob(args: Args) extends JobBase(args) {
+
+ val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler")
+ val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+ val fatcatScorable : Scorable = new FatcatScorable()
+ val fatcatPipe : TypedPipe[(String, ReduceFeatures)] = fatcatScorable
+ .getInputPipe(args)
+ .map { r =>
+ fatcatRowCount.inc
+ r
+ }
+
+ val joinedPipe = fatcatPipe
+ .addTrap(TypedTsv(args("output") + ".trapped"))
+ .join(fatcatPipe)
+
+ // TypedTsv doesn't work over case classes.
+ joinedPipe
+ // filter out trivial self-matches (releases are identical)
+ .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+ Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight)
+ }
+ .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+ joinedRowCount.inc
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight),
+ fatcatFeaturesLeft.json,
+ fatcatFeaturesRight.json)
+ }
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
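
Self-joining fatcatPipe with itself pairs every release on a slug with every other release on that slug, including itself and each pair in both orders; Scorable.selfMatchable, with its work1 > work2 ordering, is what discards the identical and mirrored pairs. In miniature (invented values):

```scala
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.TDsl._

val keyed = TypedPipe.from(List(("slug", "w1"), ("slug", "w2"))).group
keyed.join(keyed).toTypedPipe
// => (slug,(w1,w1)), (slug,(w1,w2)), (slug,(w2,w1)), (slug,(w2,w2));
//    only one of the two cross pairs survives selfMatchable's ordering check
```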
diff --git a/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala
new file mode 100644
index 0000000..ea5e26b
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala
@@ -0,0 +1,52 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class GroupFatcatWorksSubsetJob(args: Args) extends JobBase(args) {
+
+ val fatcatLhsRowCount = Stat("fatcat-rows-filtered-left", "sandcrawler")
+ val fatcatRhsRowCount = Stat("fatcat-rows-filtered-right", "sandcrawler")
+ val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+ val fatcatScorableLhs : Scorable = new FatcatScorable()
+ val fatcatPipeLhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableLhs
+ .getInputPipe(args)
+ .map { r =>
+ fatcatLhsRowCount.inc
+ r
+ }
+
+ val fatcatScorableRhs : Scorable = new FatcatScorableRight()
+ val fatcatPipeRhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableRhs
+ .getInputPipe(args)
+ .map { r =>
+ fatcatRhsRowCount.inc
+ r
+ }
+
+ val joinedPipe = fatcatPipeLhs
+ .addTrap(TypedTsv(args("output") + ".trapped"))
+ .join(fatcatPipeRhs)
+
+ // TypedTsv doesn't work over case classes.
+ joinedPipe
+ // filter out trivial self-matches (releases are identical)
+ .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+ Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight)
+ }
+ .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+ joinedRowCount.inc
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight),
+ fatcatFeaturesLeft.json,
+ fatcatFeaturesRight.json)
+ }
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala
new file mode 100644
index 0000000..20cc7a1
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala
@@ -0,0 +1,37 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+class HBaseColCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val output = args("output")
+
+ HBaseColCountJob.getHBaseSource(
+ args("hbase-table"),
+ args("zookeeper-hosts"),
+ args("column"))
+ .read
+ .debug
+ .groupAll { _.size('count) }
+ .write(Tsv(output))
+}
+
+object HBaseColCountJob {
+
+  // eg, "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+ def getHBaseSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List(col),
+ SourceMode.SCAN_ALL)
+ }
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 4c3de33..5c7954a 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -30,7 +30,7 @@ object HBaseRowCountJob {
HBaseBuilder.build(
hbaseTable,
zookeeperHosts,
- List("file:size"),
+ List("f:c"),
SourceMode.SCAN_ALL)
}
}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala
new file mode 100644
index 0000000..4d9880f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala
@@ -0,0 +1,32 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+class HBaseStatusCodeCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val source = HBaseCountJob.getHBaseSource(
+ args("hbase-table"),
+ args("zookeeper-hosts"),
+ "grobid0:status_code")
+
+ val statusPipe : TypedPipe[Long] = source
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code)
+ .map { case (key, raw_code) => Bytes.toLong(raw_code.copyBytes()) }
+
+ statusPipe.groupBy { identity }
+ .size
+ .debug
+ .write(TypedTsv[(Long,Long)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
index fd0b4e2..f79d672 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
@@ -18,15 +18,15 @@ class HBaseStatusCountJob(args: Args) extends JobBase(args) with HBasePipeConver
val source = HBaseCountJob.getHBaseSource(
args("hbase-table"),
args("zookeeper-hosts"),
- "grobid0:status_code")
+    "grobid0:status")

-  val statusPipe : TypedPipe[Long] = source
+ val statusPipe : TypedPipe[String] = source
.read
- .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code)
- .map { case (key, raw_code) => Bytes.toLong(raw_code.copyBytes()) }
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status)
+    .map { case (key, raw_status) => Bytes.toString(raw_status.copyBytes()) }

  statusPipe.groupBy { identity }
.size
.debug
- .write(TypedTsv[(Long,Long)](args("output")))
+ .write(TypedTsv[(String,Long)](args("output")))
}
diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
new file mode 100644
index 0000000..292de75
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -0,0 +1,30 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class MatchBenchmarkJob(args: Args) extends JobBase(args) {
+ // TODO: Instantiate any subclass of Scorable specified in args.
+ val sc1 : Scorable = new BibjsonScorable()
+ val sc2 : Scorable = new BibjsonScorable()
+ val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson")))
+ val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson")))
+ val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
+ val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
+
+ pipe1.join(pipe2)
+ .map { entry =>
+ val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(features1, features2),
+ features1.json,
+ features2.json)
+ }
+    // TypedTsv doesn't work over case classes.
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala b/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala
new file mode 100644
index 0000000..cc3bf23
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala
@@ -0,0 +1,67 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// This nasty, no-good, horrible Job outputs a list of keys ("sha1:A234...")
+// for which the given "column" does not have a value set.
+// It does this using a self-join because SpyGlass's HBase SCAN support seems
+// to be extremely limited.
+class MissingColumnDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ val output = args("output")
+
+ val allKeys : TypedPipe[String] = MissingColumnDumpJob.getHBaseKeySource(
+ args("hbase-table"),
+ args("zookeeper-hosts"))
+ .read
+ .fromBytesWritable('key)
+ .toTypedPipe[String]('key)
+
+ val existingKeys : TypedPipe[(String,Boolean)] = MissingColumnDumpJob.getHBaseColSource(
+ args("hbase-table"),
+ args("zookeeper-hosts"),
+ args("column"))
+ .read
+ .fromBytesWritable('key)
+ .toTypedPipe[String]('key)
+ .map{ key => (key, true) }
+
+ val missingKeys : TypedPipe[String] = allKeys
+ .groupBy( identity )
+ .leftJoin(existingKeys.groupBy(_._1))
+ .toTypedPipe
+ .collect { case (key, (_, None)) => key }
+
+ missingKeys
+ .write(TypedTsv[String](output))
+
+}
+
+object MissingColumnDumpJob {
+
+  // eg, "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+ def getHBaseColSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List(col),
+ SourceMode.SCAN_ALL)
+ }
+
+ def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List("f:c"),
+ SourceMode.SCAN_ALL)
+ }
+}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
new file mode 100644
index 0000000..d9c38e8
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -0,0 +1,96 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+case class MapFeatures(slug : String, json : String)
+case class ReduceFeatures(json : String)
+case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String)
+
+abstract class Scorable {
+ def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] = {
+ val validFeatures : TypedPipe[MapFeatures] = getFeaturesPipe(args)
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => entry.get }
+
+ validFeatures
+ .groupBy { case MapFeatures(slug, json) => slug }
+ .map { tuple =>
+ val (slug : String, features : MapFeatures) = tuple
+ (slug, ReduceFeatures(features.json))
+ }
+ }
+
+ // abstract methods
+ def getSource(args : Args) : Source
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]]
+}
+
+object Scorable {
+ val MaxTitleLength = 1023
+
+ def jsonToMap(json : String) : Option[Map[String, Any]] = {
+ // https://stackoverflow.com/a/32717262/631051
+ val jsonObject = JSON.parseFull(json)
+ if (jsonObject == None) {
+ None
+ } else {
+ Some(jsonObject.get.asInstanceOf[Map[String, Any]])
+ }
+ }
+
+ def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
+ optionalMap match {
+ case None => None
+ case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None
+ }
+ }
+
+ // Caller is responsible for ensuring that key is a String in map.
+ // TODO: Add and handle ClassCastException
+ def getString(map : Map[String, Any], key : String) : String = {
+ assert(map contains key)
+ map(key).asInstanceOf[String]
+ }
+
+ val MaxScore = 1000
+
+ def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
+
+ (
+ getStringOption(json1, "fatcat_release") != None &&
+ getStringOption(json2, "fatcat_release") != None &&
+ getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") &&
+ (getStringOption(json1, "fatcat_work") match {
+ case None => false
+ case Some(work1) => getStringOption(json2, "fatcat_work") match {
+ case None => false
+ // this last check ensures we don't double-match
+ case Some(work2) => work1 > work2
+ }
+ })
+ )
+ }
+
+ def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
+ getStringOption(json1, "title") match {
+ case None => 0
+ case Some(title1) => {
+ getStringOption(json2, "title") match {
+ case None => 0
+ case Some(title2) =>
+ (StringUtilities.similarity(title1.toLowerCase, title2.toLowerCase) * MaxScore).toInt
+ }
+ }
+ }
+ }
+}
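
A REPL sketch of the two scoring helpers above, on invented feature JSON:

```scala
import sandcrawler.{ReduceFeatures, Scorable}

val a = ReduceFeatures("""{"title": "Some Paper", "fatcat_release": "r1", "fatcat_work": "w2"}""")
val b = ReduceFeatures("""{"title": "Some Paper!", "fatcat_release": "r2", "fatcat_work": "w1"}""")

// Distinct releases, and "w2" > "w1" satisfies the double-match guard:
Scorable.selfMatchable(a, b)     // true
// Titles agree once lowercased and stripped of punctuation:
Scorable.computeSimilarity(a, b) // 1000 (MaxScore)
```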
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
new file mode 100644
index 0000000..93cd78d
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -0,0 +1,63 @@
+package sandcrawler
+
+import java.io.InputStream
+
+import scala.io.Source
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+object ScorableFeatures {
+ // TODO: Add exception handling.
+ val fileStream : InputStream = getClass.getResourceAsStream("/slug-denylist.txt")
+ val SlugDenylist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
+ fileStream.close
+ val MinSlugLength = 8
+
+ // Static factory method
+ def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = {
+ new ScorableFeatures(
+ title=if (title == null) "" else title,
+ authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
+ year=year,
+ doi=if (doi == null) "" else doi,
+ fatcat_release=if (fatcat_release == null) "" else fatcat_release,
+ fatcat_work=if (fatcat_work == null) "" else fatcat_work,
+ sha1=if (sha1 == null) "" else sha1)
+ }
+}
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures). Create with above static factory method.
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") {
+
+ def toMap() : Map[String, Any] =
+ Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1)
+
+ override def toString() : String = {
+ JSONObject(toMap).toString
+ }
+
+ def toSlug() : Option[String] = {
+ if (title == null) {
+ None
+ } else {
+ val unaccented = StringUtilities.removeAccents(title)
+ // Remove punctuation
+ val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
+      if (slug == null
+          || slug.isEmpty
+ || (ScorableFeatures.SlugDenylist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) {
+ None
+ } else {
+ Some(slug)
+ }
+ }
+ }
+
+ def toMapFeatures : Option[MapFeatures] =
+ toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, toString))
+ }
+}
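
Slug behavior in brief (REPL sketch): accents and punctuation drop out, whitespace collapses, and short or denylisted results are rejected:

```scala
import sandcrawler.ScorableFeatures

ScorableFeatures.create(title = "Études, Vol. 2: État de l'art").toSlug
// => Some("etudesvol2etatdelart")
ScorableFeatures.create(title = "short").toSlug
// => None: 5 characters is below MinSlugLength (8)
```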
diff --git a/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala b/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala
new file mode 100644
index 0000000..58007fa
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala
@@ -0,0 +1,86 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+class ScoreInsertableJob(args: Args) extends JobBase(args) {
+
+ val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler")
+ val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler")
+ val cdxRowCount = Stat("cdx-rows", "sandcrawler")
+ val scoredRowCount = Stat("scored-rows", "sandcrawler")
+ val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+ val grobidScorable : Scorable = new GrobidScorable()
+ val crossrefScorable : Scorable = new CrossrefScorable()
+
+ val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable
+ .getInputPipe(args)
+ .map { r =>
+ grobidRowCount.inc
+ r
+ }
+ val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable
+ .getInputPipe(args)
+ .map { r =>
+ crossrefRowCount.inc
+ r
+ }
+ val cdxPipe : TypedPipe[(String, String, String, Long)] = ScoreInsertableJob.getHBaseCdxSource(args("hbase-table"), args("zookeeper-hosts"))
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size"))
+ .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null }
+ .map { case (key, cdx, mime, size) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toString(cdx.copyBytes()),
+ Bytes.toString(mime.copyBytes()),
+ Bytes.toLong(size.copyBytes()))
+ }
+ .map { r =>
+ cdxRowCount.inc
+ r
+ }
+
+ val scoredPipe = grobidPipe
+ .addTrap(TypedTsv(args("output") + ".trapped"))
+ .join(crossrefPipe)
+ .map { case (slug, (grobidFeatures, crossrefFeatures)) =>
+ scoredRowCount.inc
+ //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry
+ // Not ever Empty, I promise
+ val key = Scorable.getStringOption(Scorable.jsonToMap(grobidFeatures.json), "sha1").orNull
+ (key, new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(grobidFeatures, crossrefFeatures),
+ grobidFeatures.json,
+ crossrefFeatures.json))
+ }
+ .map { case (key, entry) => (key, entry.slug, entry.score, entry.json1, entry.json2) }
+ .groupBy { case (key, _, _, _, _) => key }
+
+ // TypedTsv doesn't work over case classes.
+ val joinedPipe = scoredPipe
+ .join(cdxPipe.groupBy { case (key, _, _, _) => key })
+ .map { case (key, ((_, slug, score, left, right), (_, cdx, mime, size))) => (key, slug, score, left, right, cdx, mime, size) }
+ .write(TypedTsv[(String, String, Int, String, String, String, String, Long)](args("output")))
+}
+
+object ScoreInsertableJob {
+
+  // eg, "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+ def getHBaseCdxSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbaseTable,
+ zookeeperHosts,
+ List("file:cdx", "file:mime", "file:size"),
+ SourceMode.SCAN_ALL)
+ }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
new file mode 100644
index 0000000..ccb9b76
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -0,0 +1,48 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class ScoreJob(args: Args) extends JobBase(args) {
+
+ val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler")
+ val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler")
+ val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+ val grobidScorable : Scorable = new GrobidScorable()
+ val crossrefScorable : Scorable = new CrossrefScorable()
+ val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable
+ .getInputPipe(args)
+ .map { r =>
+ grobidRowCount.inc
+ r
+ }
+ val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable
+ .getInputPipe(args)
+ .map { r =>
+ crossrefRowCount.inc
+ r
+ }
+
+ val joinedPipe = grobidPipe
+ .addTrap(TypedTsv(args("output") + ".trapped"))
+ .join(crossrefPipe)
+
+ // TypedTsv doesn't work over case classes.
+ joinedPipe
+ .map { case (slug, (grobidFeatures, crossrefFeatures)) =>
+ joinedRowCount.inc
+ //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(grobidFeatures, crossrefFeatures),
+ grobidFeatures.json,
+ crossrefFeatures.json)
+ }
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
new file mode 100644
index 0000000..9150ced
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -0,0 +1,76 @@
+package sandcrawler
+
+import java.text.Normalizer
+import java.util.regex.Pattern
+
+object StringUtilities {
+ // bnewbold: I propose that we:
+ // 1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+ // 2. strip accents
+ // 3. "lower-case" (unicode-aware)
+ // 4. do any final custom/manual mappings
+ //
+ // We should check (test) that null bytes are handled, in addition to other
+ // more obvious characters
+
+ // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
+ def removeAccents(s : String) : String = {
+ val replacements = Map(
+ '\u0141' -> 'L',
+ '\u0142' -> 'l', // Letter ell
+ '\u00d8' -> 'O',
+ '\u00f8' -> 'o'
+ )
+ val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD))
+ for (i <- 0 to sb.length - 1) {
+ for (key <- replacements.keys) {
+ if (sb(i) == key) {
+ sb.deleteCharAt(i);
+ sb.insert(i, replacements(key))
+ }
+ }
+ }
+ val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
+ pattern.matcher(sb).replaceAll("")
+ }
+
+ // Source: https://stackoverflow.com/a/30076541/631051
+ def removePunctuation(s: String) : String = {
+ s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」¿–±§ʿ]""", "")
+ }
+
+ // Adapted from: https://stackoverflow.com/a/16018452/631051
+ def similarity(s1a : String, s2a : String) : Double = {
+ val (s1, s2) = (removeAccents(removePunctuation(s1a)),
+ removeAccents(removePunctuation(s2a)))
+ val longer : String = if (s1.length > s2.length) s1 else s2
+ val shorter : String = if (s1.length > s2.length) s2 else s1
+ if (longer.length == 0) {
+ // Both strings are empty.
+ 1
+ } else {
+ (longer.length - stringDistance(longer, shorter)) / longer.length.toDouble
+ }
+ }
+
+ // Source: https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+ def stringDistance(s1: String, s2: String): Int = {
+ val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
+ def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c)
+ def sd(s1: List[Char], s2: List[Char]): Int = {
+ if (!memo.contains((s1, s2))) {
+ memo((s1,s2)) = (s1, s2) match {
+ case (_, Nil) => s1.length
+ case (Nil, _) => s2.length
+ case (c1::t1, c2::t2) =>
+ min( sd(t1,s2) + 1, sd(s1,t2) + 1,
+ sd(t1,t2) + (if (c1==c2) 0 else 1) )
+ }
+ }
+ memo((s1,s2))
+ }
+
+ sd( s1.toList, s2.toList )
+ }
+}
+
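And the string helpers on their own (REPL sketch; stringDistance is plain Levenshtein distance):

```scala
import sandcrawler.StringUtilities

StringUtilities.removeAccents("Dvořák")                    // "Dvorak"
StringUtilities.stringDistance("kitten", "sitting")        // 3
StringUtilities.similarity("hello, world!", "hello world") // 1.0
```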