From e99700cb521d82a750497d58bcb04d8cf1abcd80 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 19 Aug 2018 19:19:37 -0700
Subject: bibjson scorable class (no tests)

---
 .../main/scala/sandcrawler/BibjsonScorable.scala | 50 ++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 scalding/src/main/scala/sandcrawler/BibjsonScorable.scala

(limited to 'scalding')

diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
new file mode 100644
index 0000000..7221a66
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -0,0 +1,50 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+// XXX: import parallelai.spyglass.hbase.HBasePipeConversions
+
+// XXX: class BibjsonScorable extends Scorable with HBasePipeConversions {
+
+class BibjsonScorable extends Scorable {
+
+  def getSource(args : Args) : Source = {
+    TextLine(args("bibjson-input"))
+  }
+
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+    getSource(args).read
+      .toTypedPipe[String](new Fields("line"))
+      .map { BibjsonScorable.bibjsonToMapFeatures(_) }
+  }
+}
+
+object BibjsonScorable {
+  def bibjsonToMapFeatures(json : String) : MapFeatures = {
+    Scorable.jsonToMap(json) match {
+      case None => MapFeatures(Scorable.NoSlug, json)
+      case Some(map) => {
+        if (map contains "title") {
+          val title = Scorable.getString(map, "title")
+          val doi = Scorable.getString(map, "doi")
+          val sha1 = Scorable.getString(map, "sha")
+          // TODO: year, authors (if available)
+          if (title == null || title.isEmpty) {
+            new MapFeatures(Scorable.NoSlug, json)
+          } else {
+            val sf : ScorableFeatures = new ScorableFeatures(title=title, doi=doi, sha1=sha1)
+            new MapFeatures(sf.toSlug, sf.toString)
+          }
+        } else {
+          new MapFeatures(Scorable.NoSlug, json)
+        }
+      }
+    }
+  }
+}
-- 
cgit v1.2.3


From 6fbe71c0d36012d9096c1f5557aa64a53f6218d5 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 19 Aug 2018 19:20:05 -0700
Subject: local bibjson-to-bibjson matching job (no tests)

---
 .../main/scala/sandcrawler/MatchBenchmarkJob.scala | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala

(limited to 'scalding')

diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
new file mode 100644
index 0000000..1578258
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -0,0 +1,29 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class MatchBenchmarkJob(args: Args) extends JobBase(args) {
+  // TODO: Instantiate any subclass of Scorable specified in args.
+  val sc1 : Scorable = new BibjsonScorable()
+  val sc2 : Scorable = new BibjsonScorable()
+  val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson")))
+  val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson")))
+  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
+  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
+
+  pipe1.join(pipe2).map { entry =>
+    val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
+    new ReduceOutput(
+      slug,
+      Scorable.computeSimilarity(features1, features2),
+      features1.json,
+      features2.json)
+  }
+  //TypedTsv doesn't work over case classes.
+    .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
-- 
cgit v1.2.3


From e81774a66980ba17c42380884f39aa61b54e5eef Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 21 Aug 2018 21:22:19 -0700
Subject: BibjsonScorable: fix ScorableFeatures (after rebase)

---
 scalding/src/main/scala/sandcrawler/BibjsonScorable.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'scalding')

diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
index 7221a66..cdd598f 100644
--- a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -38,7 +38,7 @@ object BibjsonScorable {
           if (title == null || title.isEmpty) {
             new MapFeatures(Scorable.NoSlug, json)
           } else {
-            val sf : ScorableFeatures = new ScorableFeatures(title=title, doi=doi, sha1=sha1)
+            val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
             new MapFeatures(sf.toSlug, sf.toString)
           }
         } else {
-- 
cgit v1.2.3