From 3e33d60aac9db78d0458876fbe987627db222bbb Mon Sep 17 00:00:00 2001
From: Ellen Spertus
Date: Tue, 24 Jul 2018 11:53:58 -0700
Subject: grobidToSlug() seems to work, including parsing of valid JSON strings.

---
Review note: the added lines below were amended after the original commit
(grobidToSlug() and titleToSlug() no longer throw on malformed input, plus
regression tests). The blob ids on the "index" lines are the original ones.

 .../scala/sandcrawler/HBaseCrossrefScoreJob.scala  | 79 ++++++++++++++++++++++
 .../scala/sandcrawler/HBaseCrossrefScoreTest.scala | 86 ++++++++++++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
 create mode 100644 scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala

(limited to 'scalding/src')

diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
new file mode 100644
index 0000000..a22af81
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -0,0 +1,79 @@
+package sandcrawler
+
+import java.util.Properties
+
+import scala.util.parsing.json.JSON
+
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+/**
+ * Work-in-progress job: builds an HBase source from the `grobid-table` and
+ * `zookeeper-hosts` arguments for the `grobid0:tei_json` column and maps each `tei_json`
+ * value to a title slug. Nothing is written yet; the output step is still commented out.
+ */
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  // key is SHA1
+  val grobidSource = HBaseBuilder.build(
+    args("grobid-table"),
+    args("zookeeper-hosts"),
+    List("grobid0:tei_json"),
+    sourceMode = SourceMode.SCAN_ALL)
+
+  // NOTE(review): assumes 'tei_json arrives here as the JSON text itself; if the source
+  // hands back raw HBase bytes it needs converting first -- TODO confirm.
+  val grobidPipe = grobidSource
+    .read
+    .map('tei_json -> 'slug) {
+      json : String => HBaseCrossrefScore.grobidToSlug(json)}
+
+  /*
+  val crossrefSource = TextLine(args("input"))
+  val crossrefPipe = crossrefSource
+    .read
+    .map('line -> 'slug) {
+      json : String => crossrefToSlug(json)}
+
+
+  statusPipe.groupBy { identity }
+    .size
+    .debug
+    .write(TypedTsv[(Long,Long)](args("output")))
+  */
+}
+
+object HBaseCrossrefScore {
+  /**
+   * Extracts the slug of the top-level "title" field of a GROBID JSON document.
+   *
+   * Returns "" when there is no usable title: the input is null or not valid JSON, the
+   * top-level value is not an object, or "title" is missing or not a string. The job
+   * calls this from a per-record map, so it degrades to "" instead of throwing.
+   */
+  def grobidToSlug(json : String) : String = {
+    // https://stackoverflow.com/a/32717262/631051
+    Option(json).flatMap(JSON.parseFull) match {
+      // JSON object keys are always strings, so narrowing the erased Map is safe.
+      case Some(fields : Map[_, _]) =>
+        fields.asInstanceOf[Map[String, Any]].get("title") match {
+          // A type pattern also rejects a JSON null title, which parses to null.
+          case Some(title : String) => titleToSlug(title)
+          case _ => ""
+        }
+      case _ => ""
+    }
+  }
+
+  /** Returns the part of `title` before its first colon, or all of it if there is none. */
+  def titleToSlug(title : String) : String = {
+    // Not split(":")(0): split drops trailing empty strings, so a title made only of
+    // colons gives an empty array and indexing it throws.
+    title.takeWhile(_ != ':')
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
new file mode 100644
index 0000000..186bb70
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -0,0 +1,86 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
+  val GrobidString = """
+{
+  "title": "Dummy Example File",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // Replaces every occurrence of "title", so the citation titles are renamed too.
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+
+  "titleToSlug()" should "extract the parts of titles before a colon" in {
+    val slug = HBaseCrossrefScore.titleToSlug("hello:there")
+    slug shouldBe "hello"
+  }
+  it should "extract an entire colon-less string" in {
+    val slug = HBaseCrossrefScore.titleToSlug("hello there")
+    slug shouldBe "hello there"
+  }
+  it should "return empty string for a title that is only a colon" in {
+    val slug = HBaseCrossrefScore.titleToSlug(":")
+    slug shouldBe ""
+  }
+
+  "grobidToSlug()" should "get the right slug for a grobid json string" in {
+    val slug = HBaseCrossrefScore.grobidToSlug(GrobidString)
+    slug shouldBe "Dummy Example File"
+  }
+
+  it should "return empty string for a grobid json string without a title" in {
+    val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
+    slug shouldBe ""
+  }
+  it should "return empty string for a string that is not valid json" in {
+    val slug = HBaseCrossrefScore.grobidToSlug("not json")
+    slug shouldBe ""
+  }
+  it should "return empty string when the title is not a json string" in {
+    val slug = HBaseCrossrefScore.grobidToSlug("""{"title": null}""")
+    slug shouldBe ""
+  }
+}
--
cgit v1.2.3