quick and dirty GROBID XML dumper

author: Bryan Newbold <bnewbold@archive.org> 2018-10-30 14:40:53 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-10-30 14:43:57 -0700
commit: 3105e9fc5799063027f3273048eea27f906d4c66 (patch)
tree: 4ab60cfee1cbbcacd3e255d1cf7877cc10df67b5 /scalding
parent: 0bbe8e1f6689da846944d60a53e620adc2b7622b (diff)
download: sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.tar.gz
sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.zip
1 files changed, 41 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
new file mode 100644
index 0000000..1b178eb
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
@@ -0,0 +1,41 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+import scala.util.parsing.json.JSONObject
+
+// Dumps the SHA1 row key plus the file:cdx and grobid0:tei_xml columns as TSV
+// (column 1: key, column 2: JSON object). Used for partner delivery/sharing.
+class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  // Scan with SourceMode.SCAN_ALL, keeping only rows where both columns are non-null.
+  val metaPipe : TypedPipe[(String, String)] = HBaseBuilder.build(args("hbase-table"),
+                     args("zookeeper-hosts"),
+                     List("file:cdx", "grobid0:tei_xml"),
+                     SourceMode.SCAN_ALL)
+    .read
+    .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml"))
+    .filter { case (_, cdx, tei_xml) => cdx != null && tei_xml != null }
+    .map { case (key, cdx, tei_xml) =>
+      val pdfHash = Bytes.toString(key.copyBytes())
+      // Each JSON field is decoded from its own cell (not from the row key).
+      (pdfHash,
+       JSONObject(Map(
+          "pdf_hash" -> pdfHash,
+          "cdx_metadata" -> Bytes.toString(cdx.copyBytes()),
+          "tei_xml" -> Bytes.toString(tei_xml.copyBytes())
+        )).toString)
+    }
+
+  metaPipe.write(TypedTsv[(String,String)](args("output")))
+}
author	Bryan Newbold <bnewbold@archive.org>	2018-10-30 14:40:53 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2018-10-30 14:43:57 -0700
commit	3105e9fc5799063027f3273048eea27f906d4c66 (patch)
tree	4ab60cfee1cbbcacd3e255d1cf7877cc10df67b5 /scalding
parent	0bbe8e1f6689da846944d60a53e620adc2b7622b (diff)
download	sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.tar.gz sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.zip