aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-10-30 14:40:53 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-10-30 14:43:57 -0700
commit 3105e9fc5799063027f3273048eea27f906d4c66 (patch)
tree 4ab60cfee1cbbcacd3e255d1cf7877cc10df67b5
parent 0bbe8e1f6689da846944d60a53e620adc2b7622b (diff)
download sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.tar.gz
sandcrawler-3105e9fc5799063027f3273048eea27f906d4c66.zip
quick and dirty GROBID XML dumper
-rw-r--r-- scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala 41
1 files changed, 41 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
new file mode 100644
index 0000000..1b178eb
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
@@ -0,0 +1,41 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+import scala.util.parsing.json.JSONObject
+
+// Dumps the SHA1 row key plus the file:cdx and grobid0:tei_xml columns as a
+// two-column TSV: the key, then a JSON object with "pdf_hash", "cdx_metadata"
+// and "tei_xml" fields. Used for partner delivery/sharing.
+class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val metaPipe : TypedPipe[(String, String)] = HBaseBuilder.build(args("hbase-table"),
+                     args("zookeeper-hosts"),
+                     List("file:cdx", "grobid0:tei_xml"),
+                     SourceMode.SCAN_ALL)
+    .read
+    .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml"))
+    .filter { case (_, cdx, tei_xml) => cdx != null && tei_xml != null } // need both columns
+    .map { case (key, cdx, tei_xml) =>
+      val sha1 = Bytes.toString(key.copyBytes())
+      // Each JSON field is decoded from its own column, not from the row key.
+      val json = JSONObject(Map(
+        "pdf_hash" -> sha1,
+        "cdx_metadata" -> Bytes.toString(cdx.copyBytes()),
+        "tei_xml" -> Bytes.toString(tei_xml.copyBytes())))
+      (sha1, json.toString)
+    };
+
+  metaPipe.write(TypedTsv[(String,String)](args("output")))
+
+}