aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-07-19 16:26:00 -0700
committerBryan Newbold <bnewbold@archive.org>2018-07-24 11:27:45 -0700
commitc36e59bf03e692d22d6d72aa5ae37977e3a13524 (patch)
tree94a2e2f025e930a2d3d5c683a3b99f54c1cec5a6 /scalding
parent5883c665b9c6d1d27fe4b0cfd98098f671a5f5dd (diff)
downloadsandcrawler-c36e59bf03e692d22d6d72aa5ae37977e3a13524.tar.gz
sandcrawler-c36e59bf03e692d22d6d72aa5ae37977e3a13524.zip
CdxBackfillJob: implement other fields
Diffstat (limited to 'scalding')
-rw-r--r--scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala34
-rw-r--r--scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala69
2 files changed, 84 insertions, 19 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index 5ff667d..0af3c9c 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -10,6 +10,7 @@ import cascading.tap.SinkMode
import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions}
+import scala.util.parsing.json.JSONObject
// Type that represents a raw parsed CDX line
case class CdxLine(surt: String,
@@ -53,6 +54,7 @@ class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions
val cdxRows : TypedPipe[(String, String, String, String)] = cdxLines
.map { CdxBackfillJob.cdxLineToRow }
+ .debug
val existingKeys : TypedPipe[String] = hbaseSource
.read
@@ -67,7 +69,7 @@ class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions
.rightJoin(cdxRows.groupBy(_._1))
.toTypedPipe
.collect { case (_, (None, row)) => row }
- //.debug
+ .debug
// convert to tuple form and write out into HBase
newRows
@@ -149,17 +151,29 @@ object CdxBackfillJob {
// TODO: timezones? UTC to UTC, so I don't think so.
val dtIso = isoFormat.format(dtFormat.parse(line.datetime))
- // warc_file = warc.split('/')[-1]
- // dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
- // f:c = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
-
// This is the "f:c" field. 'i' intentionally not set
- val heritrixInfo = ""
-
- // file:cdx = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+ // python: f:c = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
+ // python: warc_file = warc.split('/')[-1]
+ // python: dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
+ val heritrixInfo = JSONObject(Map(
+ "u" -> line.url,
+ "d" -> dtIso,
+ "f" -> warcFile,
+ "o" -> line.offset.toInt,
+ "c" -> line.c_size.toInt
+ ))
+
+ // python: dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
// offset=int(offset), warc=warc)
- val fileCdx = ""
- (key, heritrixInfo, fileCdx, line.mime)
+ val fileCdx = JSONObject(Map(
+ "surt" -> line.surt,
+ "dt" -> line.datetime,
+ "url" -> line.url,
+ "c_size" -> line.c_size.toInt,
+ "offset" -> line.offset.toInt,
+ "warc" -> line.warc
+ ))
+ (key, heritrixInfo.toString(), fileCdx.toString(), line.mime)
}
def lineToCdxLine(line: String) : CdxLine = {
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
index f0a024a..fb5b162 100644
--- a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
@@ -12,6 +12,7 @@ import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory
import parallelai.spyglass.hbase.HBaseSource
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import scala.util.parsing.json.JSON
class CdxBackfillTest extends FlatSpec with Matchers {
@@ -30,7 +31,7 @@ class CdxBackfillTest extends FlatSpec with Matchers {
it should "filter CDX lines" in {
assert(true === keepCdx(lineToCdxLine(
- "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
// redirect
assert(false === keepCdx(lineToCdxLine(
"edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
@@ -53,8 +54,33 @@ class CdxBackfillTest extends FlatSpec with Matchers {
}
it should "execute lineToRow" in {
- cdxLineToRow(lineToCdxLine(
- "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // this particular test copied from python test_backfill_hbase_from_cdx.py
+ val row = cdxLineToRow(lineToCdxLine(
+ "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1 20170705062202 http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1 application/PDF 200 MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J - - 854156 328850624 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+
+ assert(row._1 == "sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J")
+ JSON.parseFull(row._2) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("f") == "CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 854156)
+ assert(obj("o") == 328850624)
+ assert(obj("d") == "2017-08-28T23:31:54Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row._3) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1")
+ assert(obj("dt") == "20170705062202")
+ assert(obj("url") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("c_size") == 854156)
+ assert(obj("offset") == 328850624)
+ assert(obj("warc") == "CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row._3 == "application/pdf")
}
}
@@ -76,15 +102,15 @@ class CdxBackfillJobTest extends FunSpec with TupleConversions {
)
val sampleCdxLines = List(
// clean line
- "0" -> "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "0" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
// has existing SHA1
- "1" -> "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "1" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
// HTTP status code
- "2" -> "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "2" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
// not CDX (prefixed with hash)
- "3" -> "#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "3" -> """#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
// not PDF
- "4" -> "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/film 200 AAAAAEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ "4" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/film 200 AAAAAEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""
)
JobTest("sandcrawler.CdxBackfillJob")
@@ -100,12 +126,37 @@ class CdxBackfillJobTest extends FunSpec with TupleConversions {
.sink[(ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable)](CdxBackfillJob.getHBaseSink(testTable, testHost)) {
outputBuffer =>
+ val buf0 = outputBuffer(0)
+ val row0 = List(buf0._1, buf0._2, buf0._3, buf0._4).map(b => Bytes.toString(b.copyBytes()))
+
it("should return a 1-element list (after join).") {
assert(outputBuffer.size === 1)
}
it("should insert the valid, new CDX line") {
- assert(outputBuffer(0)._1 == Bytes.toBytes("sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G"))
+ assert(row0(0) == "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G")
+ JSON.parseFull(row0(1)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("f") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 210251)
+ assert(obj("o") == 931661233)
+ assert(obj("d") == "2017-08-28T23:31:54Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row0(2)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("dt") == "20170828233154")
+ assert(obj("url") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("c_size") == 210251)
+ assert(obj("offset") == 931661233)
+ assert(obj("warc") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row0(3) == "application/pdf")
}
}
.run