You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by ja...@apache.org on 2015/01/18 03:45:38 UTC
bigtop git commit: BIGTOP-1586: BigPetStore-Spark only works on the
East Coast.
Repository: bigtop
Updated Branches:
refs/heads/master 4309762fb -> 5dd666198
BIGTOP-1586: BigPetStore-Spark only works on the East Coast.
Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/5dd66619
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/5dd66619
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/5dd66619
Branch: refs/heads/master
Commit: 5dd66619848fb87fd25410c6c0e19db908663dc4
Parents: 4309762
Author: jayunit100 <ja...@gmail.com>
Authored: Sat Jan 17 10:54:39 2015 -0600
Committer: jayunit100 <ja...@apache.org>
Committed: Sat Jan 17 21:45:27 2015 -0500
----------------------------------------------------------------------
.../apache/bigpetstore/spark/etl/ETLSuite.scala | 68 +++++++++++++++-----
1 file changed, 52 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/bigtop/blob/5dd66619/bigtop-bigpetstore/bigpetstore-spark/src/test/scala/org/apache/bigpetstore/spark/etl/ETLSuite.scala
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-spark/src/test/scala/org/apache/bigpetstore/spark/etl/ETLSuite.scala b/bigtop-bigpetstore/bigpetstore-spark/src/test/scala/org/apache/bigpetstore/spark/etl/ETLSuite.scala
index a7699ac..ca1cfdf 100644
--- a/bigtop-bigpetstore/bigpetstore-spark/src/test/scala/org/apache/bigpetstore/spark/etl/ETLSuite.scala
+++ b/bigtop-bigpetstore/bigpetstore-spark/src/test/scala/org/apache/bigpetstore/spark/etl/ETLSuite.scala
@@ -24,17 +24,30 @@ import java.util.Locale
import java.util.TimeZone
import org.apache.spark.{SparkContext, SparkConf}
-
import org.scalatest._
import org.scalatest.junit.JUnitRunner
import org.junit.runner.RunWith
import org.apache.bigtop.bigpetstore.spark.datamodel._
-// hack for running tests with Gradle
+/**
+ * This class tests that, when we read records from the generator, the
+ * data model classes are populated with correct information.
+ *
+ * This is a critical test, since all subsequent phases of BigPetStore will use the
+ * datamodel objects (i.e. Transaction, Customer, and so on) for the analytics which
+ * we do.
+ *
+ * Other BigPetStore unit tests may not need to mock / test data with this level of precision.
+ * RunWith annotation is just a hack for running tests with Gradle
+ */
@RunWith(classOf[JUnitRunner])
class IOUtilsSuite extends FunSuite with BeforeAndAfterAll {
+ /**
+ * TODO: We are using Option monads as a replacement for nulls.
+ * Let's move towards an immutable SparkContext instead, if possible.
+ */
var sc: Option[SparkContext] = None
var rawRecords: Option[Array[(Store, Location, Customer, Location, TransactionProduct)]] = None
var transactions: Option[Array[Transaction]] = None
@@ -66,28 +79,21 @@ class IOUtilsSuite extends FunSuite with BeforeAndAfterAll {
val cal2 = Calendar.getInstance(TimeZone.getTimeZone("America/New_York"), Locale.US)
val cal3 = Calendar.getInstance(TimeZone.getTimeZone("America/New_York"), Locale.US)
- // Calendar seems to interpet months as 0-11
- // ms are not in output we parse.
- // have to set ms to 0, otherwise calendar will
- // use current system's ms.
cal1.set(2015, 10, 3, 1, 8, 11)
- cal1.set(Calendar.MILLISECOND, 0)
cal2.set(2015, 10, 2, 17, 51, 37)
- cal2.set(Calendar.MILLISECOND, 0)
cal3.set(2015, 9, 12, 4, 29, 46)
- cal3.set(Calendar.MILLISECOND, 0)
rawRecords = Some(Array(
(stores(0), locations(0), customers(0), locations(3),
TransactionProduct(999L, 32L, 5L, cal1, "category=dry dog food;brand=Happy Pup;flavor=Fish & Potato;size=30.0;per_unit_cost=2.67;")),
(stores(1), locations(1), customers(0), locations(3),
- TransactionProduct(999L, 31L, 1L, cal2, "category=poop bags;brand=Dog Days;color=Blue;size=60.0;per_unit_cost=0.21;")),
+ TransactionProduct(999L, 31L, 1L, cal2, "category=poop bags;brand=Dog Days;color=Blue;size=60.0;per_unit_cost=0.21;")),
(stores(2), locations(2), customers(0), locations(3),
- TransactionProduct(999L, 30L, 6L, cal3, "category=dry cat food;brand=Feisty Feline;flavor=Chicken & Rice;size=14.0;per_unit_cost=2.14;"))))
+ TransactionProduct(999L, 30L, 6L, cal3, "category=dry cat food;brand=Feisty Feline;flavor=Chicken & Rice;size=14.0;per_unit_cost=2.14;"))))
transactions = Some(Array(
Transaction(999L, 31L, 1L, cal2, 0L),
@@ -99,17 +105,47 @@ class IOUtilsSuite extends FunSuite with BeforeAndAfterAll {
sc.get.stop()
}
- test("Parse Raw Data") {
+ test("Parsing Generated Strings into Transaction Objects") {
val rawRDD = sc.get.parallelize(rawLines)
- val rdds = SparkETL.parseRawData(rawRDD)
+ val expectedRecords = rawRecords.get
+
+ //Goal: Confirm that these RDDs are identical to the expected ones.
+ val rdd = SparkETL.parseRawData(rawRDD).collect
+
+ /**
+ * Assumption: the order of RDD elements will be the same as that of the mock records.
+ * This assumption seems to hold, but would probably break down if the input size were large
+ * or if this test were run on a distributed cluster.
+ */
+ for(i <- 0 to expectedRecords.length-1) {
+ val rawRecord = rdd(i)
+ val expectedRecord = expectedRecords(i)
+
+ //Store, Location, Customer, TransactionProduct
+ assert(rawRecord._1===expectedRecord._1)
+ assert(rawRecord._2===expectedRecord._2)
+ assert(rawRecord._3===expectedRecord._3)
+ assert(rawRecord._4===expectedRecord._4)
+
+ //Transaction
+ assert(rawRecord._5.customerId === expectedRecord._5.customerId)
+ assert(rawRecord._5.product === expectedRecord._5.product)
+ assert(rawRecord._5.storeId === expectedRecord._5.storeId)
+
+ //BIGTOP-1586 : We want granular assertions, and we don't care to compare millisecond timestamps.
+ assert(rawRecord._5.dateTime.getTime.getYear === expectedRecord._5.dateTime.getTime.getYear)
+ assert(rawRecord._5.dateTime.getTime.getMonth === expectedRecord._5.dateTime.getTime.getMonth)
+ assert(rawRecord._5.dateTime.getTime.getDay === expectedRecord._5.dateTime.getTime.getDay)
+ assert(rawRecord._5.dateTime.getTime.getHours === expectedRecord._5.dateTime.getTime.getHours)
+ assert(rawRecord._5.dateTime.getTime.getMinutes === expectedRecord._5.dateTime.getTime.getMinutes)
+ assert(rawRecord._5.dateTime.getTime.getSeconds=== expectedRecord._5.dateTime.getTime.getSeconds)
+ }
- assert(rdds.collect().toSet === rawRecords.get.toSet)
}
- test("Normalize Data") {
+ test("Generation of unique sets of transaction attributes") {
val rawRDD = sc.get.parallelize(rawRecords.get)
val rdds = SparkETL.normalizeData(rawRDD)
-
val locationRDD = rdds._1
val storeRDD = rdds._2
val customerRDD = rdds._3