You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@carbondata.apache.org by jackylk <gi...@git.apache.org> on 2018/03/30 07:27:18 UTC
[GitHub] carbondata pull request #1713: [CARBONDATA-1899] Optimize CarbonData concurr...
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/1713#discussion_r178245698
--- Diff: examples/spark2/src/main/scala/org/apache/carbondata/benchmark/ConcurrentQueryBenchmark.scala ---
@@ -0,0 +1,631 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.benchmark
+
+import java.io.File
+import java.text.SimpleDateFormat
+import java.util
+import java.util.Date
+import java.util.concurrent.{Callable, Executors, Future, TimeUnit}
+
+import scala.util.Random
+
+import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
+import org.apache.spark.sql.types._
+
+import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonVersionConstants}
+import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil}
+
+// scalastyle:off println
+/**
+ * Test concurrent query performance of CarbonData
+ *
+ * This benchmark will print out some information:
+ * 1.Environment information
+ * 2.Parameters information
+ * 3.concurrent query performance result using parquet format
+ * 4.concurrent query performance result using CarbonData format
+ *
+ * By default this benchmark runs in local mode;
+ * set 'runInLocal' to false to run it on a cluster.
+ * The variables can be overridden with command-line arguments, for example:
+ *
+ * spark-submit \
+ --class org.apache.carbondata.benchmark.ConcurrentQueryBenchmark \
+ --master yarn \
+ --deploy-mode client \
+ --driver-memory 16g \
+ --executor-cores 4 \
+ --executor-memory 24g \
+ --num-executors 3 \
+ concurrencyTest.jar \
+ totalNum threadNum taskNum resultIsEmpty runInLocal generateFile deleteFile
+ * See the initParameters method of this benchmark for details on each argument.
+ */
+object ConcurrentQueryBenchmark {
+
+ // amount of data (number of records) to generate
+ var totalNum = 1 * 1000 * 1000
+ // number of threads in the thread pool
+ var threadNum = 16
+ // number of Spark SQL query tasks
+ var taskNum = 100
+ // whether the query result is empty; if true, the result is empty
+ var resultIsEmpty = true
+ // path where the task details are stored
+ var path: String = "/tmp/carbondata"
+ // whether to run in local mode (true) or on a cluster (false)
+ var runInLocal = true
+ // whether to generate a new file
+ var generateFile = true
+ // whether to delete the file
+ var deleteFile = true
+
+ val cardinalityId = 100 * 1000 * 1000
+ val cardinalityCity = 6
+
+ def parquetTableName: String = "Num" + totalNum + "_" + "comparetest_parquet"
+
+ def orcTableName: String = "Num" + totalNum + "_" + "comparetest_orc"
+
+ def carbonTableName(version: String): String =
+ "Num" + totalNum + "_" + s"comparetest_carbonV$version"
+
+ // Table schema:
+ // +-------------+-----------+-------------+-------------+------------+
+ // | Column name | Data type | Cardinality | Column type | Dictionary |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | id | string | 100,000,000 | dimension | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | city | string | 6 | dimension | yes |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | country | string | 6 | dimension | yes |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | planet | string | 10,007 | dimension | yes |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m1 | short | NA | measure | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m2 | int | NA | measure | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m3 | big int | NA | measure | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m4 | double | NA | measure | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m5 | decimal | NA | measure | no |
+ // +-------------+-----------+-------------+-------------+------------+
+ /**
+ * generate DataFrame with above table schema
+ *
+ * @param spark SparkSession
+ * @return Dataframe of test data
+ */
+ private def generateDataFrame(spark: SparkSession): DataFrame = {
--- End diff --
It would be better to move this to a separate class in the benchmark package so that it can be reused by both benchmark classes.
---