Posted to commits@carbondata.apache.org by ja...@apache.org on 2018/03/28 03:19:42 UTC
[1/5] carbondata git commit: [CARBONDATA-2165] Remove spark in carbon-hadoop module
Repository: carbondata
Updated Branches:
refs/heads/master 2e1ddb542 -> c723947a7
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
deleted file mode 100644
index bc7b042..0000000
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.streaming
-
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, Sink}
-
-import org.apache.carbondata.core.constants.CarbonCommonConstants
-import org.apache.carbondata.core.datastore.impl.FileFactory
-import org.apache.carbondata.core.dictionary.server.{DictionaryServer, NonSecureDictionaryServer}
-import org.apache.carbondata.core.dictionary.service.NonSecureDictionaryServiceProvider
-import org.apache.carbondata.core.metadata.encoder.Encoding
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable
-import org.apache.carbondata.core.util.CarbonProperties
-import org.apache.carbondata.core.util.path.CarbonTablePath
-import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
-import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostExecutionEvent, LoadTablePreExecutionEvent}
-import org.apache.carbondata.processing.loading.model.{CarbonLoadModel, CarbonLoadModelBuilder, LoadOption}
-import org.apache.carbondata.spark.dictionary.provider.SecureDictionaryServiceProvider
-import org.apache.carbondata.spark.dictionary.server.SecureDictionaryServer
-import org.apache.carbondata.streaming.segment.StreamSegment
-
-/**
- * Stream sink factory
- */
-object StreamSinkFactory {
-
- def createStreamTableSink(
- sparkSession: SparkSession,
- hadoopConf: Configuration,
- carbonTable: CarbonTable,
- parameters: Map[String, String]): Sink = {
- validateParameters(parameters)
-
- // build load model
- val carbonLoadModel = buildCarbonLoadModelForStream(
- sparkSession,
- hadoopConf,
- carbonTable,
- parameters,
- "")
- // fire pre event before streaming is started
- // in case of streaming, options and optionsFinal can be the same
- val operationContext = new OperationContext
- val loadTablePreExecutionEvent = new LoadTablePreExecutionEvent(
- carbonTable.getCarbonTableIdentifier,
- carbonLoadModel,
- carbonLoadModel.getFactFilePath,
- false,
- parameters.asJava,
- parameters.asJava,
- false
- )
- OperationListenerBus.getInstance().fireEvent(loadTablePreExecutionEvent, operationContext)
- // prepare the stream segment
- val segmentId = getStreamSegmentId(carbonTable)
- carbonLoadModel.setSegmentId(segmentId)
-
- // start server if necessary
- val server = startDictionaryServer(
- sparkSession,
- carbonTable,
- carbonLoadModel)
- if (server.isDefined) {
- carbonLoadModel.setUseOnePass(true)
- } else {
- carbonLoadModel.setUseOnePass(false)
- }
- // default is carbon appendable stream sink
- val carbonAppendableStreamSink = new CarbonAppendableStreamSink(
- sparkSession,
- carbonTable,
- segmentId,
- parameters,
- carbonLoadModel,
- server)
-
- // fire post event before streaming is started
- val loadTablePostExecutionEvent = new LoadTablePostExecutionEvent(
- carbonTable.getCarbonTableIdentifier,
- carbonLoadModel
- )
- OperationListenerBus.getInstance().fireEvent(loadTablePostExecutionEvent, operationContext)
- carbonAppendableStreamSink
- }
-
- private def validateParameters(parameters: Map[String, String]): Unit = {
- val segmentSize = parameters.get(CarbonCommonConstants.HANDOFF_SIZE)
- if (segmentSize.isDefined) {
- try {
- val value = java.lang.Long.parseLong(segmentSize.get)
- if (value < CarbonCommonConstants.HANDOFF_SIZE_MIN) {
- new CarbonStreamException(CarbonCommonConstants.HANDOFF_SIZE +
- "should be bigger than or equal " +
- CarbonCommonConstants.HANDOFF_SIZE_MIN)
- }
- } catch {
- case _: NumberFormatException =>
- new CarbonStreamException(CarbonCommonConstants.HANDOFF_SIZE +
- s" $segmentSize is an illegal number")
- }
- }
- }
-
- /**
- * get current stream segment id
- * @return
- */
- private def getStreamSegmentId(carbonTable: CarbonTable): String = {
- val segmentId = StreamSegment.open(carbonTable)
- val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId)
- val fileType = FileFactory.getFileType(segmentDir)
- if (!FileFactory.isFileExist(segmentDir, fileType)) {
- // Create table directory path, in case of enabling hive metastore first load may not have
- // table folder created.
- FileFactory.mkdirs(segmentDir, fileType)
- }
- if (FileFactory.isFileExist(segmentDir, fileType)) {
- // recover fault
- StreamSegment.recoverSegmentIfRequired(segmentDir)
- } else {
- FileFactory.mkdirs(segmentDir, fileType)
- }
- segmentId
- }
-
- def startDictionaryServer(
- sparkSession: SparkSession,
- carbonTable: CarbonTable,
- carbonLoadModel: CarbonLoadModel): Option[DictionaryServer] = {
- // start dictionary server when using one-pass load and a dimension with DICTIONARY
- // encoding is present.
- val allDimensions = carbonTable.getAllDimensions.asScala.toList
- val createDictionary = allDimensions.exists {
- carbonDimension => carbonDimension.hasEncoding(Encoding.DICTIONARY) &&
- !carbonDimension.hasEncoding(Encoding.DIRECT_DICTIONARY)
- }
- val carbonSecureModeDictServer = CarbonProperties.getInstance.
- getProperty(CarbonCommonConstants.CARBON_SECURE_DICTIONARY_SERVER,
- CarbonCommonConstants.CARBON_SECURE_DICTIONARY_SERVER_DEFAULT)
-
- val sparkConf = sparkSession.sqlContext.sparkContext.getConf
- val sparkDriverHost = sparkSession.sqlContext.sparkContext.
- getConf.get("spark.driver.host")
-
- val server: Option[DictionaryServer] = if (createDictionary) {
- if (sparkConf.get("spark.authenticate", "false").equalsIgnoreCase("true") &&
- carbonSecureModeDictServer.toBoolean) {
- val dictionaryServer = SecureDictionaryServer.getInstance(sparkConf,
- sparkDriverHost.toString, carbonLoadModel.getDictionaryServerPort, carbonTable)
- carbonLoadModel.setDictionaryServerPort(dictionaryServer.getPort)
- carbonLoadModel.setDictionaryServerHost(dictionaryServer.getHost)
- carbonLoadModel.setDictionaryServerSecretKey(dictionaryServer.getSecretKey)
- carbonLoadModel.setDictionaryEncryptServerSecure(dictionaryServer.isEncryptSecureServer)
- carbonLoadModel.setDictionaryServiceProvider(new SecureDictionaryServiceProvider())
- sparkSession.sparkContext.addSparkListener(new SparkListener() {
- override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd) {
- dictionaryServer.shutdown()
- }
- })
- Some(dictionaryServer)
- } else {
- val dictionaryServer = NonSecureDictionaryServer
- .getInstance(carbonLoadModel.getDictionaryServerPort, carbonTable)
- carbonLoadModel.setDictionaryServerPort(dictionaryServer.getPort)
- carbonLoadModel.setDictionaryServerHost(dictionaryServer.getHost)
- carbonLoadModel.setDictionaryEncryptServerSecure(false)
- carbonLoadModel
- .setDictionaryServiceProvider(new NonSecureDictionaryServiceProvider(dictionaryServer
- .getPort))
- sparkSession.sparkContext.addSparkListener(new SparkListener() {
- override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd) {
- dictionaryServer.shutdown()
- }
- })
- Some(dictionaryServer)
- }
- } else {
- None
- }
- server
- }
-
- private def buildCarbonLoadModelForStream(
- sparkSession: SparkSession,
- hadoopConf: Configuration,
- carbonTable: CarbonTable,
- parameters: Map[String, String],
- segmentId: String): CarbonLoadModel = {
- val carbonProperty: CarbonProperties = CarbonProperties.getInstance()
- carbonProperty.addProperty("zookeeper.enable.lock", "false")
- val optionsFinal = LoadOption.fillOptionWithDefaultValue(parameters.asJava)
- optionsFinal.put("sort_scope", "no_sort")
- if (parameters.get("fileheader").isEmpty) {
- optionsFinal.put("fileheader", carbonTable.getCreateOrderColumn(carbonTable.getTableName)
- .asScala.map(_.getColName).mkString(","))
- }
- val carbonLoadModel = new CarbonLoadModel()
- new CarbonLoadModelBuilder(carbonTable).build(
- parameters.asJava,
- optionsFinal,
- carbonLoadModel,
- hadoopConf)
- carbonLoadModel.setSegmentId(segmentId)
- // stream should use one pass
- val dictionaryServerPort = parameters.getOrElse(
- CarbonCommonConstants.DICTIONARY_SERVER_PORT,
- carbonProperty.getProperty(
- CarbonCommonConstants.DICTIONARY_SERVER_PORT,
- CarbonCommonConstants.DICTIONARY_SERVER_PORT_DEFAULT))
- val sparkDriverHost = sparkSession.sqlContext.sparkContext.
- getConf.get("spark.driver.host")
- carbonLoadModel.setDictionaryServerHost(sparkDriverHost)
- carbonLoadModel.setDictionaryServerPort(dictionaryServerPort.toInt)
- carbonLoadModel
- }
-}
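For reference, the sink built by the StreamSinkFactory above is normally reached through Spark Structured Streaming's writeStream API. A minimal Scala sketch, assuming the CarbonData sink is registered under the "carbondata" format name and accepts dbName/tableName options (those names are assumptions, not part of this diff):

    // Hedged sketch: start a streaming ingest whose sink is created by
    // StreamSinkFactory.createStreamTableSink (format/option names assumed).
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("carbon-stream-ingest").getOrCreate()

    val query = spark.readStream
      .format("socket")                          // any streaming source
      .option("host", "localhost")
      .option("port", "9099")
      .load()
      .writeStream
      .format("carbondata")                      // assumed sink format name
      .option("checkpointLocation", "/tmp/carbon-ckpt")
      .option("dbName", "default")               // assumed option keys
      .option("tableName", "stream_table")
      .start()

    query.awaitTermination()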
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
new file mode 100644
index 0000000..8661417
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming.parser
+
+import java.nio.charset.Charset
+import java.text.SimpleDateFormat
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+
+object FieldConverter {
+
+ /**
+ * Return a String representation of the input value
+ * @param value input value
+ * @param serializationNullFormat string for null value
+ * @param delimiterLevel1 level 1 delimiter for complex type
+ * @param delimiterLevel2 level 2 delimiter for complex type
+ * @param timeStampFormat timestamp format
+ * @param dateFormat date format
+ * @param level level for recursive call
+ */
+ def objectToString(
+ value: Any,
+ serializationNullFormat: String,
+ delimiterLevel1: String,
+ delimiterLevel2: String,
+ timeStampFormat: SimpleDateFormat,
+ dateFormat: SimpleDateFormat,
+ level: Int = 1): String = {
+ if (value == null) {
+ serializationNullFormat
+ } else {
+ value match {
+ case s: String => if (s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
+ throw new Exception("Dataload failed, String length cannot exceed " +
+ CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters")
+ } else {
+ s
+ }
+ case d: java.math.BigDecimal => d.toPlainString
+ case i: java.lang.Integer => i.toString
+ case d: java.lang.Double => d.toString
+ case t: java.sql.Timestamp => timeStampFormat format t
+ case d: java.sql.Date => dateFormat format d
+ case b: java.lang.Boolean => b.toString
+ case s: java.lang.Short => s.toString
+ case f: java.lang.Float => f.toString
+ case bs: Array[Byte] => new String(bs,
+ Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET))
+ case s: scala.collection.Seq[Any] =>
+ val delimiter = if (level == 1) {
+ delimiterLevel1
+ } else {
+ delimiterLevel2
+ }
+ val builder = new StringBuilder()
+ s.foreach { x =>
+ builder.append(objectToString(x, serializationNullFormat, delimiterLevel1,
+ delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter)
+ }
+ builder.substring(0, builder.length - delimiter.length())
+ case m: scala.collection.Map[Any, Any] =>
+ throw new Exception("Unsupported data type: Map")
+ case r: org.apache.spark.sql.Row =>
+ val delimiter = if (level == 1) {
+ delimiterLevel1
+ } else {
+ delimiterLevel2
+ }
+ val builder = new StringBuilder()
+ for (i <- 0 until r.length) {
+ builder.append(objectToString(r(i), serializationNullFormat, delimiterLevel1,
+ delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter)
+ }
+ builder.substring(0, builder.length - delimiter.length())
+ case other => other.toString
+ }
+ }
+ }
+}
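A small usage sketch for the new FieldConverter.objectToString (the delimiters and values below are illustrative only):

    import java.text.SimpleDateFormat
    import org.apache.carbondata.streaming.parser.FieldConverter

    val timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd")

    // A nested Seq is joined with the level-1 delimiter at the top level and
    // the level-2 delimiter one level down; null becomes the null format.
    val text = FieldConverter.objectToString(
      value = Seq("a", "b", Seq(1, 2)),
      serializationNullFormat = "\\N",
      delimiterLevel1 = "$",
      delimiterLevel2 = ":",
      timeStampFormat = timestampFormat,
      dateFormat = dateFormat)
    // text == "a$b$1:2"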
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
index 5a227cf..1696fdc 100644
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
@@ -17,6 +17,7 @@
package org.apache.carbondata.streaming.parser
+import java.nio.charset.Charset
import java.text.SimpleDateFormat
import org.apache.hadoop.conf.Configuration
@@ -27,7 +28,6 @@ import org.apache.spark.sql.types.StructType
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants
-import org.apache.carbondata.spark.util.CarbonScalaUtil
/**
* SparkSQL Row Stream Parser.
@@ -61,12 +61,13 @@ class RowStreamParserImp extends CarbonStreamParser {
override def parserRow(value: InternalRow): Array[Object] = {
this.encoder.fromRow(value).toSeq.map { x => {
- CarbonScalaUtil.getString(x,
- serializationNullFormat, complexDelimiterLevel1, complexDelimiterLevel2,
+ FieldConverter.objectToString(
+ x, serializationNullFormat, complexDelimiterLevel1, complexDelimiterLevel2,
timeStampFormat, dateFormat)
} }.toArray
}
override def close(): Unit = {
}
+
}
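RowStreamParserImp above is the default row parser; as the CarbonAppendableStreamSink diff below shows, the parser class is looked up from the configuration key CarbonStreamParser.CARBON_STREAM_PARSER, so a custom parser can be supplied as a stream option. A hedged sketch (the "carbondata" format name and the parser class are assumptions/hypothetical):

    import org.apache.spark.sql.SparkSession
    import org.apache.carbondata.streaming.parser.CarbonStreamParser

    val spark = SparkSession.builder().getOrCreate()
    val input = spark.readStream.format("rate").load()    // any streaming source

    val query = input.writeStream
      .format("carbondata")                                // assumed format name
      .option("checkpointLocation", "/tmp/carbon-ckpt")
      .option(CarbonStreamParser.CARBON_STREAM_PARSER,
        "org.example.MyRowStreamParser")                   // hypothetical parser class
      .start()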
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
deleted file mode 100644
index 6e6d092..0000000
--- a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.streaming
-
-import java.util.Date
-
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
-import org.apache.spark.TaskContext
-import org.apache.spark.internal.io.FileCommitProtocol
-import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.{SerializableConfiguration, Utils}
-
-import org.apache.carbondata.common.CarbonIterator
-import org.apache.carbondata.common.logging.LogServiceFactory
-import org.apache.carbondata.core.constants.CarbonCommonConstants
-import org.apache.carbondata.core.datastore.impl.FileFactory
-import org.apache.carbondata.core.dictionary.server.DictionaryServer
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable
-import org.apache.carbondata.core.stats.QueryStatistic
-import org.apache.carbondata.core.util.CarbonProperties
-import org.apache.carbondata.core.util.path.CarbonTablePath
-import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
-import org.apache.carbondata.hadoop.streaming.CarbonStreamOutputFormat
-import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
-import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants
-import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostExecutionEvent, LoadTablePreExecutionEvent}
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel
-import org.apache.carbondata.streaming.{CarbonStreamException, StreamHandoffRDD}
-import org.apache.carbondata.streaming.parser.CarbonStreamParser
-import org.apache.carbondata.streaming.segment.StreamSegment
-
-/**
- * an implementation of a stream sink; it persists each batch to disk by appending the batch data to
- * data files.
- */
-class CarbonAppendableStreamSink(
- sparkSession: SparkSession,
- val carbonTable: CarbonTable,
- var currentSegmentId: String,
- parameters: Map[String, String],
- carbonLoadModel: CarbonLoadModel,
- server: Option[DictionaryServer]) extends Sink {
-
- private val fileLogPath = CarbonTablePath.getStreamingLogDir(carbonTable.getTablePath)
- private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, fileLogPath)
- // prepare configuration
- private val hadoopConf = {
- val conf = sparkSession.sessionState.newHadoopConf()
- // put all parameters into hadoopConf
- parameters.foreach { entry =>
- conf.set(entry._1, entry._2)
- }
- // properties below will be used for default CarbonStreamParser
- conf.set("carbon_complex_delimiter_level_1",
- carbonLoadModel.getComplexDelimiterLevel1)
- conf.set("carbon_complex_delimiter_level_2",
- carbonLoadModel.getComplexDelimiterLevel2)
- conf.set(
- DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT,
- carbonLoadModel.getSerializationNullFormat().split(",")(1))
- conf.set(
- CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
- carbonLoadModel.getTimestampformat())
- conf.set(
- CarbonCommonConstants.CARBON_DATE_FORMAT,
- carbonLoadModel.getDateFormat())
- conf
- }
- // segment max size (bytes)
- private val segmentMaxSize = hadoopConf.getLong(
- CarbonCommonConstants.HANDOFF_SIZE,
- CarbonProperties.getInstance().getHandoffSize
- )
-
- // auto handoff
- private val enableAutoHandoff = hadoopConf.getBoolean(
- CarbonCommonConstants.ENABLE_AUTO_HANDOFF,
- CarbonProperties.getInstance().isEnableAutoHandoff
- )
-
- override def addBatch(batchId: Long, data: DataFrame): Unit = {
- if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
- CarbonAppendableStreamSink.LOGGER.info(s"Skipping already committed batch $batchId")
- } else {
-
- val statistic = new QueryStatistic()
-
- // fire pre event on every batch add
- // in case of streaming, options and optionsFinal can be the same
- val operationContext = new OperationContext
- val loadTablePreExecutionEvent = new LoadTablePreExecutionEvent(
- carbonTable.getCarbonTableIdentifier,
- carbonLoadModel,
- carbonLoadModel.getFactFilePath,
- false,
- parameters.asJava,
- parameters.asJava,
- false
- )
- OperationListenerBus.getInstance().fireEvent(loadTablePreExecutionEvent, operationContext)
- checkOrHandOffSegment()
-
- // committer will record how this spark job commits its output
- val committer = FileCommitProtocol.instantiate(
- className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
- jobId = batchId.toString,
- outputPath = fileLogPath,
- isAppend = false)
-
- committer match {
- case manifestCommitter: ManifestFileCommitProtocol =>
- manifestCommitter.setupManifestOptions(fileLog, batchId)
- case _ => // Do nothing
- }
-
- CarbonAppendableStreamSink.writeDataFileJob(
- sparkSession,
- carbonTable,
- parameters,
- batchId,
- currentSegmentId,
- data.queryExecution,
- committer,
- hadoopConf,
- carbonLoadModel,
- server)
- // fire post event on every batch add
- val loadTablePostExecutionEvent = new LoadTablePostExecutionEvent(
- carbonTable.getCarbonTableIdentifier,
- carbonLoadModel
- )
- OperationListenerBus.getInstance().fireEvent(loadTablePostExecutionEvent, operationContext)
-
- statistic.addStatistics(s"add batch: $batchId", System.currentTimeMillis())
- CarbonAppendableStreamSink.LOGGER.info(
- s"${statistic.getMessage}, taken time(ms): ${statistic.getTimeTaken}")
- }
- }
-
- /**
- * if the directory size of the current segment is beyond the threshold, hand off a new segment
- */
- private def checkOrHandOffSegment(): Unit = {
- val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, currentSegmentId)
- val fileType = FileFactory.getFileType(segmentDir)
- if (segmentMaxSize <= StreamSegment.size(segmentDir)) {
- val newSegmentId = StreamSegment.close(carbonTable, currentSegmentId)
- currentSegmentId = newSegmentId
- val newSegmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, currentSegmentId)
- FileFactory.mkdirs(newSegmentDir, fileType)
-
- // TODO trigger hand off operation
- if (enableAutoHandoff) {
- StreamHandoffRDD.startStreamingHandoffThread(
- carbonLoadModel,
- sparkSession,
- false)
- }
- }
- }
-}
-
-object CarbonAppendableStreamSink {
-
- private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
-
- /**
- * package the hadoop configuration; it will be passed from the driver side to the executor side
- */
- case class WriteDataFileJobDescription(
- serializableHadoopConf: SerializableConfiguration,
- batchId: Long,
- segmentId: String)
-
- /**
- * Run a spark job to append the newly arrived data to the existing row format
- * file directly.
- * If there is a failure in the task, spark will re-try the task and
- * carbon will recover by truncating the HDFS file. (see StreamSegment.tryRecoverFromTaskFault)
- * If there is a job level failure, every file in the stream segment will be truncated
- * if necessary. (see StreamSegment.tryRecoverFromJobFault)
- */
- def writeDataFileJob(
- sparkSession: SparkSession,
- carbonTable: CarbonTable,
- parameters: Map[String, String],
- batchId: Long,
- segmentId: String,
- queryExecution: QueryExecution,
- committer: FileCommitProtocol,
- hadoopConf: Configuration,
- carbonLoadModel: CarbonLoadModel,
- server: Option[DictionaryServer]): Unit = {
-
- // create job
- val job = Job.getInstance(hadoopConf)
- job.setOutputKeyClass(classOf[Void])
- job.setOutputValueClass(classOf[InternalRow])
- val jobId = CarbonInputFormatUtil.getJobId(new Date, batchId.toInt)
- job.setJobID(jobId)
-
- val description = WriteDataFileJobDescription(
- serializableHadoopConf = new SerializableConfiguration(job.getConfiguration),
- batchId,
- segmentId
- )
-
- // run write data file job
- SQLExecution.withNewExecutionId(sparkSession, queryExecution) {
- var result: Array[TaskCommitMessage] = null
- try {
- committer.setupJob(job)
- // initialize dictionary server
- if (server.isDefined) {
- server.get.initializeDictionaryGenerator(carbonTable)
- }
-
- val rowSchema = queryExecution.analyzed.schema
- // write data file
- result = sparkSession.sparkContext.runJob(queryExecution.toRdd,
- (taskContext: TaskContext, iterator: Iterator[InternalRow]) => {
- writeDataFileTask(
- description,
- carbonLoadModel,
- sparkStageId = taskContext.stageId(),
- sparkPartitionId = taskContext.partitionId(),
- sparkAttemptNumber = taskContext.attemptNumber(),
- committer,
- iterator,
- rowSchema
- )
- })
-
- // write dictionary
- if (server.isDefined) {
- try {
- server.get.writeTableDictionary(carbonTable.getCarbonTableIdentifier.getTableId)
- } catch {
- case _: Exception =>
- LOGGER.error(
- s"Error while writing dictionary file for ${carbonTable.getTableUniqueName}")
- throw new Exception(
- "Streaming ingest failed due to error while writing dictionary file")
- }
- }
-
- // update data file info in index file
- StreamSegment.updateIndexFile(
- CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId))
-
- } catch {
- // catch fault of executor side
- case t: Throwable =>
- val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId)
- StreamSegment.recoverSegmentIfRequired(segmentDir)
- LOGGER.error(t, s"Aborting job ${ job.getJobID }.")
- committer.abortJob(job)
- throw new CarbonStreamException("Job failed to write data file", t)
- }
- committer.commitJob(job, result)
- LOGGER.info(s"Job ${ job.getJobID } committed.")
- }
- }
-
- /**
- * execute a task for each partition to write a data file
- */
- def writeDataFileTask(
- description: WriteDataFileJobDescription,
- carbonLoadModel: CarbonLoadModel,
- sparkStageId: Int,
- sparkPartitionId: Int,
- sparkAttemptNumber: Int,
- committer: FileCommitProtocol,
- iterator: Iterator[InternalRow],
- rowSchema: StructType
- ): TaskCommitMessage = {
-
- val jobId = CarbonInputFormatUtil.getJobId(new Date, sparkStageId)
- val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId)
- val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber)
-
- // Set up the attempt context required to use in the output committer.
- val taskAttemptContext: TaskAttemptContext = {
- // Set up the configuration object
- val hadoopConf = description.serializableHadoopConf.value
- CarbonStreamOutputFormat.setSegmentId(hadoopConf, description.segmentId)
- hadoopConf.set("mapred.job.id", jobId.toString)
- hadoopConf.set("mapred.tip.id", taskAttemptId.getTaskID.toString)
- hadoopConf.set("mapred.task.id", taskAttemptId.toString)
- hadoopConf.setBoolean("mapred.task.is.map", true)
- hadoopConf.setInt("mapred.task.partition", 0)
- new TaskAttemptContextImpl(hadoopConf, taskAttemptId)
- }
-
- committer.setupTask(taskAttemptContext)
-
- try {
- Utils.tryWithSafeFinallyAndFailureCallbacks(block = {
-
- val parserName = taskAttemptContext.getConfiguration.get(
- CarbonStreamParser.CARBON_STREAM_PARSER,
- CarbonStreamParser.CARBON_STREAM_PARSER_DEFAULT)
-
- val streamParser =
- Class.forName(parserName).newInstance.asInstanceOf[CarbonStreamParser]
- streamParser.initialize(taskAttemptContext.getConfiguration, rowSchema)
-
- StreamSegment.appendBatchData(new InputIterator(iterator, streamParser),
- taskAttemptContext, carbonLoadModel)
- })(catchBlock = {
- committer.abortTask(taskAttemptContext)
- LOGGER.error(s"Job $jobId aborted.")
- })
- committer.commitTask(taskAttemptContext)
- } catch {
- case t: Throwable =>
- throw new CarbonStreamException("Task failed while writing rows", t)
- }
- }
-
- /**
- * convert the spark iterator to a carbon iterator, so that the java module can use it.
- */
- class InputIterator(rddIter: Iterator[InternalRow], streamParser: CarbonStreamParser)
- extends CarbonIterator[Array[Object]] {
-
- override def hasNext: Boolean = rddIter.hasNext
-
- override def next: Array[Object] = {
- streamParser.parserRow(rddIter.next())
- }
-
- override def close(): Unit = {
- streamParser.close()
- }
- }
-
-}
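The handoff behaviour of the removed sink above is controlled by CarbonCommonConstants.HANDOFF_SIZE and CarbonCommonConstants.ENABLE_AUTO_HANDOFF, read from the Hadoop configuration with CarbonProperties as the fallback. A minimal sketch of setting them globally (the 1 GB value is only an example):

    import org.apache.carbondata.core.constants.CarbonCommonConstants
    import org.apache.carbondata.core.util.CarbonProperties

    // Hand a streaming segment off to columnar format once it reaches ~1 GB,
    // and let the sink trigger handoff automatically after closing a segment.
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.HANDOFF_SIZE, (1024L * 1024 * 1024).toString)
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.ENABLE_AUTO_HANDOFF, "true")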
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonStreamingQueryListener.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonStreamingQueryListener.scala b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonStreamingQueryListener.scala
deleted file mode 100644
index 2f911c5..0000000
--- a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonStreamingQueryListener.scala
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.streaming
-
-import java.util
-import java.util.UUID
-
-import org.apache.spark.SPARK_VERSION
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.streaming.StreamingQueryListener
-
-import org.apache.carbondata.common.logging.LogServiceFactory
-import org.apache.carbondata.core.locks.{CarbonLockFactory, ICarbonLock, LockUsage}
-
-class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {
-
- private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
-
- private val cache = new util.HashMap[UUID, ICarbonLock]()
-
- override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
- val streamQuery = spark.streams.get(event.id)
- val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
- // adapt spark 2.1
- streamQuery.asInstanceOf[StreamExecution]
- } else {
- // adapt spark 2.2 and later versions
- val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
- val method = clazz.getMethod("streamingQuery")
- method.invoke(streamQuery).asInstanceOf[StreamExecution]
- }
- if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
- LOGGER.info("Carbon streaming query started: " + event.id)
- val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
- val carbonTable = sink.carbonTable
- val lock = CarbonLockFactory.getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier,
- LockUsage.STREAMING_LOCK)
- if (lock.lockWithRetries()) {
- LOGGER.info("Acquired the lock for stream table: " + carbonTable.getDatabaseName + "." +
- carbonTable.getTableName)
- cache.put(event.id, lock)
- } else {
- LOGGER.error("Not able to acquire the lock for stream table:" +
- carbonTable.getDatabaseName + "." + carbonTable.getTableName)
- throw new InterruptedException(
- "Not able to acquire the lock for stream table: " + carbonTable.getDatabaseName + "." +
- carbonTable.getTableName)
- }
- }
- }
-
- override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
- }
-
- override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
- val lock = cache.remove(event.id)
- if (null != lock) {
- LOGGER.info("Carbon streaming query: " + event.id)
- lock.unlock()
- }
- }
-}
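For reference, the removed listener above guards a streaming table with LockUsage.STREAMING_LOCK for the lifetime of a query that writes through CarbonAppendableStreamSink. A hedged sketch of registering it manually (it is presumably wired up by the Carbon Spark integration, so manual registration is shown only for illustration):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.execution.streaming.CarbonStreamingQueryListener

    val spark = SparkSession.builder().appName("carbon-streaming").getOrCreate()

    // Acquires the streaming lock when a Carbon stream query starts and
    // releases it when the query terminates.
    spark.streams.addListener(new CarbonStreamingQueryListener(spark))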
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamInputFormatTest.java
----------------------------------------------------------------------
diff --git a/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamInputFormatTest.java b/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamInputFormatTest.java
new file mode 100644
index 0000000..a224446
--- /dev/null
+++ b/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamInputFormatTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
+import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
+import org.apache.carbondata.core.statusmanager.FileFormat;
+import org.apache.carbondata.hadoop.CarbonInputSplit;
+import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
+import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil;
+
+import junit.framework.TestCase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class CarbonStreamInputFormatTest extends TestCase {
+
+ private TaskAttemptID taskAttemptId;
+ private TaskAttemptContext taskAttemptContext;
+ private Configuration hadoopConf;
+ private AbsoluteTableIdentifier identifier;
+ private String tablePath;
+
+
+ @Override protected void setUp() throws Exception {
+ tablePath = new File("target/stream_input").getCanonicalPath();
+ String dbName = "default";
+ String tableName = "stream_table_input";
+ identifier = AbsoluteTableIdentifier.from(
+ tablePath,
+ new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
+
+ JobID jobId = CarbonInputFormatUtil.getJobId(new Date(), 0);
+ TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
+ taskAttemptId = new TaskAttemptID(taskId, 0);
+
+ hadoopConf = new Configuration();
+ taskAttemptContext = new TaskAttemptContextImpl(hadoopConf, taskAttemptId);
+ }
+
+ private InputSplit buildInputSplit() throws IOException {
+ CarbonInputSplit carbonInputSplit = new CarbonInputSplit();
+ List<CarbonInputSplit> splitList = new ArrayList<>();
+ splitList.add(carbonInputSplit);
+ return new CarbonMultiBlockSplit(splitList, new String[] { "localhost" },
+ FileFormat.ROW_V1);
+ }
+
+ @Test public void testCreateRecordReader() {
+ try {
+ InputSplit inputSplit = buildInputSplit();
+ CarbonStreamInputFormat inputFormat = new CarbonStreamInputFormat();
+ RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
+ Assert.assertNotNull("Failed to create record reader", recordReader);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.assertTrue(e.getMessage(), false);
+ }
+ }
+
+ @Override protected void tearDown() throws Exception {
+ super.tearDown();
+ if (tablePath != null) {
+ FileFactory.deleteAllFilesOfDir(new File(tablePath));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamOutputFormatTest.java
----------------------------------------------------------------------
diff --git a/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamOutputFormatTest.java b/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamOutputFormatTest.java
new file mode 100644
index 0000000..af79483
--- /dev/null
+++ b/streaming/src/test/java/org/apache/carbondata/streaming/CarbonStreamOutputFormatTest.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.UUID;
+
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
+import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
+import org.apache.carbondata.hadoop.testutil.StoreCreator;
+import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil;
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
+
+import junit.framework.TestCase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskID;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class CarbonStreamOutputFormatTest extends TestCase {
+
+ private Configuration hadoopConf;
+ private TaskAttemptID taskAttemptId;
+ private CarbonLoadModel carbonLoadModel;
+ private String tablePath;
+
+ @Override protected void setUp() throws Exception {
+ super.setUp();
+ JobID jobId = CarbonInputFormatUtil.getJobId(new Date(), 0);
+ TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
+ taskAttemptId = new TaskAttemptID(taskId, 0);
+
+ hadoopConf = new Configuration();
+ hadoopConf.set("mapred.job.id", jobId.toString());
+ hadoopConf.set("mapred.tip.id", taskAttemptId.getTaskID().toString());
+ hadoopConf.set("mapred.task.id", taskAttemptId.toString());
+ hadoopConf.setBoolean("mapred.task.is.map", true);
+ hadoopConf.setInt("mapred.task.partition", 0);
+
+ tablePath = new File("target/stream_output").getCanonicalPath();
+ String dbName = "default";
+ String tableName = "stream_table_output";
+ AbsoluteTableIdentifier identifier =
+ AbsoluteTableIdentifier.from(
+ tablePath,
+ new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
+
+ CarbonTable table = StoreCreator.createTable(identifier);
+
+ String factFilePath = new File("../hadoop/src/test/resources/data.csv").getCanonicalPath();
+ carbonLoadModel = StoreCreator.buildCarbonLoadModel(table, factFilePath, identifier);
+ }
+
+ @Test public void testSetCarbonLoadModel() {
+ try {
+ CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
+ } catch (IOException e) {
+ Assert.assertTrue("Failed to config CarbonLoadModel for CarbonStreamOutputFromat", false);
+ }
+ }
+
+ @Test public void testGetCarbonLoadModel() {
+ try {
+ CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
+ CarbonLoadModel model = CarbonStreamOutputFormat.getCarbonLoadModel(hadoopConf);
+
+ Assert.assertNotNull("Failed to get CarbonLoadModel", model);
+ Assert.assertTrue("CarbonLoadModel should be same with previous",
+ carbonLoadModel.getFactTimeStamp() == model.getFactTimeStamp());
+
+ } catch (IOException e) {
+ Assert.assertTrue("Failed to get CarbonLoadModel for CarbonStreamOutputFromat", false);
+ }
+ }
+
+ @Test public void testGetRecordWriter() {
+ CarbonStreamOutputFormat outputFormat = new CarbonStreamOutputFormat();
+ try {
+ CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
+ TaskAttemptContext taskAttemptContext =
+ new TaskAttemptContextImpl(hadoopConf, taskAttemptId);
+ RecordWriter recordWriter = outputFormat.getRecordWriter(taskAttemptContext);
+ Assert.assertNotNull("Failed to get CarbonStreamRecordWriter", recordWriter);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.assertTrue(e.getMessage(), false);
+ }
+ }
+
+ @Override protected void tearDown() throws Exception {
+ super.tearDown();
+ if (tablePath != null) {
+ FileFactory.deleteAllFilesOfDir(new File(tablePath));
+ }
+ }
+}
[4/5] carbondata git commit: [CARBONDATA-2165] Remove spark in carbon-hadoop module
Posted by ja...@apache.org.
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/testutil/StoreCreator.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/testutil/StoreCreator.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/testutil/StoreCreator.java
new file mode 100644
index 0000000..b2c2d39
--- /dev/null
+++ b/hadoop/src/main/java/org/apache/carbondata/hadoop/testutil/StoreCreator.java
@@ -0,0 +1,495 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.carbondata.hadoop.testutil;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import org.apache.carbondata.common.CarbonIterator;
+import org.apache.carbondata.common.logging.LogService;
+import org.apache.carbondata.common.logging.LogServiceFactory;
+import org.apache.carbondata.core.cache.Cache;
+import org.apache.carbondata.core.cache.CacheProvider;
+import org.apache.carbondata.core.cache.CacheType;
+import org.apache.carbondata.core.cache.dictionary.Dictionary;
+import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.datamap.DataMapStoreManager;
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.fileoperations.AtomicFileOperations;
+import org.apache.carbondata.core.fileoperations.AtomicFileOperationsImpl;
+import org.apache.carbondata.core.fileoperations.FileWriteOperation;
+import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
+import org.apache.carbondata.core.metadata.CarbonMetadata;
+import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
+import org.apache.carbondata.core.metadata.ColumnIdentifier;
+import org.apache.carbondata.core.metadata.converter.SchemaConverter;
+import org.apache.carbondata.core.metadata.converter.ThriftWrapperSchemaConverterImpl;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.encoder.Encoding;
+import org.apache.carbondata.core.metadata.schema.SchemaEvolution;
+import org.apache.carbondata.core.metadata.schema.SchemaEvolutionEntry;
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
+import org.apache.carbondata.core.metadata.schema.table.TableInfo;
+import org.apache.carbondata.core.metadata.schema.table.TableSchema;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
+import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
+import org.apache.carbondata.core.statusmanager.SegmentStatus;
+import org.apache.carbondata.core.util.CarbonProperties;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.core.util.path.CarbonTablePath;
+import org.apache.carbondata.core.writer.CarbonDictionaryWriter;
+import org.apache.carbondata.core.writer.CarbonDictionaryWriterImpl;
+import org.apache.carbondata.core.writer.ThriftWriter;
+import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWriter;
+import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWriterImpl;
+import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfo;
+import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfoPreparator;
+import org.apache.carbondata.processing.loading.DataLoadExecutor;
+import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants;
+import org.apache.carbondata.processing.loading.csvinput.BlockDetails;
+import org.apache.carbondata.processing.loading.csvinput.CSVInputFormat;
+import org.apache.carbondata.processing.loading.csvinput.CSVRecordReaderIterator;
+import org.apache.carbondata.processing.loading.csvinput.StringArrayWritable;
+import org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema;
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
+import org.apache.carbondata.processing.util.TableOptionConstant;
+
+import com.google.gson.Gson;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.TaskAttemptID;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+
+/**
+ * This class will create a store file based on the provided schema
+ *
+ */
+public class StoreCreator {
+
+ private static LogService LOG =
+ LogServiceFactory.getLogService(StoreCreator.class.getCanonicalName());
+ private static AbsoluteTableIdentifier absoluteTableIdentifier;
+ private static String storePath = null;
+
+ static {
+ storePath = new File("target/store").getAbsolutePath();
+ String dbName = "testdb";
+ String tableName = "testtable";
+ absoluteTableIdentifier = AbsoluteTableIdentifier.from(storePath + "/testdb/testtable",
+ new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
+ }
+
+ public static AbsoluteTableIdentifier getAbsoluteTableIdentifier() {
+ return absoluteTableIdentifier;
+ }
+
+ public static CarbonLoadModel buildCarbonLoadModel(CarbonTable table, String factFilePath,
+ AbsoluteTableIdentifier absoluteTableIdentifier) {
+ CarbonDataLoadSchema schema = new CarbonDataLoadSchema(table);
+ CarbonLoadModel loadModel = new CarbonLoadModel();
+ loadModel.setCarbonDataLoadSchema(schema);
+ loadModel.setDatabaseName(absoluteTableIdentifier.getCarbonTableIdentifier().getDatabaseName());
+ loadModel.setTableName(absoluteTableIdentifier.getCarbonTableIdentifier().getTableName());
+ loadModel.setTableName(absoluteTableIdentifier.getCarbonTableIdentifier().getTableName());
+ loadModel.setFactFilePath(factFilePath);
+ loadModel.setLoadMetadataDetails(new ArrayList<LoadMetadataDetails>());
+ loadModel.setTablePath(absoluteTableIdentifier.getTablePath());
+ loadModel.setDateFormat(null);
+ loadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(
+ CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
+ CarbonCommonConstants.CARBON_TIMESTAMP_MILLIS));
+ loadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(
+ CarbonCommonConstants.CARBON_DATE_FORMAT,
+ CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
+ loadModel
+ .setSerializationNullFormat(
+ TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + "," + "\\N");
+ loadModel
+ .setBadRecordsLoggerEnable(
+ TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + "false");
+ loadModel
+ .setBadRecordsAction(
+ TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + "FORCE");
+ loadModel
+ .setIsEmptyDataBadRecord(
+ DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + "," + "false");
+ loadModel.setCsvHeader("ID,date,country,name,phonetype,serialname,salary");
+ loadModel.setCsvHeaderColumns(loadModel.getCsvHeader().split(","));
+ loadModel.setTaskNo("0");
+ loadModel.setSegmentId("0");
+ loadModel.setFactTimeStamp(System.currentTimeMillis());
+ loadModel.setMaxColumns("10");
+ return loadModel;
+ }
+
+ /**
+ * Create store without any restructure
+ */
+ public static void createCarbonStore() throws Exception {
+ CarbonLoadModel loadModel = createTableAndLoadModel();
+ loadData(loadModel, storePath);
+ }
+
+ /**
+ * Method to clear the data maps
+ */
+ public static void clearDataMaps() {
+ DataMapStoreManager.getInstance().clearDataMaps(absoluteTableIdentifier);
+ }
+
+ public static CarbonLoadModel createTableAndLoadModel() throws Exception {
+ String factFilePath =
+ new File("../hadoop/src/test/resources/data.csv").getCanonicalPath();
+ File storeDir = new File(storePath);
+ CarbonUtil.deleteFoldersAndFiles(storeDir);
+ CarbonProperties.getInstance().addProperty(CarbonCommonConstants.STORE_LOCATION_HDFS,
+ storePath);
+
+ CarbonTable table = createTable(absoluteTableIdentifier);
+ writeDictionary(factFilePath, table);
+ return buildCarbonLoadModel(table, factFilePath, absoluteTableIdentifier);
+ }
+
+ public static CarbonTable createTable(
+ AbsoluteTableIdentifier identifier) throws IOException {
+ TableInfo tableInfo = new TableInfo();
+ tableInfo.setDatabaseName(identifier.getCarbonTableIdentifier().getDatabaseName());
+ TableSchema tableSchema = new TableSchema();
+ tableSchema.setTableName(identifier.getCarbonTableIdentifier().getTableName());
+ List<ColumnSchema> columnSchemas = new ArrayList<ColumnSchema>();
+ ArrayList<Encoding> encodings = new ArrayList<>();
+ encodings.add(Encoding.DICTIONARY);
+ ColumnSchema id = new ColumnSchema();
+ id.setColumnName("ID");
+ id.setColumnar(true);
+ id.setDataType(DataTypes.INT);
+ id.setEncodingList(encodings);
+ id.setColumnUniqueId(UUID.randomUUID().toString());
+ id.setColumnReferenceId(id.getColumnUniqueId());
+ id.setDimensionColumn(true);
+ id.setColumnGroup(1);
+ columnSchemas.add(id);
+
+ ColumnSchema date = new ColumnSchema();
+ date.setColumnName("date");
+ date.setColumnar(true);
+ date.setDataType(DataTypes.STRING);
+ date.setEncodingList(encodings);
+ date.setColumnUniqueId(UUID.randomUUID().toString());
+ date.setDimensionColumn(true);
+ date.setColumnGroup(2);
+ date.setSortColumn(true);
+ date.setColumnReferenceId(id.getColumnUniqueId());
+ columnSchemas.add(date);
+
+ ColumnSchema country = new ColumnSchema();
+ country.setColumnName("country");
+ country.setColumnar(true);
+ country.setDataType(DataTypes.STRING);
+ country.setEncodingList(encodings);
+ country.setColumnUniqueId(UUID.randomUUID().toString());
+ country.setDimensionColumn(true);
+ country.setColumnGroup(3);
+ country.setSortColumn(true);
+ country.setColumnReferenceId(id.getColumnUniqueId());
+ columnSchemas.add(country);
+
+ ColumnSchema name = new ColumnSchema();
+ name.setColumnName("name");
+ name.setColumnar(true);
+ name.setDataType(DataTypes.STRING);
+ name.setEncodingList(encodings);
+ name.setColumnUniqueId(UUID.randomUUID().toString());
+ name.setDimensionColumn(true);
+ name.setColumnGroup(4);
+ name.setSortColumn(true);
+ name.setColumnReferenceId(id.getColumnUniqueId());
+ columnSchemas.add(name);
+
+ ColumnSchema phonetype = new ColumnSchema();
+ phonetype.setColumnName("phonetype");
+ phonetype.setColumnar(true);
+ phonetype.setDataType(DataTypes.STRING);
+ phonetype.setEncodingList(encodings);
+ phonetype.setColumnUniqueId(UUID.randomUUID().toString());
+ phonetype.setDimensionColumn(true);
+ phonetype.setColumnGroup(5);
+ phonetype.setSortColumn(true);
+ phonetype.setColumnReferenceId(id.getColumnUniqueId());
+ columnSchemas.add(phonetype);
+
+ ColumnSchema serialname = new ColumnSchema();
+ serialname.setColumnName("serialname");
+ serialname.setColumnar(true);
+ serialname.setDataType(DataTypes.STRING);
+ serialname.setEncodingList(encodings);
+ serialname.setColumnUniqueId(UUID.randomUUID().toString());
+ serialname.setDimensionColumn(true);
+ serialname.setColumnGroup(6);
+ serialname.setSortColumn(true);
+ serialname.setColumnReferenceId(id.getColumnUniqueId());
+ columnSchemas.add(serialname);
+
+ ColumnSchema salary = new ColumnSchema();
+ salary.setColumnName("salary");
+ salary.setColumnar(true);
+ salary.setDataType(DataTypes.INT);
+ salary.setEncodingList(new ArrayList<Encoding>());
+ salary.setColumnUniqueId(UUID.randomUUID().toString());
+ salary.setDimensionColumn(false);
+ salary.setColumnReferenceId(id.getColumnUniqueId());
+ salary.setColumnGroup(7);
+ columnSchemas.add(salary);
+
+ tableSchema.setListOfColumns(columnSchemas);
+ SchemaEvolution schemaEvol = new SchemaEvolution();
+ schemaEvol.setSchemaEvolutionEntryList(new ArrayList<SchemaEvolutionEntry>());
+ tableSchema.setSchemaEvalution(schemaEvol);
+ tableSchema.setTableId(UUID.randomUUID().toString());
+ tableInfo.setTableUniqueName(
+ identifier.getCarbonTableIdentifier().getTableUniqueName()
+ );
+ tableInfo.setLastUpdatedTime(System.currentTimeMillis());
+ tableInfo.setFactTable(tableSchema);
+ tableInfo.setTablePath(identifier.getTablePath());
+ String schemaFilePath = CarbonTablePath.getSchemaFilePath(identifier.getTablePath());
+ String schemaMetadataPath = CarbonTablePath.getFolderContainingFile(schemaFilePath);
+ CarbonMetadata.getInstance().loadTableMetadata(tableInfo);
+
+ SchemaConverter schemaConverter = new ThriftWrapperSchemaConverterImpl();
+ org.apache.carbondata.format.TableInfo thriftTableInfo =
+ schemaConverter.fromWrapperToExternalTableInfo(
+ tableInfo,
+ tableInfo.getDatabaseName(),
+ tableInfo.getFactTable().getTableName());
+ org.apache.carbondata.format.SchemaEvolutionEntry schemaEvolutionEntry =
+ new org.apache.carbondata.format.SchemaEvolutionEntry(tableInfo.getLastUpdatedTime());
+ thriftTableInfo.getFact_table().getSchema_evolution().getSchema_evolution_history()
+ .add(schemaEvolutionEntry);
+
+ FileFactory.FileType fileType = FileFactory.getFileType(schemaMetadataPath);
+ if (!FileFactory.isFileExist(schemaMetadataPath, fileType)) {
+ FileFactory.mkdirs(schemaMetadataPath, fileType);
+ }
+
+ ThriftWriter thriftWriter = new ThriftWriter(schemaFilePath, false);
+ thriftWriter.open();
+ thriftWriter.write(thriftTableInfo);
+ thriftWriter.close();
+ return CarbonMetadata.getInstance().getCarbonTable(tableInfo.getTableUniqueName());
+ }
+
+ private static void writeDictionary(String factFilePath, CarbonTable table) throws Exception {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(factFilePath), "UTF-8"));
+ List<CarbonDimension> dims = table.getDimensionByTableName(table.getTableName());
+ Set<String>[] set = new HashSet[dims.size()];
+ for (int i = 0; i < set.length; i++) {
+ set[i] = new HashSet<String>();
+ }
+ String line = reader.readLine();
+ while (line != null) {
+ String[] data = line.split(",");
+ for (int i = 0; i < set.length; i++) {
+ set[i].add(data[i]);
+ }
+ line = reader.readLine();
+ }
+
+ Cache dictCache = CacheProvider.getInstance()
+ .createCache(CacheType.REVERSE_DICTIONARY);
+ for (int i = 0; i < set.length; i++) {
+ ColumnIdentifier columnIdentifier =
+ new ColumnIdentifier(dims.get(i).getColumnId(), null, null);
+ DictionaryColumnUniqueIdentifier dictionaryColumnUniqueIdentifier =
+ new DictionaryColumnUniqueIdentifier(
+ table.getAbsoluteTableIdentifier(), columnIdentifier, columnIdentifier.getDataType());
+ CarbonDictionaryWriter writer =
+ new CarbonDictionaryWriterImpl(dictionaryColumnUniqueIdentifier);
+ for (String value : set[i]) {
+ writer.write(value);
+ }
+ writer.close();
+ writer.commit();
+ Dictionary dict = (Dictionary) dictCache.get(
+ new DictionaryColumnUniqueIdentifier(absoluteTableIdentifier,
+ columnIdentifier, dims.get(i).getDataType()));
+ CarbonDictionarySortInfoPreparator preparator =
+ new CarbonDictionarySortInfoPreparator();
+ List<String> newDistinctValues = new ArrayList<String>();
+ CarbonDictionarySortInfo dictionarySortInfo =
+ preparator.getDictionarySortInfo(newDistinctValues, dict, dims.get(i).getDataType());
+ CarbonDictionarySortIndexWriter carbonDictionaryWriter =
+ new CarbonDictionarySortIndexWriterImpl(dictionaryColumnUniqueIdentifier);
+ try {
+ carbonDictionaryWriter.writeSortIndex(dictionarySortInfo.getSortIndex());
+ carbonDictionaryWriter.writeInvertedSortIndex(dictionarySortInfo.getSortIndexInverted());
+ } finally {
+ carbonDictionaryWriter.close();
+ }
+ }
+ reader.close();
+ }
+
+ /**
+ * Executes the data load for the given load model, writing the result under the given store location.
+ *
+ * @param loadModel load model describing the table, input CSV and load options
+ * @param storeLocation target store directory for the loaded data
+ * @throws Exception if any step of the load fails
+ */
+ public static void loadData(CarbonLoadModel loadModel, String storeLocation)
+ throws Exception {
+ if (!new File(storeLocation).mkdirs()) {
+ LOG.warn("failed to create store location " + storeLocation);
+ }
+ String outPutLoc = storeLocation + "/etl";
+ String databaseName = loadModel.getDatabaseName();
+ String tableName = loadModel.getTableName();
+ String tempLocationKey = databaseName + '_' + tableName + "_1";
+ CarbonProperties.getInstance().addProperty(
+ tempLocationKey, storeLocation + "/" + databaseName + "/" + tableName);
+ CarbonProperties.getInstance().addProperty("store_output_location", outPutLoc);
+ CarbonProperties.getInstance().addProperty("send.signal.load", "false");
+ CarbonProperties.getInstance().addProperty("carbon.is.columnar.storage", "true");
+ CarbonProperties.getInstance().addProperty("carbon.dimension.split.value.in.columnar", "1");
+ CarbonProperties.getInstance().addProperty("carbon.is.fullyfilled.bits", "true");
+ CarbonProperties.getInstance().addProperty("is.int.based.indexer", "true");
+ CarbonProperties.getInstance().addProperty("aggregate.columnar.keyblock", "true");
+ CarbonProperties.getInstance().addProperty("is.compressed.keyblock", "false");
+ CarbonProperties.getInstance().addProperty("carbon.leaf.node.size", "120000");
+
+ String graphPath =
+ outPutLoc + File.separator + loadModel.getDatabaseName() + File.separator + tableName
+ + File.separator + 0 + File.separator + 1 + File.separator + tableName + ".ktr";
+ File path = new File(graphPath);
+ if (path.exists()) {
+ if (!path.delete()) {
+ LOG.warn("delete " + path + " failed");
+ }
+ }
+
+ BlockDetails blockDetails = new BlockDetails(new Path(loadModel.getFactFilePath()),
+ 0, new File(loadModel.getFactFilePath()).length(), new String[] {"localhost"});
+ Configuration configuration = new Configuration();
+ CSVInputFormat.setCommentCharacter(configuration, loadModel.getCommentChar());
+ CSVInputFormat.setCSVDelimiter(configuration, loadModel.getCsvDelimiter());
+ CSVInputFormat.setEscapeCharacter(configuration, loadModel.getEscapeChar());
+ CSVInputFormat.setHeaderExtractionEnabled(configuration, true);
+ CSVInputFormat.setQuoteCharacter(configuration, loadModel.getQuoteChar());
+ CSVInputFormat.setReadBufferSize(configuration,
+ CarbonProperties.getInstance().getProperty(
+ CarbonCommonConstants.CSV_READ_BUFFER_SIZE,
+ CarbonCommonConstants.CSV_READ_BUFFER_SIZE_DEFAULT));
+ CSVInputFormat.setNumberOfColumns(
+ configuration, String.valueOf(loadModel.getCsvHeaderColumns().length));
+ CSVInputFormat.setMaxColumns(configuration, "10");
+
+ TaskAttemptContextImpl hadoopAttemptContext =
+ new TaskAttemptContextImpl(configuration, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
+ CSVInputFormat format = new CSVInputFormat();
+
+ RecordReader<NullWritable, StringArrayWritable> recordReader =
+ format.createRecordReader(blockDetails, hadoopAttemptContext);
+
+ CSVRecordReaderIterator readerIterator =
+ new CSVRecordReaderIterator(recordReader, blockDetails, hadoopAttemptContext);
+ new DataLoadExecutor().execute(loadModel,
+ new String[] {storeLocation + "/" + databaseName + "/" + tableName},
+ new CarbonIterator[]{readerIterator});
+
+ writeLoadMetadata(
+ loadModel.getCarbonDataLoadSchema(), loadModel.getTableName(), loadModel.getTableName(),
+ new ArrayList<LoadMetadataDetails>());
+ }
+
+ public static void writeLoadMetadata(CarbonDataLoadSchema schema, String databaseName,
+ String tableName, List<LoadMetadataDetails> listOfLoadFolderDetails) throws IOException {
+ LoadMetadataDetails loadMetadataDetails = new LoadMetadataDetails();
+ loadMetadataDetails.setLoadEndTime(System.currentTimeMillis());
+ loadMetadataDetails.setSegmentStatus(SegmentStatus.SUCCESS);
+ loadMetadataDetails.setLoadName(String.valueOf(0));
+ loadMetadataDetails.setLoadStartTime(loadMetadataDetails.getTimeStamp(readCurrentTime()));
+ listOfLoadFolderDetails.add(loadMetadataDetails);
+
+ String dataLoadLocation = schema.getCarbonTable().getMetadataPath() + File.separator
+ + CarbonTablePath.TABLE_STATUS_FILE;
+
+ DataOutputStream dataOutputStream;
+ Gson gsonObjectToWrite = new Gson();
+ BufferedWriter brWriter = null;
+
+ AtomicFileOperations writeOperation =
+ new AtomicFileOperationsImpl(dataLoadLocation, FileFactory.getFileType(dataLoadLocation));
+
+ try {
+
+ dataOutputStream = writeOperation.openForWrite(FileWriteOperation.OVERWRITE);
+ brWriter = new BufferedWriter(new OutputStreamWriter(dataOutputStream,
+ Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET)));
+
+ String metadataInstance = gsonObjectToWrite.toJson(listOfLoadFolderDetails.toArray());
+ brWriter.write(metadataInstance);
+ } finally {
+ try {
+ if (null != brWriter) {
+ brWriter.flush();
+ }
+ } catch (Exception e) {
+ throw e;
+
+ }
+ CarbonUtil.closeStreams(brWriter);
+
+ }
+ writeOperation.close();
+
+ }
+
+ public static String readCurrentTime() {
+ SimpleDateFormat sdf = new SimpleDateFormat(CarbonCommonConstants.CARBON_TIMESTAMP_MILLIS);
+ String date = null;
+
+ date = sdf.format(new Date());
+
+ return date;
+ }
+
+ public static void main(String[] args) throws Exception {
+ StoreCreator.createCarbonStore();
+ }
+
+}
\ No newline at end of file
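For reference, the relocated StoreCreator is driven the same way as the class it replaces: a test creates the store once and then queries the generated table. A minimal sketch, assuming the public helpers createCarbonStore() and getAbsoluteTableIdentifier() are kept from the old class (as the rest of this diff suggests):

import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.hadoop.testutil.StoreCreator;

public class StoreCreatorUsageSketch {
  public static void main(String[] args) throws Exception {
    // Creates the test schema, writes the dictionary and loads the sample CSV,
    // exactly as StoreCreator.main() does above.
    StoreCreator.createCarbonStore();

    // The identifier exposes the table path and database/table names for tests to query.
    AbsoluteTableIdentifier identifier = StoreCreator.getAbsoluteTableIdentifier();
    System.out.println("test store created at " + identifier.getTablePath());
  }
}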
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonTypeUtil.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonTypeUtil.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonTypeUtil.java
deleted file mode 100644
index 395015e..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonTypeUtil.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.util;
-
-import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryGenerator;
-import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryKeyGeneratorFactory;
-import org.apache.carbondata.core.metadata.datatype.DataType;
-import org.apache.carbondata.core.metadata.encoder.Encoding;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
-
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.DecimalType;
-import org.apache.spark.sql.types.StructField;
-
-public class CarbonTypeUtil {
-
- public static org.apache.spark.sql.types.DataType convertCarbonToSparkDataType(
- DataType carbonDataType) {
- if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.STRING) {
- return DataTypes.StringType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.SHORT) {
- return DataTypes.ShortType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.INT) {
- return DataTypes.IntegerType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.LONG) {
- return DataTypes.LongType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.DOUBLE) {
- return DataTypes.DoubleType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.BOOLEAN) {
- return DataTypes.BooleanType;
- } else if (org.apache.carbondata.core.metadata.datatype.DataTypes.isDecimal(carbonDataType)) {
- return DataTypes.createDecimalType();
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.TIMESTAMP) {
- return DataTypes.TimestampType;
- } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.DATE) {
- return DataTypes.DateType;
- } else {
- return null;
- }
- }
-
- public static StructField[] convertCarbonSchemaToSparkSchema(CarbonColumn[] carbonColumns) {
- StructField[] fields = new StructField[carbonColumns.length];
- for (int i = 0; i < carbonColumns.length; i++) {
- CarbonColumn carbonColumn = carbonColumns[i];
- if (carbonColumn.isDimension()) {
- if (carbonColumn.hasEncoding(Encoding.DIRECT_DICTIONARY)) {
- DirectDictionaryGenerator generator = DirectDictionaryKeyGeneratorFactory
- .getDirectDictionaryGenerator(carbonColumn.getDataType());
- fields[i] = new StructField(carbonColumn.getColName(),
- CarbonTypeUtil.convertCarbonToSparkDataType(generator.getReturnType()), true, null);
- } else if (!carbonColumn.hasEncoding(Encoding.DICTIONARY)) {
- fields[i] = new StructField(carbonColumn.getColName(),
- CarbonTypeUtil.convertCarbonToSparkDataType(carbonColumn.getDataType()), true, null);
- } else if (carbonColumn.isComplex()) {
- fields[i] = new StructField(carbonColumn.getColName(),
- CarbonTypeUtil.convertCarbonToSparkDataType(carbonColumn.getDataType()), true, null);
- } else {
- fields[i] = new StructField(carbonColumn.getColName(), CarbonTypeUtil
- .convertCarbonToSparkDataType(
- org.apache.carbondata.core.metadata.datatype.DataTypes.INT), true, null);
- }
- } else if (carbonColumn.isMeasure()) {
- DataType dataType = carbonColumn.getDataType();
- if (dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.BOOLEAN
- || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.SHORT
- || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.INT
- || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.LONG) {
- fields[i] = new StructField(carbonColumn.getColName(),
- CarbonTypeUtil.convertCarbonToSparkDataType(dataType), true, null);
- } else if (org.apache.carbondata.core.metadata.datatype.DataTypes.isDecimal(dataType)) {
- CarbonMeasure measure = (CarbonMeasure) carbonColumn;
- fields[i] = new StructField(carbonColumn.getColName(),
- new DecimalType(measure.getPrecision(), measure.getScale()), true, null);
- } else {
- fields[i] = new StructField(carbonColumn.getColName(), CarbonTypeUtil
- .convertCarbonToSparkDataType(
- org.apache.carbondata.core.metadata.datatype.DataTypes.DOUBLE), true, null);
- }
- }
- }
- return fields;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableInputFormatTest.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableInputFormatTest.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableInputFormatTest.java
index 2f029ab..ea242d1 100644
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableInputFormatTest.java
+++ b/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableInputFormatTest.java
@@ -27,8 +27,6 @@ import java.io.IOException;
import java.util.List;
import java.util.UUID;
-import junit.framework.TestCase;
-
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
@@ -41,7 +39,7 @@ import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.hadoop.CarbonProjection;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
-import org.apache.carbondata.hadoop.test.util.StoreCreator;
+import org.apache.carbondata.hadoop.testutil.StoreCreator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
@@ -52,9 +50,7 @@ import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.junit.After;
import org.junit.Assert;
-import org.junit.Before;
import org.junit.Test;
public class CarbonTableInputFormatTest {
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableOutputFormatTest.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableOutputFormatTest.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableOutputFormatTest.java
index 653a49e..99f69c2 100644
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableOutputFormatTest.java
+++ b/hadoop/src/test/java/org/apache/carbondata/hadoop/ft/CarbonTableOutputFormatTest.java
@@ -26,7 +26,7 @@ import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.hadoop.api.CarbonTableOutputFormat;
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable;
-import org.apache.carbondata.hadoop.test.util.StoreCreator;
+import org.apache.carbondata.hadoop.testutil.StoreCreator;
import org.apache.carbondata.processing.loading.csvinput.CSVInputFormat;
import org.apache.carbondata.processing.loading.csvinput.StringArrayWritable;
import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormatTest.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormatTest.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormatTest.java
deleted file mode 100644
index 57f488f..0000000
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormatTest.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.List;
-import java.util.UUID;
-
-import org.apache.carbondata.core.datastore.impl.FileFactory;
-import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
-import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
-import org.apache.carbondata.core.statusmanager.FileFormat;
-import org.apache.carbondata.hadoop.CarbonInputSplit;
-import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
-import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil;
-
-import junit.framework.TestCase;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobID;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.TaskID;
-import org.apache.hadoop.mapreduce.TaskType;
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class CarbonStreamInputFormatTest extends TestCase {
-
- private TaskAttemptID taskAttemptId;
- private TaskAttemptContext taskAttemptContext;
- private Configuration hadoopConf;
- private AbsoluteTableIdentifier identifier;
- private String tablePath;
-
-
- @Override protected void setUp() throws Exception {
- tablePath = new File("target/stream_input").getCanonicalPath();
- String dbName = "default";
- String tableName = "stream_table_input";
- identifier = AbsoluteTableIdentifier.from(
- tablePath,
- new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
-
- JobID jobId = CarbonInputFormatUtil.getJobId(new Date(), 0);
- TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
- taskAttemptId = new TaskAttemptID(taskId, 0);
-
- hadoopConf = new Configuration();
- taskAttemptContext = new TaskAttemptContextImpl(hadoopConf, taskAttemptId);
- }
-
- private InputSplit buildInputSplit() throws IOException {
- CarbonInputSplit carbonInputSplit = new CarbonInputSplit();
- List<CarbonInputSplit> splitList = new ArrayList<>();
- splitList.add(carbonInputSplit);
- return new CarbonMultiBlockSplit(splitList, new String[] { "localhost" },
- FileFormat.ROW_V1);
- }
-
- @Test public void testCreateRecordReader() {
- try {
- InputSplit inputSplit = buildInputSplit();
- CarbonStreamInputFormat inputFormat = new CarbonStreamInputFormat();
- RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
- Assert.assertNotNull("Failed to create record reader", recordReader);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.assertTrue(e.getMessage(), false);
- }
- }
-
- @Override protected void tearDown() throws Exception {
- super.tearDown();
- if (tablePath != null) {
- FileFactory.deleteAllFilesOfDir(new File(tablePath));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormatTest.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormatTest.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormatTest.java
deleted file mode 100644
index e871c7e..0000000
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormatTest.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Date;
-import java.util.UUID;
-
-import org.apache.carbondata.core.datastore.impl.FileFactory;
-import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
-import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.hadoop.test.util.StoreCreator;
-import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil;
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
-
-import junit.framework.TestCase;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.JobID;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.TaskID;
-import org.apache.hadoop.mapreduce.TaskType;
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class CarbonStreamOutputFormatTest extends TestCase {
-
- private Configuration hadoopConf;
- private TaskAttemptID taskAttemptId;
- private CarbonLoadModel carbonLoadModel;
- private String tablePath;
-
- @Override protected void setUp() throws Exception {
- super.setUp();
- JobID jobId = CarbonInputFormatUtil.getJobId(new Date(), 0);
- TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
- taskAttemptId = new TaskAttemptID(taskId, 0);
-
- hadoopConf = new Configuration();
- hadoopConf.set("mapred.job.id", jobId.toString());
- hadoopConf.set("mapred.tip.id", taskAttemptId.getTaskID().toString());
- hadoopConf.set("mapred.task.id", taskAttemptId.toString());
- hadoopConf.setBoolean("mapred.task.is.map", true);
- hadoopConf.setInt("mapred.task.partition", 0);
-
- tablePath = new File("target/stream_output").getCanonicalPath();
- String dbName = "default";
- String tableName = "stream_table_output";
- AbsoluteTableIdentifier identifier =
- AbsoluteTableIdentifier.from(
- tablePath,
- new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
-
- CarbonTable table = StoreCreator.createTable(identifier);
-
- String factFilePath = new File("../hadoop/src/test/resources/data.csv").getCanonicalPath();
- carbonLoadModel = StoreCreator.buildCarbonLoadModel(table, factFilePath, identifier);
- }
-
- @Test public void testSetCarbonLoadModel() {
- try {
- CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
- } catch (IOException e) {
- Assert.assertTrue("Failed to config CarbonLoadModel for CarbonStreamOutputFromat", false);
- }
- }
-
- @Test public void testGetCarbonLoadModel() {
- try {
- CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
- CarbonLoadModel model = CarbonStreamOutputFormat.getCarbonLoadModel(hadoopConf);
-
- Assert.assertNotNull("Failed to get CarbonLoadModel", model);
- Assert.assertTrue("CarbonLoadModel should be same with previous",
- carbonLoadModel.getFactTimeStamp() == model.getFactTimeStamp());
-
- } catch (IOException e) {
- Assert.assertTrue("Failed to get CarbonLoadModel for CarbonStreamOutputFromat", false);
- }
- }
-
- @Test public void testGetRecordWriter() {
- CarbonStreamOutputFormat outputFormat = new CarbonStreamOutputFormat();
- try {
- CarbonStreamOutputFormat.setCarbonLoadModel(hadoopConf, carbonLoadModel);
- TaskAttemptContext taskAttemptContext =
- new TaskAttemptContextImpl(hadoopConf, taskAttemptId);
- RecordWriter recordWriter = outputFormat.getRecordWriter(taskAttemptContext);
- Assert.assertNotNull("Failed to get CarbonStreamRecordWriter", recordWriter);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.assertTrue(e.getMessage(), false);
- }
- }
-
- @Override protected void tearDown() throws Exception {
- super.tearDown();
- if (tablePath != null) {
- FileFactory.deleteAllFilesOfDir(new File(tablePath));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
deleted file mode 100644
index 8e8916d..0000000
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.carbondata.hadoop.test.util;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.nio.charset.Charset;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.UUID;
-
-import org.apache.carbondata.common.CarbonIterator;
-import org.apache.carbondata.core.cache.Cache;
-import org.apache.carbondata.core.cache.CacheProvider;
-import org.apache.carbondata.core.cache.CacheType;
-import org.apache.carbondata.core.cache.dictionary.Dictionary;
-import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
-import org.apache.carbondata.core.constants.CarbonCommonConstants;
-import org.apache.carbondata.core.datamap.DataMapStoreManager;
-import org.apache.carbondata.core.datastore.impl.FileFactory;
-import org.apache.carbondata.core.fileoperations.AtomicFileOperations;
-import org.apache.carbondata.core.fileoperations.AtomicFileOperationsImpl;
-import org.apache.carbondata.core.fileoperations.FileWriteOperation;
-import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
-import org.apache.carbondata.core.metadata.CarbonMetadata;
-import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
-import org.apache.carbondata.core.metadata.ColumnIdentifier;
-import org.apache.carbondata.core.metadata.converter.SchemaConverter;
-import org.apache.carbondata.core.metadata.converter.ThriftWrapperSchemaConverterImpl;
-import org.apache.carbondata.core.metadata.datatype.DataTypes;
-import org.apache.carbondata.core.metadata.encoder.Encoding;
-import org.apache.carbondata.core.metadata.schema.SchemaEvolution;
-import org.apache.carbondata.core.metadata.schema.SchemaEvolutionEntry;
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.core.metadata.schema.table.TableInfo;
-import org.apache.carbondata.core.metadata.schema.table.TableSchema;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
-import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
-import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
-import org.apache.carbondata.core.statusmanager.SegmentStatus;
-import org.apache.carbondata.core.util.CarbonProperties;
-import org.apache.carbondata.core.util.CarbonUtil;
-import org.apache.carbondata.core.util.path.CarbonTablePath;
-import org.apache.carbondata.core.writer.CarbonDictionaryWriter;
-import org.apache.carbondata.core.writer.CarbonDictionaryWriterImpl;
-import org.apache.carbondata.core.writer.ThriftWriter;
-import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWriter;
-import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWriterImpl;
-import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfo;
-import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfoPreparator;
-import org.apache.carbondata.processing.loading.DataLoadExecutor;
-import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants;
-import org.apache.carbondata.processing.loading.csvinput.BlockDetails;
-import org.apache.carbondata.processing.loading.csvinput.CSVInputFormat;
-import org.apache.carbondata.processing.loading.csvinput.CSVRecordReaderIterator;
-import org.apache.carbondata.processing.loading.csvinput.StringArrayWritable;
-import org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema;
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
-import org.apache.carbondata.processing.util.TableOptionConstant;
-
-import com.google.gson.Gson;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapred.TaskAttemptID;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskType;
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
-
-/**
- * This class will create store file based on provided schema
- *
- */
-public class StoreCreator {
-
- private static AbsoluteTableIdentifier absoluteTableIdentifier;
- private static String storePath = null;
-
- static {
- try {
- storePath = new File("target/store").getCanonicalPath();
- String dbName = "testdb";
- String tableName = "testtable";
- absoluteTableIdentifier =
- AbsoluteTableIdentifier.from(
- storePath +"/testdb/testtable",
- new CarbonTableIdentifier(dbName, tableName, UUID.randomUUID().toString()));
- } catch (IOException ex) {
-
- }
- }
-
- public static AbsoluteTableIdentifier getAbsoluteTableIdentifier() {
- return absoluteTableIdentifier;
- }
-
- public static CarbonLoadModel buildCarbonLoadModel(CarbonTable table, String factFilePath,
- AbsoluteTableIdentifier absoluteTableIdentifier) {
- CarbonDataLoadSchema schema = new CarbonDataLoadSchema(table);
- CarbonLoadModel loadModel = new CarbonLoadModel();
- loadModel.setCarbonDataLoadSchema(schema);
- loadModel.setDatabaseName(absoluteTableIdentifier.getCarbonTableIdentifier().getDatabaseName());
- loadModel.setTableName(absoluteTableIdentifier.getCarbonTableIdentifier().getTableName());
- loadModel.setTableName(absoluteTableIdentifier.getCarbonTableIdentifier().getTableName());
- loadModel.setFactFilePath(factFilePath);
- loadModel.setLoadMetadataDetails(new ArrayList<LoadMetadataDetails>());
- loadModel.setTablePath(absoluteTableIdentifier.getTablePath());
- loadModel.setDateFormat(null);
- loadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(
- CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
- CarbonCommonConstants.CARBON_TIMESTAMP_MILLIS));
- loadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(
- CarbonCommonConstants.CARBON_DATE_FORMAT,
- CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
- loadModel
- .setSerializationNullFormat(
- TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + "," + "\\N");
- loadModel
- .setBadRecordsLoggerEnable(
- TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + "false");
- loadModel
- .setBadRecordsAction(
- TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + "FORCE");
- loadModel
- .setIsEmptyDataBadRecord(
- DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + "," + "false");
- loadModel.setCsvHeader("ID,date,country,name,phonetype,serialname,salary");
- loadModel.setCsvHeaderColumns(loadModel.getCsvHeader().split(","));
- loadModel.setTaskNo("0");
- loadModel.setSegmentId("0");
- loadModel.setFactTimeStamp(System.currentTimeMillis());
- loadModel.setMaxColumns("10");
- return loadModel;
- }
-
- /**
- * Create store without any restructure
- */
- public static void createCarbonStore() throws Exception {
- CarbonLoadModel loadModel = createTableAndLoadModel();
- loadData(loadModel, storePath);
- }
-
- /**
- * Method to clear the data maps
- */
- public static void clearDataMaps() {
- DataMapStoreManager.getInstance().clearDataMaps(absoluteTableIdentifier);
- }
-
- public static CarbonLoadModel createTableAndLoadModel() throws Exception {
- String factFilePath =
- new File("../hadoop/src/test/resources/data.csv").getCanonicalPath();
- File storeDir = new File(storePath);
- CarbonUtil.deleteFoldersAndFiles(storeDir);
- CarbonProperties.getInstance().addProperty(CarbonCommonConstants.STORE_LOCATION_HDFS,
- storePath);
-
- CarbonTable table = createTable(absoluteTableIdentifier);
- writeDictionary(factFilePath, table);
- return buildCarbonLoadModel(table, factFilePath, absoluteTableIdentifier);
- }
-
- public static CarbonTable createTable(
- AbsoluteTableIdentifier identifier) throws IOException {
- TableInfo tableInfo = new TableInfo();
- tableInfo.setDatabaseName(identifier.getCarbonTableIdentifier().getDatabaseName());
- TableSchema tableSchema = new TableSchema();
- tableSchema.setTableName(identifier.getCarbonTableIdentifier().getTableName());
- List<ColumnSchema> columnSchemas = new ArrayList<ColumnSchema>();
- ArrayList<Encoding> encodings = new ArrayList<>();
- encodings.add(Encoding.DICTIONARY);
- ColumnSchema id = new ColumnSchema();
- id.setColumnName("ID");
- id.setColumnar(true);
- id.setDataType(DataTypes.INT);
- id.setEncodingList(encodings);
- id.setColumnUniqueId(UUID.randomUUID().toString());
- id.setColumnReferenceId(id.getColumnUniqueId());
- id.setDimensionColumn(true);
- id.setColumnGroup(1);
- columnSchemas.add(id);
-
- ColumnSchema date = new ColumnSchema();
- date.setColumnName("date");
- date.setColumnar(true);
- date.setDataType(DataTypes.STRING);
- date.setEncodingList(encodings);
- date.setColumnUniqueId(UUID.randomUUID().toString());
- date.setDimensionColumn(true);
- date.setColumnGroup(2);
- date.setSortColumn(true);
- date.setColumnReferenceId(id.getColumnUniqueId());
- columnSchemas.add(date);
-
- ColumnSchema country = new ColumnSchema();
- country.setColumnName("country");
- country.setColumnar(true);
- country.setDataType(DataTypes.STRING);
- country.setEncodingList(encodings);
- country.setColumnUniqueId(UUID.randomUUID().toString());
- country.setDimensionColumn(true);
- country.setColumnGroup(3);
- country.setSortColumn(true);
- country.setColumnReferenceId(id.getColumnUniqueId());
- columnSchemas.add(country);
-
- ColumnSchema name = new ColumnSchema();
- name.setColumnName("name");
- name.setColumnar(true);
- name.setDataType(DataTypes.STRING);
- name.setEncodingList(encodings);
- name.setColumnUniqueId(UUID.randomUUID().toString());
- name.setDimensionColumn(true);
- name.setColumnGroup(4);
- name.setSortColumn(true);
- name.setColumnReferenceId(id.getColumnUniqueId());
- columnSchemas.add(name);
-
- ColumnSchema phonetype = new ColumnSchema();
- phonetype.setColumnName("phonetype");
- phonetype.setColumnar(true);
- phonetype.setDataType(DataTypes.STRING);
- phonetype.setEncodingList(encodings);
- phonetype.setColumnUniqueId(UUID.randomUUID().toString());
- phonetype.setDimensionColumn(true);
- phonetype.setColumnGroup(5);
- phonetype.setSortColumn(true);
- phonetype.setColumnReferenceId(id.getColumnUniqueId());
- columnSchemas.add(phonetype);
-
- ColumnSchema serialname = new ColumnSchema();
- serialname.setColumnName("serialname");
- serialname.setColumnar(true);
- serialname.setDataType(DataTypes.STRING);
- serialname.setEncodingList(encodings);
- serialname.setColumnUniqueId(UUID.randomUUID().toString());
- serialname.setDimensionColumn(true);
- serialname.setColumnGroup(6);
- serialname.setSortColumn(true);
- serialname.setColumnReferenceId(id.getColumnUniqueId());
- columnSchemas.add(serialname);
-
- ColumnSchema salary = new ColumnSchema();
- salary.setColumnName("salary");
- salary.setColumnar(true);
- salary.setDataType(DataTypes.INT);
- salary.setEncodingList(new ArrayList<Encoding>());
- salary.setColumnUniqueId(UUID.randomUUID().toString());
- salary.setDimensionColumn(false);
- salary.setColumnReferenceId(id.getColumnUniqueId());
- salary.setColumnGroup(7);
- columnSchemas.add(salary);
-
- tableSchema.setListOfColumns(columnSchemas);
- SchemaEvolution schemaEvol = new SchemaEvolution();
- schemaEvol.setSchemaEvolutionEntryList(new ArrayList<SchemaEvolutionEntry>());
- tableSchema.setSchemaEvalution(schemaEvol);
- tableSchema.setTableId(UUID.randomUUID().toString());
- tableInfo.setTableUniqueName(
- identifier.getCarbonTableIdentifier().getTableUniqueName()
- );
- tableInfo.setLastUpdatedTime(System.currentTimeMillis());
- tableInfo.setFactTable(tableSchema);
- tableInfo.setTablePath(identifier.getTablePath());
- String schemaFilePath = CarbonTablePath.getSchemaFilePath(identifier.getTablePath());
- String schemaMetadataPath = CarbonTablePath.getFolderContainingFile(schemaFilePath);
- CarbonMetadata.getInstance().loadTableMetadata(tableInfo);
-
- SchemaConverter schemaConverter = new ThriftWrapperSchemaConverterImpl();
- org.apache.carbondata.format.TableInfo thriftTableInfo =
- schemaConverter.fromWrapperToExternalTableInfo(
- tableInfo,
- tableInfo.getDatabaseName(),
- tableInfo.getFactTable().getTableName());
- org.apache.carbondata.format.SchemaEvolutionEntry schemaEvolutionEntry =
- new org.apache.carbondata.format.SchemaEvolutionEntry(tableInfo.getLastUpdatedTime());
- thriftTableInfo.getFact_table().getSchema_evolution().getSchema_evolution_history()
- .add(schemaEvolutionEntry);
-
- FileFactory.FileType fileType = FileFactory.getFileType(schemaMetadataPath);
- if (!FileFactory.isFileExist(schemaMetadataPath, fileType)) {
- FileFactory.mkdirs(schemaMetadataPath, fileType);
- }
-
- ThriftWriter thriftWriter = new ThriftWriter(schemaFilePath, false);
- thriftWriter.open();
- thriftWriter.write(thriftTableInfo);
- thriftWriter.close();
- return CarbonMetadata.getInstance().getCarbonTable(tableInfo.getTableUniqueName());
- }
-
- private static void writeDictionary(String factFilePath, CarbonTable table) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(factFilePath));
- String header = reader.readLine();
- String[] split = header.split(",");
- List<CarbonColumn> allCols = new ArrayList<CarbonColumn>();
- List<CarbonDimension> dims = table.getDimensionByTableName(table.getTableName());
- allCols.addAll(dims);
- List<CarbonMeasure> msrs = table.getMeasureByTableName(table.getTableName());
- allCols.addAll(msrs);
- Set<String>[] set = new HashSet[dims.size()];
- for (int i = 0; i < set.length; i++) {
- set[i] = new HashSet<String>();
- }
- String line = reader.readLine();
- while (line != null) {
- String[] data = line.split(",");
- for (int i = 0; i < set.length; i++) {
- set[i].add(data[i]);
- }
- line = reader.readLine();
- }
-
- Cache dictCache = CacheProvider.getInstance()
- .createCache(CacheType.REVERSE_DICTIONARY);
- for (int i = 0; i < set.length; i++) {
- ColumnIdentifier columnIdentifier = new ColumnIdentifier(dims.get(i).getColumnId(), null, null);
- DictionaryColumnUniqueIdentifier dictionaryColumnUniqueIdentifier =
- new DictionaryColumnUniqueIdentifier(table.getAbsoluteTableIdentifier(), columnIdentifier,
- columnIdentifier.getDataType());
- CarbonDictionaryWriter writer =
- new CarbonDictionaryWriterImpl(dictionaryColumnUniqueIdentifier);
- for (String value : set[i]) {
- writer.write(value);
- }
- writer.close();
- writer.commit();
- Dictionary dict = (Dictionary) dictCache.get(
- new DictionaryColumnUniqueIdentifier(absoluteTableIdentifier,
- columnIdentifier, dims.get(i).getDataType()));
- CarbonDictionarySortInfoPreparator preparator =
- new CarbonDictionarySortInfoPreparator();
- List<String> newDistinctValues = new ArrayList<String>();
- CarbonDictionarySortInfo dictionarySortInfo =
- preparator.getDictionarySortInfo(newDistinctValues, dict, dims.get(i).getDataType());
- CarbonDictionarySortIndexWriter carbonDictionaryWriter =
- new CarbonDictionarySortIndexWriterImpl(dictionaryColumnUniqueIdentifier);
- try {
- carbonDictionaryWriter.writeSortIndex(dictionarySortInfo.getSortIndex());
- carbonDictionaryWriter.writeInvertedSortIndex(dictionarySortInfo.getSortIndexInverted());
- } finally {
- carbonDictionaryWriter.close();
- }
- }
- reader.close();
- }
-
- /**
- * Execute graph which will further load data
- *
- * @param loadModel
- * @param storeLocation
- * @throws Exception
- */
- public static void loadData(CarbonLoadModel loadModel, String storeLocation)
- throws Exception {
- new File(storeLocation).mkdirs();
- String outPutLoc = storeLocation + "/etl";
- String databaseName = loadModel.getDatabaseName();
- String tableName = loadModel.getTableName();
- String tempLocationKey = databaseName + '_' + tableName + "_1";
- CarbonProperties.getInstance().addProperty(tempLocationKey, storeLocation + "/" + databaseName + "/" + tableName);
- CarbonProperties.getInstance().addProperty("store_output_location", outPutLoc);
- CarbonProperties.getInstance().addProperty("send.signal.load", "false");
- CarbonProperties.getInstance().addProperty("carbon.is.columnar.storage", "true");
- CarbonProperties.getInstance().addProperty("carbon.dimension.split.value.in.columnar", "1");
- CarbonProperties.getInstance().addProperty("carbon.is.fullyfilled.bits", "true");
- CarbonProperties.getInstance().addProperty("is.int.based.indexer", "true");
- CarbonProperties.getInstance().addProperty("aggregate.columnar.keyblock", "true");
- CarbonProperties.getInstance().addProperty("is.compressed.keyblock", "false");
- CarbonProperties.getInstance().addProperty("carbon.leaf.node.size", "120000");
-
- String graphPath =
- outPutLoc + File.separator + loadModel.getDatabaseName() + File.separator + tableName
- + File.separator + 0 + File.separator + 1 + File.separator + tableName + ".ktr";
- File path = new File(graphPath);
- if (path.exists()) {
- path.delete();
- }
-
- BlockDetails blockDetails = new BlockDetails(new Path(loadModel.getFactFilePath()),
- 0, new File(loadModel.getFactFilePath()).length(), new String[] {"localhost"});
- Configuration configuration = new Configuration();
- CSVInputFormat.setCommentCharacter(configuration, loadModel.getCommentChar());
- CSVInputFormat.setCSVDelimiter(configuration, loadModel.getCsvDelimiter());
- CSVInputFormat.setEscapeCharacter(configuration, loadModel.getEscapeChar());
- CSVInputFormat.setHeaderExtractionEnabled(configuration, true);
- CSVInputFormat.setQuoteCharacter(configuration, loadModel.getQuoteChar());
- CSVInputFormat.setReadBufferSize(configuration, CarbonProperties.getInstance()
- .getProperty(CarbonCommonConstants.CSV_READ_BUFFER_SIZE,
- CarbonCommonConstants.CSV_READ_BUFFER_SIZE_DEFAULT));
- CSVInputFormat.setNumberOfColumns(configuration, String.valueOf(loadModel.getCsvHeaderColumns().length));
- CSVInputFormat.setMaxColumns(configuration, "10");
-
- TaskAttemptContextImpl hadoopAttemptContext = new TaskAttemptContextImpl(configuration, new TaskAttemptID("", 1, TaskType.MAP, 0, 0));
- CSVInputFormat format = new CSVInputFormat();
-
- RecordReader<NullWritable, StringArrayWritable> recordReader =
- format.createRecordReader(blockDetails, hadoopAttemptContext);
-
- CSVRecordReaderIterator readerIterator = new CSVRecordReaderIterator(recordReader, blockDetails, hadoopAttemptContext);
- new DataLoadExecutor().execute(loadModel,
- new String[] {storeLocation + "/" + databaseName + "/" + tableName},
- new CarbonIterator[]{readerIterator});
-
- writeLoadMetadata(loadModel.getCarbonDataLoadSchema(), loadModel.getTableName(), loadModel.getTableName(),
- new ArrayList<LoadMetadataDetails>());
- }
-
- public static void writeLoadMetadata(CarbonDataLoadSchema schema, String databaseName,
- String tableName, List<LoadMetadataDetails> listOfLoadFolderDetails) throws IOException {
- LoadMetadataDetails loadMetadataDetails = new LoadMetadataDetails();
- loadMetadataDetails.setLoadEndTime(System.currentTimeMillis());
- loadMetadataDetails.setSegmentStatus(SegmentStatus.SUCCESS);
- loadMetadataDetails.setLoadName(String.valueOf(0));
- loadMetadataDetails.setLoadStartTime(loadMetadataDetails.getTimeStamp(readCurrentTime()));
- listOfLoadFolderDetails.add(loadMetadataDetails);
-
- String dataLoadLocation = schema.getCarbonTable().getMetadataPath() + File.separator
- + CarbonTablePath.TABLE_STATUS_FILE;
-
- DataOutputStream dataOutputStream;
- Gson gsonObjectToWrite = new Gson();
- BufferedWriter brWriter = null;
-
- AtomicFileOperations writeOperation =
- new AtomicFileOperationsImpl(dataLoadLocation, FileFactory.getFileType(dataLoadLocation));
-
- try {
-
- dataOutputStream = writeOperation.openForWrite(FileWriteOperation.OVERWRITE);
- brWriter = new BufferedWriter(new OutputStreamWriter(dataOutputStream,
- Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET)));
-
- String metadataInstance = gsonObjectToWrite.toJson(listOfLoadFolderDetails.toArray());
- brWriter.write(metadataInstance);
- } finally {
- try {
- if (null != brWriter) {
- brWriter.flush();
- }
- } catch (Exception e) {
- throw e;
-
- }
- CarbonUtil.closeStreams(brWriter);
-
- }
- writeOperation.close();
-
- }
-
- public static String readCurrentTime() {
- SimpleDateFormat sdf = new SimpleDateFormat(CarbonCommonConstants.CARBON_TIMESTAMP_MILLIS);
- String date = null;
-
- date = sdf.format(new Date());
-
- return date;
- }
-
- public static void main(String[] args) throws Exception {
- StoreCreator.createCarbonStore();
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/pom.xml
----------------------------------------------------------------------
diff --git a/integration/spark-common/pom.xml b/integration/spark-common/pom.xml
index 16f327d..f011a75 100644
--- a/integration/spark-common/pom.xml
+++ b/integration/spark-common/pom.xml
@@ -36,7 +36,7 @@
<dependencies>
<dependency>
<groupId>org.apache.carbondata</groupId>
- <artifactId>carbondata-hadoop</artifactId>
+ <artifactId>carbondata-streaming</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/java/org/apache/carbondata/spark/util/SparkDataTypeConverterImpl.java
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/java/org/apache/carbondata/spark/util/SparkDataTypeConverterImpl.java b/integration/spark-common/src/main/java/org/apache/carbondata/spark/util/SparkDataTypeConverterImpl.java
index 6e9e0a6..f6dc65b 100644
--- a/integration/spark-common/src/main/java/org/apache/carbondata/spark/util/SparkDataTypeConverterImpl.java
+++ b/integration/spark-common/src/main/java/org/apache/carbondata/spark/util/SparkDataTypeConverterImpl.java
@@ -20,10 +20,19 @@ package org.apache.carbondata.spark.util;
import java.io.Serializable;
import java.math.BigDecimal;
+import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryGenerator;
+import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryKeyGeneratorFactory;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.encoder.Encoding;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
import org.apache.carbondata.core.util.DataTypeConverter;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.catalyst.util.GenericArrayData;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.DecimalType;
+import org.apache.spark.sql.types.StructField;
import org.apache.spark.unsafe.types.UTF8String;
/**
@@ -90,4 +99,76 @@ public final class SparkDataTypeConverterImpl implements DataTypeConverter, Seri
public Object wrapWithGenericRow(Object[] fields) {
return new GenericInternalRow(fields);
}
+
+ private static org.apache.spark.sql.types.DataType convertCarbonToSparkDataType(
+ DataType carbonDataType) {
+ if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.STRING) {
+ return DataTypes.StringType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.SHORT) {
+ return DataTypes.ShortType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.INT) {
+ return DataTypes.IntegerType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.LONG) {
+ return DataTypes.LongType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.DOUBLE) {
+ return DataTypes.DoubleType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.BOOLEAN) {
+ return DataTypes.BooleanType;
+ } else if (org.apache.carbondata.core.metadata.datatype.DataTypes.isDecimal(carbonDataType)) {
+ return DataTypes.createDecimalType();
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.TIMESTAMP) {
+ return DataTypes.TimestampType;
+ } else if (carbonDataType == org.apache.carbondata.core.metadata.datatype.DataTypes.DATE) {
+ return DataTypes.DateType;
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * convert from CarbonColumn array to Spark's StructField array
+ */
+ @Override
+ public Object[] convertCarbonSchemaToSparkSchema(CarbonColumn[] carbonColumns) {
+ StructField[] fields = new StructField[carbonColumns.length];
+ for (int i = 0; i < carbonColumns.length; i++) {
+ CarbonColumn carbonColumn = carbonColumns[i];
+ if (carbonColumn.isDimension()) {
+ if (carbonColumn.hasEncoding(Encoding.DIRECT_DICTIONARY)) {
+ DirectDictionaryGenerator generator = DirectDictionaryKeyGeneratorFactory
+ .getDirectDictionaryGenerator(carbonColumn.getDataType());
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(generator.getReturnType()), true, null);
+ } else if (!carbonColumn.hasEncoding(Encoding.DICTIONARY)) {
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(carbonColumn.getDataType()), true, null);
+ } else if (carbonColumn.isComplex()) {
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(carbonColumn.getDataType()), true, null);
+ } else {
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(
+ org.apache.carbondata.core.metadata.datatype.DataTypes.INT), true, null);
+ }
+ } else if (carbonColumn.isMeasure()) {
+ DataType dataType = carbonColumn.getDataType();
+ if (dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.BOOLEAN
+ || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.SHORT
+ || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.INT
+ || dataType == org.apache.carbondata.core.metadata.datatype.DataTypes.LONG) {
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(dataType), true, null);
+ } else if (org.apache.carbondata.core.metadata.datatype.DataTypes.isDecimal(dataType)) {
+ CarbonMeasure measure = (CarbonMeasure) carbonColumn;
+ fields[i] = new StructField(carbonColumn.getColName(),
+ new DecimalType(measure.getPrecision(), measure.getScale()), true, null);
+ } else {
+ fields[i] = new StructField(carbonColumn.getColName(),
+ convertCarbonToSparkDataType(
+ org.apache.carbondata.core.metadata.datatype.DataTypes.DOUBLE), true, null);
+ }
+ }
+ }
+ return fields;
+ }
}
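The schema conversion that used to live in CarbonTypeUtil is now reachable through the converter instance. A rough usage sketch follows; the CarbonColumn[] argument and the default constructor of SparkDataTypeConverterImpl are assumptions, and the cast relies on this implementation returning a StructField[] as shown above:

import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.spark.util.SparkDataTypeConverterImpl;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaConversionSketch {
  // Builds a Spark StructType for the projected Carbon columns.
  public static StructType toSparkSchema(CarbonColumn[] projectedColumns) {
    SparkDataTypeConverterImpl converter = new SparkDataTypeConverterImpl();
    // The interface returns Object[]; this Spark implementation fills it with
    // StructField instances, so the cast is safe for this converter.
    StructField[] fields =
        (StructField[]) converter.convertCarbonSchemaToSparkSchema(projectedColumns);
    return DataTypes.createStructType(fields);
  }
}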
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/CarbonScanRDD.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/CarbonScanRDD.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/CarbonScanRDD.scala
index 2be0efc..7e549a6 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/CarbonScanRDD.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/CarbonScanRDD.scala
@@ -53,10 +53,11 @@ import org.apache.carbondata.core.statusmanager.FileFormat
import org.apache.carbondata.core.util._
import org.apache.carbondata.hadoop._
import org.apache.carbondata.hadoop.api.{CarbonFileInputFormat, CarbonInputFormat, CarbonTableInputFormat}
-import org.apache.carbondata.hadoop.streaming.{CarbonStreamInputFormat, CarbonStreamRecordReader}
+import org.apache.carbondata.hadoop.api.CarbonTableInputFormat
import org.apache.carbondata.processing.util.CarbonLoaderUtil
import org.apache.carbondata.spark.InitInputMetrics
import org.apache.carbondata.spark.util.{SparkDataTypeConverterImpl, Util}
+import org.apache.carbondata.streaming.{CarbonStreamInputFormat, CarbonStreamRecordReader}
/**
* This RDD is used to perform query on CarbonData file. Before sending tasks to scan
[5/5] carbondata git commit: [CARBONDATA-2165]Remove spark in
carbon-hadoop module
Posted by ja...@apache.org.
[CARBONDATA-2165]Remove spark in carbon-hadoop module
1. Streaming-related RecordReader classes are moved to the carbon-streaming module.
2. RDD-related classes are moved to the carbon-spark2 module.
This closes #2074
Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/c723947a
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/c723947a
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/c723947a
Branch: refs/heads/master
Commit: c723947a79332c66175f5a33cf57f08fe70fe1a9
Parents: 2e1ddb5
Author: Jacky Li <ja...@qq.com>
Authored: Sat Mar 17 18:13:08 2018 +0800
Committer: Jacky Li <ja...@qq.com>
Committed: Wed Mar 28 11:19:23 2018 +0800
----------------------------------------------------------------------
.../carbondata/core/util/DataTypeConverter.java | 3 +
.../core/util/DataTypeConverterImpl.java | 5 +
hadoop/CARBON_HADOOPLogResource.properties | 18 -
hadoop/pom.xml | 7 -
.../readsupport/impl/RawDataReadSupport.java | 42 -
.../streaming/CarbonStreamInputFormat.java | 115 ---
.../streaming/CarbonStreamOutputFormat.java | 87 ---
.../streaming/CarbonStreamRecordReader.java | 759 ------------------
.../streaming/CarbonStreamRecordWriter.java | 325 --------
.../hadoop/streaming/StreamBlockletReader.java | 259 -------
.../hadoop/streaming/StreamBlockletWriter.java | 152 ----
.../hadoop/testutil/StoreCreator.java | 495 ++++++++++++
.../carbondata/hadoop/util/CarbonTypeUtil.java | 101 ---
.../hadoop/ft/CarbonTableInputFormatTest.java | 6 +-
.../hadoop/ft/CarbonTableOutputFormatTest.java | 2 +-
.../streaming/CarbonStreamInputFormatTest.java | 99 ---
.../streaming/CarbonStreamOutputFormatTest.java | 121 ---
.../hadoop/test/util/StoreCreator.java | 492 ------------
integration/spark-common/pom.xml | 2 +-
.../spark/util/SparkDataTypeConverterImpl.java | 81 ++
.../carbondata/spark/rdd/CarbonScanRDD.scala | 3 +-
.../carbondata/spark/rdd/StreamHandoffRDD.scala | 435 +++++++++++
.../carbondata/spark/util/CarbonScalaUtil.scala | 52 +-
.../CarbonSparkStreamingListener.scala | 30 +
.../streaming/CarbonStreamSparkStreaming.scala | 184 +++++
.../CarbonStreamingQueryListener.scala | 77 ++
.../streaming/StreamSinkFactory.scala | 236 ++++++
.../streaming/CarbonAppendableStreamSink.scala | 362 +++++++++
.../spark/sql/test/TestQueryExecutor.scala | 4 +-
integration/spark2/pom.xml | 2 +-
.../org/apache/spark/sql/CarbonSession.scala | 2 +-
.../CarbonAlterTableCompactionCommand.scala | 3 +-
streaming/pom.xml | 9 +-
.../streaming/CarbonStreamInputFormat.java | 115 +++
.../streaming/CarbonStreamOutputFormat.java | 87 +++
.../streaming/CarbonStreamRecordReader.java | 761 +++++++++++++++++++
.../streaming/CarbonStreamRecordWriter.java | 325 ++++++++
.../streaming/StreamBlockletReader.java | 259 +++++++
.../streaming/StreamBlockletWriter.java | 152 ++++
.../streaming/segment/StreamSegment.java | 2 +-
.../CarbonSparkStreamingListener.scala | 31 -
.../streaming/CarbonStreamSparkStreaming.scala | 187 -----
.../carbondata/streaming/StreamHandoffRDD.scala | 436 -----------
.../streaming/StreamSinkFactory.scala | 236 ------
.../streaming/parser/FieldConverter.scala | 95 +++
.../streaming/parser/RowStreamParserImp.scala | 7 +-
.../streaming/CarbonAppendableStreamSink.scala | 362 ---------
.../CarbonStreamingQueryListener.scala | 77 --
.../streaming/CarbonStreamInputFormatTest.java | 99 +++
.../streaming/CarbonStreamOutputFormatTest.java | 121 +++
50 files changed, 3948 insertions(+), 3974 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverter.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverter.java b/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverter.java
index 7c63860..474493a 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverter.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverter.java
@@ -17,6 +17,8 @@
package org.apache.carbondata.core.util;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
+
public interface DataTypeConverter {
Object convertFromStringToDecimal(Object data);
@@ -31,4 +33,5 @@ public interface DataTypeConverter {
Object wrapWithGenericArrayData(Object data);
Object wrapWithGenericRow(Object[] fields);
+ Object[] convertCarbonSchemaToSparkSchema(CarbonColumn[] carbonColumns);
}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverterImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverterImpl.java b/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverterImpl.java
index ea5740d..a4f571e 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverterImpl.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/DataTypeConverterImpl.java
@@ -21,6 +21,7 @@ import java.io.Serializable;
import java.math.BigDecimal;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
public class DataTypeConverterImpl implements DataTypeConverter, Serializable {
@@ -91,4 +92,8 @@ public class DataTypeConverterImpl implements DataTypeConverter, Serializable {
return fields;
}
+ @Override
+ public Object[] convertCarbonSchemaToSparkSchema(CarbonColumn[] carbonColumns) {
+ throw new UnsupportedOperationException();
+ }
}
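The new interface method keeps the core module free of Spark types: DataTypeConverterImpl deliberately throws UnsupportedOperationException, and the Spark-side implementation (SparkDataTypeConverterImpl in integration/spark-common, per the file summary above) returns Spark StructField objects, as the fragment at the top of this message shows. A rough sketch, not part of this commit, of how a caller on the Spark side might turn that result into a schema:

import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.util.DataTypeConverter;

import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaConversionExample {
  // Builds a Spark schema for the given projection columns. The converter passed in
  // must be a Spark-aware implementation; the core DataTypeConverterImpl would throw
  // UnsupportedOperationException here.
  public static StructType toSparkSchema(DataTypeConverter converter, CarbonColumn[] projection) {
    Object[] fields = converter.convertCarbonSchemaToSparkSchema(projection);
    StructField[] sparkFields = new StructField[fields.length];
    for (int i = 0; i < fields.length; i++) {
      sparkFields[i] = (StructField) fields[i];
    }
    return new StructType(sparkFields);
  }
}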
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/CARBON_HADOOPLogResource.properties
----------------------------------------------------------------------
diff --git a/hadoop/CARBON_HADOOPLogResource.properties b/hadoop/CARBON_HADOOPLogResource.properties
deleted file mode 100644
index 135a578..0000000
--- a/hadoop/CARBON_HADOOPLogResource.properties
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-carbon.hadoop = {0}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop/pom.xml b/hadoop/pom.xml
index 916b9db..41e2822 100644
--- a/hadoop/pom.xml
+++ b/hadoop/pom.xml
@@ -40,10 +40,6 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-sql_${scala.binary.version}</artifactId>
- </dependency>
- <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
@@ -55,9 +51,6 @@
<resources>
<resource>
<directory>.</directory>
- <includes>
- <include>CARBON_HADOOPLogResource.properties</include>
- </includes>
</resource>
</resources>
<plugins>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/readsupport/impl/RawDataReadSupport.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/readsupport/impl/RawDataReadSupport.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/readsupport/impl/RawDataReadSupport.java
deleted file mode 100644
index b2cd450..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/readsupport/impl/RawDataReadSupport.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.carbondata.hadoop.readsupport.impl;
-
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
-import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport;
-
-import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
-
-public class RawDataReadSupport implements CarbonReadSupport<InternalRow> {
-
- @Override
- public void initialize(CarbonColumn[] carbonColumns, CarbonTable carbonTable) { }
-
- /**
- * return column data as InternalRow
- *
- * @param data column data
- */
- @Override
- public InternalRow readRow(Object[] data) {
- return new GenericInternalRow(data);
- }
-
- @Override public void close() { }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormat.java
deleted file mode 100644
index a6e9563..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamInputFormat.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.IOException;
-
-import org.apache.carbondata.core.cache.Cache;
-import org.apache.carbondata.core.cache.dictionary.Dictionary;
-import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
-import org.apache.carbondata.core.constants.CarbonCommonConstants;
-import org.apache.carbondata.core.metadata.datatype.DataType;
-import org.apache.carbondata.core.metadata.datatype.DataTypes;
-import org.apache.carbondata.core.metadata.encoder.Encoding;
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
-import org.apache.carbondata.core.scan.complextypes.ArrayQueryType;
-import org.apache.carbondata.core.scan.complextypes.PrimitiveQueryType;
-import org.apache.carbondata.core.scan.complextypes.StructQueryType;
-import org.apache.carbondata.core.scan.filter.GenericQueryType;
-import org.apache.carbondata.core.util.CarbonUtil;
-
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-/**
- * Stream input format
- */
-public class CarbonStreamInputFormat extends FileInputFormat<Void, Object> {
-
- public static final String READ_BUFFER_SIZE = "carbon.stream.read.buffer.size";
- public static final String READ_BUFFER_SIZE_DEFAULT = "65536";
-
- @Override public RecordReader<Void, Object> createRecordReader(InputSplit split,
- TaskAttemptContext context) throws IOException, InterruptedException {
- return new CarbonStreamRecordReader();
- }
-
- public static GenericQueryType[] getComplexDimensions(CarbonTable carbontable,
- CarbonColumn[] carbonColumns, Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache)
- throws IOException {
- GenericQueryType[] queryTypes = new GenericQueryType[carbonColumns.length];
- for (int i = 0; i < carbonColumns.length; i++) {
- if (carbonColumns[i].isComplex()) {
- if (DataTypes.isArrayType(carbonColumns[i].getDataType())) {
- queryTypes[i] = new ArrayQueryType(carbonColumns[i].getColName(),
- carbonColumns[i].getColName(), i);
- } else if (DataTypes.isStructType(carbonColumns[i].getDataType())) {
- queryTypes[i] = new StructQueryType(carbonColumns[i].getColName(),
- carbonColumns[i].getColName(), i);
- } else {
- throw new UnsupportedOperationException(
- carbonColumns[i].getDataType().getName() + " is not supported");
- }
-
- fillChildren(carbontable, queryTypes[i], (CarbonDimension) carbonColumns[i], i, cache);
- }
- }
-
- return queryTypes;
- }
-
- private static void fillChildren(CarbonTable carbontable, GenericQueryType parentQueryType,
- CarbonDimension dimension, int parentBlockIndex,
- Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache) throws IOException {
- for (int i = 0; i < dimension.getNumberOfChild(); i++) {
- CarbonDimension child = dimension.getListOfChildDimensions().get(i);
- DataType dataType = child.getDataType();
- GenericQueryType queryType = null;
- if (DataTypes.isArrayType(dataType)) {
- queryType =
- new ArrayQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex);
-
- } else if (DataTypes.isStructType(dataType)) {
- queryType =
- new StructQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex);
- parentQueryType.addChildren(queryType);
- } else {
- boolean isDirectDictionary =
- CarbonUtil.hasEncoding(child.getEncoder(), Encoding.DIRECT_DICTIONARY);
- String dictionaryPath = carbontable.getTableInfo().getFactTable().getTableProperties()
- .get(CarbonCommonConstants.DICTIONARY_PATH);
- DictionaryColumnUniqueIdentifier dictionarIdentifier =
- new DictionaryColumnUniqueIdentifier(carbontable.getAbsoluteTableIdentifier(),
- child.getColumnIdentifier(), child.getDataType(), dictionaryPath);
-
- queryType =
- new PrimitiveQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex,
- child.getDataType(), 4, cache.get(dictionarIdentifier),
- isDirectDictionary);
- }
- parentQueryType.addChildren(queryType);
- if (child.getNumberOfChild() > 0) {
- fillChildren(carbontable, queryType, child, parentBlockIndex, cache);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormat.java
deleted file mode 100644
index 2599fa7..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamOutputFormat.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.IOException;
-import java.nio.charset.Charset;
-
-import org.apache.carbondata.core.constants.CarbonCommonConstants;
-import org.apache.carbondata.hadoop.util.ObjectSerializationUtil;
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-
-/**
- * Stream output format
- */
-public class CarbonStreamOutputFormat extends FileOutputFormat<Void, Object> {
-
- static final byte[] CARBON_SYNC_MARKER =
- "@carbondata_sync".getBytes(Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
-
- public static final String CARBON_ENCODER_ROW_BUFFER_SIZE = "carbon.stream.row.buffer.size";
-
- public static final int CARBON_ENCODER_ROW_BUFFER_SIZE_DEFAULT = 1024;
-
- public static final String CARBON_STREAM_BLOCKLET_ROW_NUMS = "carbon.stream.blocklet.row.nums";
-
- public static final int CARBON_STREAM_BLOCKLET_ROW_NUMS_DEFAULT = 32000;
-
- public static final String CARBON_STREAM_CACHE_SIZE = "carbon.stream.cache.size";
-
- public static final int CARBON_STREAM_CACHE_SIZE_DEFAULT = 32 * 1024 * 1024;
-
- private static final String LOAD_Model = "mapreduce.output.carbon.load.model";
-
- private static final String SEGMENT_ID = "carbon.segment.id";
-
- @Override public RecordWriter<Void, Object> getRecordWriter(TaskAttemptContext job)
- throws IOException, InterruptedException {
- return new CarbonStreamRecordWriter(job);
- }
-
- public static void setCarbonLoadModel(Configuration hadoopConf, CarbonLoadModel carbonLoadModel)
- throws IOException {
- if (carbonLoadModel != null) {
- hadoopConf.set(LOAD_Model, ObjectSerializationUtil.convertObjectToString(carbonLoadModel));
- }
- }
-
- public static CarbonLoadModel getCarbonLoadModel(Configuration hadoopConf) throws IOException {
- String value = hadoopConf.get(LOAD_Model);
- if (value == null) {
- return null;
- } else {
- return (CarbonLoadModel) ObjectSerializationUtil.convertStringToObject(value);
- }
- }
-
- public static void setSegmentId(Configuration hadoopConf, String segmentId) throws IOException {
- if (segmentId != null) {
- hadoopConf.set(SEGMENT_ID, segmentId);
- }
- }
-
- public static String getSegmentId(Configuration hadoopConf) throws IOException {
- return hadoopConf.get(SEGMENT_ID);
- }
-
-}
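The writer side is driven entirely by Hadoop Configuration entries declared in this class. A hedged setup sketch, not part of this commit and assuming the helpers keep the same names after the move to the streaming module; the load model and segment id are placeholders supplied by the caller:

import java.io.IOException;

import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
import org.apache.carbondata.streaming.CarbonStreamOutputFormat;

import org.apache.hadoop.conf.Configuration;

public class StreamOutputConfigExample {
  // Prepares a Hadoop configuration for the streaming record writer.
  public static Configuration configure(CarbonLoadModel loadModel, String segmentId)
      throws IOException {
    Configuration conf = new Configuration();
    // mandatory: the record writer fails fast without a serialized load model
    CarbonStreamOutputFormat.setCarbonLoadModel(conf, loadModel);
    CarbonStreamOutputFormat.setSegmentId(conf, segmentId);
    // optional tuning knobs, set here to the defaults defined above
    conf.setInt(CarbonStreamOutputFormat.CARBON_ENCODER_ROW_BUFFER_SIZE, 1024);
    conf.setInt(CarbonStreamOutputFormat.CARBON_STREAM_BLOCKLET_ROW_NUMS, 32000);
    conf.setInt(CarbonStreamOutputFormat.CARBON_STREAM_CACHE_SIZE, 32 * 1024 * 1024);
    return conf;
  }
}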
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordReader.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordReader.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordReader.java
deleted file mode 100644
index 1e227c4..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordReader.java
+++ /dev/null
@@ -1,759 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.BitSet;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.carbondata.core.cache.Cache;
-import org.apache.carbondata.core.cache.CacheProvider;
-import org.apache.carbondata.core.cache.CacheType;
-import org.apache.carbondata.core.cache.dictionary.Dictionary;
-import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
-import org.apache.carbondata.core.constants.CarbonCommonConstants;
-import org.apache.carbondata.core.datastore.block.SegmentProperties;
-import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryGenerator;
-import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryKeyGeneratorFactory;
-import org.apache.carbondata.core.metadata.datatype.DataType;
-import org.apache.carbondata.core.metadata.datatype.DataTypes;
-import org.apache.carbondata.core.metadata.encoder.Encoding;
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
-import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
-import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
-import org.apache.carbondata.core.reader.CarbonHeaderReader;
-import org.apache.carbondata.core.scan.expression.exception.FilterUnsupportedException;
-import org.apache.carbondata.core.scan.filter.FilterUtil;
-import org.apache.carbondata.core.scan.filter.GenericQueryType;
-import org.apache.carbondata.core.scan.filter.executer.FilterExecuter;
-import org.apache.carbondata.core.scan.filter.intf.RowImpl;
-import org.apache.carbondata.core.scan.filter.intf.RowIntf;
-import org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf;
-import org.apache.carbondata.core.scan.model.QueryModel;
-import org.apache.carbondata.core.util.CarbonUtil;
-import org.apache.carbondata.core.util.DataTypeUtil;
-import org.apache.carbondata.format.BlockletHeader;
-import org.apache.carbondata.format.FileHeader;
-import org.apache.carbondata.hadoop.CarbonInputSplit;
-import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
-import org.apache.carbondata.hadoop.InputMetricsStats;
-import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
-import org.apache.carbondata.hadoop.util.CarbonTypeUtil;
-import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.spark.memory.MemoryMode;
-import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
-import org.apache.spark.sql.execution.vectorized.ColumnVector;
-import org.apache.spark.sql.execution.vectorized.ColumnarBatch;
-import org.apache.spark.sql.types.CalendarIntervalType;
-import org.apache.spark.sql.types.Decimal;
-import org.apache.spark.sql.types.DecimalType;
-import org.apache.spark.sql.types.StructType;
-import org.apache.spark.unsafe.types.CalendarInterval;
-import org.apache.spark.unsafe.types.UTF8String;
-
-/**
- * Stream record reader
- */
-public class CarbonStreamRecordReader extends RecordReader<Void, Object> {
- // vector reader
- private boolean isVectorReader;
-
- // metadata
- private CarbonTable carbonTable;
- private CarbonColumn[] storageColumns;
- private boolean[] isRequired;
- private DataType[] measureDataTypes;
- private int dimensionCount;
- private int measureCount;
-
- // input
- private FileSplit fileSplit;
- private Configuration hadoopConf;
- private StreamBlockletReader input;
- private boolean isFirstRow = true;
- private QueryModel model;
-
- // decode data
- private BitSet allNonNull;
- private boolean[] isNoDictColumn;
- private DirectDictionaryGenerator[] directDictionaryGenerators;
- private CacheProvider cacheProvider;
- private Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache;
- private GenericQueryType[] queryTypes;
-
- // vectorized reader
- private StructType outputSchema;
- private ColumnarBatch columnarBatch;
- private boolean isFinished = false;
-
- // filter
- private FilterExecuter filter;
- private boolean[] isFilterRequired;
- private Object[] filterValues;
- private RowIntf filterRow;
- private int[] filterMap;
-
- // output
- private CarbonColumn[] projection;
- private boolean[] isProjectionRequired;
- private int[] projectionMap;
- private Object[] outputValues;
- private InternalRow outputRow;
-
- // empty project, null filter
- private boolean skipScanData;
-
- // return raw row for handoff
- private boolean useRawRow = false;
-
- // InputMetricsStats
- private InputMetricsStats inputMetricsStats;
-
- @Override public void initialize(InputSplit split, TaskAttemptContext context)
- throws IOException, InterruptedException {
- // input
- if (split instanceof CarbonInputSplit) {
- fileSplit = (CarbonInputSplit) split;
- } else if (split instanceof CarbonMultiBlockSplit) {
- fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
- } else {
- fileSplit = (FileSplit) split;
- }
-
- // metadata
- hadoopConf = context.getConfiguration();
- if (model == null) {
- CarbonTableInputFormat format = new CarbonTableInputFormat<Object>();
- model = format.createQueryModel(split, context);
- }
- carbonTable = model.getTable();
- List<CarbonDimension> dimensions =
- carbonTable.getDimensionByTableName(carbonTable.getTableName());
- dimensionCount = dimensions.size();
- List<CarbonMeasure> measures =
- carbonTable.getMeasureByTableName(carbonTable.getTableName());
- measureCount = measures.size();
- List<CarbonColumn> carbonColumnList =
- carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName());
- storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
- isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
- directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
- for (int i = 0; i < storageColumns.length; i++) {
- if (storageColumns[i].hasEncoding(Encoding.DIRECT_DICTIONARY)) {
- directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory
- .getDirectDictionaryGenerator(storageColumns[i].getDataType());
- }
- }
- measureDataTypes = new DataType[measureCount];
- for (int i = 0; i < measureCount; i++) {
- measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
- }
-
- // decode data
- allNonNull = new BitSet(storageColumns.length);
- projection = model.getProjectionColumns();
-
- isRequired = new boolean[storageColumns.length];
- boolean[] isFiltlerDimensions = model.getIsFilterDimensions();
- boolean[] isFiltlerMeasures = model.getIsFilterMeasures();
- isFilterRequired = new boolean[storageColumns.length];
- filterMap = new int[storageColumns.length];
- for (int i = 0; i < storageColumns.length; i++) {
- if (storageColumns[i].isDimension()) {
- if (isFiltlerDimensions[storageColumns[i].getOrdinal()]) {
- isRequired[i] = true;
- isFilterRequired[i] = true;
- filterMap[i] = storageColumns[i].getOrdinal();
- }
- } else {
- if (isFiltlerMeasures[storageColumns[i].getOrdinal()]) {
- isRequired[i] = true;
- isFilterRequired[i] = true;
- filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
- }
- }
- }
-
- isProjectionRequired = new boolean[storageColumns.length];
- projectionMap = new int[storageColumns.length];
- for (int i = 0; i < storageColumns.length; i++) {
- for (int j = 0; j < projection.length; j++) {
- if (storageColumns[i].getColName().equals(projection[j].getColName())) {
- isRequired[i] = true;
- isProjectionRequired[i] = true;
- projectionMap[i] = j;
- break;
- }
- }
- }
-
- // initialize filter
- if (null != model.getFilterExpressionResolverTree()) {
- initializeFilter();
- } else if (projection.length == 0) {
- skipScanData = true;
- }
-
- }
-
- private void initializeFilter() {
-
- List<ColumnSchema> wrapperColumnSchemaList = CarbonUtil
- .getColumnSchemaList(carbonTable.getDimensionByTableName(carbonTable.getTableName()),
- carbonTable.getMeasureByTableName(carbonTable.getTableName()));
- int[] dimLensWithComplex = new int[wrapperColumnSchemaList.size()];
- for (int i = 0; i < dimLensWithComplex.length; i++) {
- dimLensWithComplex[i] = Integer.MAX_VALUE;
- }
-
- int[] dictionaryColumnCardinality =
- CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList);
- SegmentProperties segmentProperties =
- new SegmentProperties(wrapperColumnSchemaList, dictionaryColumnCardinality);
- Map<Integer, GenericQueryType> complexDimensionInfoMap = new HashMap<>();
-
- FilterResolverIntf resolverIntf = model.getFilterExpressionResolverTree();
- filter = FilterUtil.getFilterExecuterTree(resolverIntf, segmentProperties,
- complexDimensionInfoMap);
- // for row filter, we need update column index
- FilterUtil.updateIndexOfColumnExpression(resolverIntf.getFilterExpression(),
- carbonTable.getDimensionOrdinalMax());
-
- }
-
- public void setQueryModel(QueryModel model) {
- this.model = model;
- }
-
- private byte[] getSyncMarker(String filePath) throws IOException {
- CarbonHeaderReader headerReader = new CarbonHeaderReader(filePath);
- FileHeader header = headerReader.readHeader();
- return header.getSync_marker();
- }
-
- public void setUseRawRow(boolean useRawRow) {
- this.useRawRow = useRawRow;
- }
-
- private void initializeAtFirstRow() throws IOException {
- filterValues = new Object[carbonTable.getDimensionOrdinalMax() + measureCount];
- filterRow = new RowImpl();
- filterRow.setValues(filterValues);
-
- outputValues = new Object[projection.length];
- outputRow = new GenericInternalRow(outputValues);
-
- Path file = fileSplit.getPath();
-
- byte[] syncMarker = getSyncMarker(file.toString());
-
- FileSystem fs = file.getFileSystem(hadoopConf);
-
- int bufferSize = Integer.parseInt(hadoopConf.get(CarbonStreamInputFormat.READ_BUFFER_SIZE,
- CarbonStreamInputFormat.READ_BUFFER_SIZE_DEFAULT));
-
- FSDataInputStream fileIn = fs.open(file, bufferSize);
- fileIn.seek(fileSplit.getStart());
- input = new StreamBlockletReader(syncMarker, fileIn, fileSplit.getLength(),
- fileSplit.getStart() == 0);
-
- cacheProvider = CacheProvider.getInstance();
- cache = cacheProvider.createCache(CacheType.FORWARD_DICTIONARY);
- queryTypes = CarbonStreamInputFormat.getComplexDimensions(carbonTable, storageColumns, cache);
-
- outputSchema = new StructType(CarbonTypeUtil.convertCarbonSchemaToSparkSchema(projection));
- }
-
- @Override public boolean nextKeyValue() throws IOException, InterruptedException {
- if (isFirstRow) {
- isFirstRow = false;
- initializeAtFirstRow();
- }
- if (isFinished) {
- return false;
- }
-
- if (isVectorReader) {
- return nextColumnarBatch();
- }
-
- return nextRow();
- }
-
- /**
- * for vector reader, check next columnar batch
- */
- private boolean nextColumnarBatch() throws IOException {
- boolean hasNext;
- boolean scanMore = false;
- do {
- // move to the next blocklet
- hasNext = input.nextBlocklet();
- if (hasNext) {
- // read blocklet header
- BlockletHeader header = input.readBlockletHeader();
- if (isScanRequired(header)) {
- scanMore = !scanBlockletAndFillVector(header);
- } else {
- input.skipBlockletData(true);
- scanMore = true;
- }
- } else {
- isFinished = true;
- scanMore = false;
- }
- } while (scanMore);
- return hasNext;
- }
-
- /**
- * check next Row
- */
- private boolean nextRow() throws IOException {
- // read row one by one
- try {
- boolean hasNext;
- boolean scanMore = false;
- do {
- hasNext = input.hasNext();
- if (hasNext) {
- if (skipScanData) {
- input.nextRow();
- scanMore = false;
- } else {
- if (useRawRow) {
- // read raw row for streaming handoff which does not require decode raw row
- readRawRowFromStream();
- } else {
- readRowFromStream();
- }
- if (null != filter) {
- scanMore = !filter.applyFilter(filterRow, carbonTable.getDimensionOrdinalMax());
- } else {
- scanMore = false;
- }
- }
- } else {
- if (input.nextBlocklet()) {
- BlockletHeader header = input.readBlockletHeader();
- if (isScanRequired(header)) {
- if (skipScanData) {
- input.skipBlockletData(false);
- } else {
- input.readBlockletData(header);
- }
- } else {
- input.skipBlockletData(true);
- }
- scanMore = true;
- } else {
- isFinished = true;
- scanMore = false;
- }
- }
- } while (scanMore);
- return hasNext;
- } catch (FilterUnsupportedException e) {
- throw new IOException("Failed to filter row in detail reader", e);
- }
- }
-
- @Override public Void getCurrentKey() throws IOException, InterruptedException {
- return null;
- }
-
- @Override public Object getCurrentValue() throws IOException, InterruptedException {
- if (isVectorReader) {
- int value = columnarBatch.numValidRows();
- if (inputMetricsStats != null) {
- inputMetricsStats.incrementRecordRead((long) value);
- }
-
- return columnarBatch;
- }
-
- if (inputMetricsStats != null) {
- inputMetricsStats.incrementRecordRead(1L);
- }
-
- return outputRow;
- }
-
- private boolean isScanRequired(BlockletHeader header) {
- // TODO require to implement min-max index
- if (null == filter) {
- return true;
- }
- return true;
- }
-
- private boolean scanBlockletAndFillVector(BlockletHeader header) throws IOException {
- // if filter is null and output projection is empty, use the row number of blocklet header
- if (skipScanData) {
- int rowNums = header.getBlocklet_info().getNum_rows();
- columnarBatch = ColumnarBatch.allocate(outputSchema, MemoryMode.OFF_HEAP, rowNums);
- columnarBatch.setNumRows(rowNums);
- input.skipBlockletData(true);
- return rowNums > 0;
- }
-
- input.readBlockletData(header);
- columnarBatch = ColumnarBatch.allocate(outputSchema, MemoryMode.OFF_HEAP, input.getRowNums());
- int rowNum = 0;
- if (null == filter) {
- while (input.hasNext()) {
- readRowFromStream();
- putRowToColumnBatch(rowNum++);
- }
- } else {
- try {
- while (input.hasNext()) {
- readRowFromStream();
- if (filter.applyFilter(filterRow, carbonTable.getDimensionOrdinalMax())) {
- putRowToColumnBatch(rowNum++);
- }
- }
- } catch (FilterUnsupportedException e) {
- throw new IOException("Failed to filter row in vector reader", e);
- }
- }
- columnarBatch.setNumRows(rowNum);
- return rowNum > 0;
- }
-
- private void readRowFromStream() {
- input.nextRow();
- short nullLen = input.readShort();
- BitSet nullBitSet = allNonNull;
- if (nullLen > 0) {
- nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
- }
- int colCount = 0;
- // primitive type dimension
- for (; colCount < isNoDictColumn.length; colCount++) {
- if (nullBitSet.get(colCount)) {
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = null;
- }
- } else {
- if (isNoDictColumn[colCount]) {
- int v = input.readShort();
- if (isRequired[colCount]) {
- byte[] b = input.readBytes(v);
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = b;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] =
- DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(b,
- storageColumns[colCount].getDataType());
- }
- } else {
- input.skipBytes(v);
- }
- } else if (null != directDictionaryGenerators[colCount]) {
- if (isRequired[colCount]) {
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = input.copy(4);
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] =
- directDictionaryGenerators[colCount].getValueFromSurrogate(input.readInt());
- } else {
- input.skipBytes(4);
- }
- } else {
- input.skipBytes(4);
- }
- } else {
- if (isRequired[colCount]) {
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = input.copy(4);
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = input.readInt();
- } else {
- input.skipBytes(4);
- }
- } else {
- input.skipBytes(4);
- }
- }
- }
- }
- // complex type dimension
- for (; colCount < dimensionCount; colCount++) {
- if (nullBitSet.get(colCount)) {
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = null;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = null;
- }
- } else {
- short v = input.readShort();
- if (isRequired[colCount]) {
- byte[] b = input.readBytes(v);
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = b;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = queryTypes[colCount]
- .getDataBasedOnDataTypeFromSurrogates(ByteBuffer.wrap(b));
- }
- } else {
- input.skipBytes(v);
- }
- }
- }
- // measure
- DataType dataType;
- for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
- if (nullBitSet.get(colCount)) {
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = null;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = null;
- }
- } else {
- dataType = measureDataTypes[msrCount];
- if (dataType == DataTypes.BOOLEAN) {
- if (isRequired[colCount]) {
- boolean v = input.readBoolean();
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = v;
- }
- } else {
- input.skipBytes(1);
- }
- } else if (dataType == DataTypes.SHORT) {
- if (isRequired[colCount]) {
- short v = input.readShort();
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = v;
- }
- } else {
- input.skipBytes(2);
- }
- } else if (dataType == DataTypes.INT) {
- if (isRequired[colCount]) {
- int v = input.readInt();
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = v;
- }
- } else {
- input.skipBytes(4);
- }
- } else if (dataType == DataTypes.LONG) {
- if (isRequired[colCount]) {
- long v = input.readLong();
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = v;
- }
- } else {
- input.skipBytes(8);
- }
- } else if (dataType == DataTypes.DOUBLE) {
- if (isRequired[colCount]) {
- double v = input.readDouble();
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = v;
- }
- } else {
- input.skipBytes(8);
- }
- } else if (DataTypes.isDecimal(dataType)) {
- int len = input.readShort();
- if (isRequired[colCount]) {
- BigDecimal v = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
- if (isFilterRequired[colCount]) {
- filterValues[filterMap[colCount]] = v;
- }
- if (isProjectionRequired[colCount]) {
- outputValues[projectionMap[colCount]] = Decimal.apply(v);
- }
- } else {
- input.skipBytes(len);
- }
- }
- }
- }
- }
-
- private void readRawRowFromStream() {
- input.nextRow();
- short nullLen = input.readShort();
- BitSet nullBitSet = allNonNull;
- if (nullLen > 0) {
- nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
- }
- int colCount = 0;
- // primitive type dimension
- for (; colCount < isNoDictColumn.length; colCount++) {
- if (nullBitSet.get(colCount)) {
- outputValues[colCount] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
- } else {
- if (isNoDictColumn[colCount]) {
- int v = input.readShort();
- outputValues[colCount] = input.readBytes(v);
- } else {
- outputValues[colCount] = input.readInt();
- }
- }
- }
- // complex type dimension
- for (; colCount < dimensionCount; colCount++) {
- if (nullBitSet.get(colCount)) {
- outputValues[colCount] = null;
- } else {
- short v = input.readShort();
- outputValues[colCount] = input.readBytes(v);
- }
- }
- // measure
- DataType dataType;
- for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
- if (nullBitSet.get(colCount)) {
- outputValues[colCount] = null;
- } else {
- dataType = measureDataTypes[msrCount];
- if (dataType == DataTypes.BOOLEAN) {
- outputValues[colCount] = input.readBoolean();
- } else if (dataType == DataTypes.SHORT) {
- outputValues[colCount] = input.readShort();
- } else if (dataType == DataTypes.INT) {
- outputValues[colCount] = input.readInt();
- } else if (dataType == DataTypes.LONG) {
- outputValues[colCount] = input.readLong();
- } else if (dataType == DataTypes.DOUBLE) {
- outputValues[colCount] = input.readDouble();
- } else if (DataTypes.isDecimal(dataType)) {
- int len = input.readShort();
- outputValues[colCount] = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
- }
- }
- }
- }
-
- private void putRowToColumnBatch(int rowId) {
- for (int i = 0; i < projection.length; i++) {
- Object value = outputValues[i];
- ColumnVector col = columnarBatch.column(i);
- org.apache.spark.sql.types.DataType t = col.dataType();
- if (null == value) {
- col.putNull(rowId);
- } else {
- if (t == org.apache.spark.sql.types.DataTypes.BooleanType) {
- col.putBoolean(rowId, (boolean)value);
- } else if (t == org.apache.spark.sql.types.DataTypes.ByteType) {
- col.putByte(rowId, (byte) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.ShortType) {
- col.putShort(rowId, (short) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.IntegerType) {
- col.putInt(rowId, (int) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.LongType) {
- col.putLong(rowId, (long) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.FloatType) {
- col.putFloat(rowId, (float) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.DoubleType) {
- col.putDouble(rowId, (double) value);
- } else if (t == org.apache.spark.sql.types.DataTypes.StringType) {
- UTF8String v = (UTF8String) value;
- col.putByteArray(rowId, v.getBytes());
- } else if (t instanceof org.apache.spark.sql.types.DecimalType) {
- DecimalType dt = (DecimalType)t;
- Decimal d = Decimal.fromDecimal(value);
- if (dt.precision() <= Decimal.MAX_INT_DIGITS()) {
- col.putInt(rowId, (int)d.toUnscaledLong());
- } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) {
- col.putLong(rowId, d.toUnscaledLong());
- } else {
- final BigInteger integer = d.toJavaBigDecimal().unscaledValue();
- byte[] bytes = integer.toByteArray();
- col.putByteArray(rowId, bytes, 0, bytes.length);
- }
- } else if (t instanceof CalendarIntervalType) {
- CalendarInterval c = (CalendarInterval) value;
- col.getChildColumn(0).putInt(rowId, c.months);
- col.getChildColumn(1).putLong(rowId, c.microseconds);
- } else if (t instanceof org.apache.spark.sql.types.DateType) {
- col.putInt(rowId, (int) value);
- } else if (t instanceof org.apache.spark.sql.types.TimestampType) {
- col.putLong(rowId, (long) value);
- }
- }
- }
- }
-
- @Override public float getProgress() throws IOException, InterruptedException {
- return 0;
- }
-
- public void setVectorReader(boolean isVectorReader) {
- this.isVectorReader = isVectorReader;
- }
-
- public void setInputMetricsStats(InputMetricsStats inputMetricsStats) {
- this.inputMetricsStats = inputMetricsStats;
- }
-
- @Override public void close() throws IOException {
- if (null != input) {
- input.close();
- }
- if (null != columnarBatch) {
- columnarBatch.close();
- }
- }
-}
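The reader follows the standard mapreduce RecordReader contract: initialize once per split, alternate nextKeyValue/getCurrentValue until the input is exhausted, then close. A minimal read loop, not part of this commit and assuming the class keeps the same public API in the streaming module; the split and task context are placeholders owned by the caller:

import java.io.IOException;

import org.apache.carbondata.streaming.CarbonStreamRecordReader;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class StreamReadLoopExample {
  // Reads every row of one streaming split and returns the row count.
  public static long countRows(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    CarbonStreamRecordReader reader = new CarbonStreamRecordReader();
    reader.setVectorReader(false);        // row-by-row mode: getCurrentValue returns one InternalRow
    try {
      reader.initialize(split, context);  // builds the QueryModel from the job configuration
      long rows = 0;
      while (reader.nextKeyValue()) {
        reader.getCurrentValue();         // the decoded row; ignored here, we only count
        rows++;
      }
      return rows;
    } finally {
      reader.close();
    }
  }
}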
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordWriter.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordWriter.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordWriter.java
deleted file mode 100644
index a4b3be8..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/CarbonStreamRecordWriter.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-
-import org.apache.carbondata.common.logging.LogService;
-import org.apache.carbondata.common.logging.LogServiceFactory;
-import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
-import org.apache.carbondata.core.datastore.impl.FileFactory;
-import org.apache.carbondata.core.datastore.row.CarbonRow;
-import org.apache.carbondata.core.metadata.datatype.DataType;
-import org.apache.carbondata.core.metadata.datatype.DataTypes;
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
-import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
-import org.apache.carbondata.core.util.CarbonMetadataUtil;
-import org.apache.carbondata.core.util.CarbonUtil;
-import org.apache.carbondata.core.util.DataTypeUtil;
-import org.apache.carbondata.core.util.path.CarbonTablePath;
-import org.apache.carbondata.format.FileHeader;
-import org.apache.carbondata.processing.loading.BadRecordsLogger;
-import org.apache.carbondata.processing.loading.BadRecordsLoggerProvider;
-import org.apache.carbondata.processing.loading.CarbonDataLoadConfiguration;
-import org.apache.carbondata.processing.loading.DataField;
-import org.apache.carbondata.processing.loading.DataLoadProcessBuilder;
-import org.apache.carbondata.processing.loading.converter.RowConverter;
-import org.apache.carbondata.processing.loading.converter.impl.RowConverterImpl;
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
-import org.apache.carbondata.processing.loading.parser.RowParser;
-import org.apache.carbondata.processing.loading.parser.impl.RowParserImpl;
-import org.apache.carbondata.processing.store.writer.AbstractFactDataWriter;
-import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskID;
-
-/**
- * Stream record writer
- */
-public class CarbonStreamRecordWriter extends RecordWriter<Void, Object> {
-
- private static final LogService LOGGER =
- LogServiceFactory.getLogService(CarbonStreamRecordWriter.class.getName());
-
- // basic info
- private Configuration hadoopConf;
- private CarbonLoadModel carbonLoadModel;
- private CarbonDataLoadConfiguration configuration;
- private CarbonTable carbonTable;
- private int maxRowNums;
- private int maxCacheSize;
-
- // parser and converter
- private RowParser rowParser;
- private BadRecordsLogger badRecordLogger;
- private RowConverter converter;
- private CarbonRow currentRow = new CarbonRow(null);
-
- // encoder
- private DataField[] dataFields;
- private BitSet nullBitSet;
- private boolean[] isNoDictionaryDimensionColumn;
- private int dimensionWithComplexCount;
- private int measureCount;
- private DataType[] measureDataTypes;
- private StreamBlockletWriter output = null;
-
- // data write
- private String segmentDir;
- private String fileName;
- private DataOutputStream outputStream;
- private boolean isFirstRow = true;
- private boolean hasException = false;
-
- CarbonStreamRecordWriter(TaskAttemptContext job) throws IOException {
- initialize(job);
- }
-
- public CarbonStreamRecordWriter(TaskAttemptContext job, CarbonLoadModel carbonLoadModel)
- throws IOException {
- this.carbonLoadModel = carbonLoadModel;
- initialize(job);
- }
-
- private void initialize(TaskAttemptContext job) throws IOException {
- // set basic information
- hadoopConf = job.getConfiguration();
- if (carbonLoadModel == null) {
- carbonLoadModel = CarbonStreamOutputFormat.getCarbonLoadModel(hadoopConf);
- if (carbonLoadModel == null) {
- throw new IOException(
- "CarbonStreamRecordWriter require configuration: mapreduce.output.carbon.load.model");
- }
- }
- String segmentId = CarbonStreamOutputFormat.getSegmentId(hadoopConf);
- carbonLoadModel.setSegmentId(segmentId);
- carbonTable = carbonLoadModel.getCarbonDataLoadSchema().getCarbonTable();
- long taskNo = TaskID.forName(hadoopConf.get("mapred.tip.id")).getId();
- carbonLoadModel.setTaskNo("" + taskNo);
- configuration = DataLoadProcessBuilder.createConfiguration(carbonLoadModel);
- maxRowNums = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_STREAM_BLOCKLET_ROW_NUMS,
- CarbonStreamOutputFormat.CARBON_STREAM_BLOCKLET_ROW_NUMS_DEFAULT) - 1;
- maxCacheSize = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_STREAM_CACHE_SIZE,
- CarbonStreamOutputFormat.CARBON_STREAM_CACHE_SIZE_DEFAULT);
-
- segmentDir = CarbonTablePath.getSegmentPath(
- carbonTable.getAbsoluteTableIdentifier().getTablePath(), segmentId);
- fileName = CarbonTablePath.getCarbonDataFileName(0, taskNo, 0, 0, "0");
- }
-
- private void initializeAtFirstRow() throws IOException, InterruptedException {
-
- // initialize metadata
- isNoDictionaryDimensionColumn =
- CarbonDataProcessorUtil.getNoDictionaryMapping(configuration.getDataFields());
- dimensionWithComplexCount = configuration.getDimensionCount();
- measureCount = configuration.getMeasureCount();
- dataFields = configuration.getDataFields();
- measureDataTypes = new DataType[measureCount];
- for (int i = 0; i < measureCount; i++) {
- measureDataTypes[i] =
- dataFields[dimensionWithComplexCount + i].getColumn().getDataType();
- }
-
- // initialize parser and converter
- rowParser = new RowParserImpl(dataFields, configuration);
- badRecordLogger = BadRecordsLoggerProvider.createBadRecordLogger(configuration);
- converter = new RowConverterImpl(configuration.getDataFields(), configuration, badRecordLogger);
- configuration.setCardinalityFinder(converter);
- converter.initialize();
-
- // initialize encoder
- nullBitSet = new BitSet(dataFields.length);
- int rowBufferSize = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_ENCODER_ROW_BUFFER_SIZE,
- CarbonStreamOutputFormat.CARBON_ENCODER_ROW_BUFFER_SIZE_DEFAULT);
- output = new StreamBlockletWriter(maxCacheSize, maxRowNums, rowBufferSize);
-
- // initialize data writer
- String filePath = segmentDir + File.separator + fileName;
- FileFactory.FileType fileType = FileFactory.getFileType(filePath);
- CarbonFile carbonFile = FileFactory.getCarbonFile(filePath, fileType);
- if (carbonFile.exists()) {
- // if the file is existed, use the append api
- outputStream = FileFactory.getDataOutputStreamUsingAppend(filePath, fileType);
- } else {
- // IF the file is not existed, use the create api
- outputStream = FileFactory.getDataOutputStream(filePath, fileType);
- writeFileHeader();
- }
-
- isFirstRow = false;
- }
-
- @Override public void write(Void key, Object value) throws IOException, InterruptedException {
- if (isFirstRow) {
- initializeAtFirstRow();
- }
-
- // parse and convert row
- currentRow.setData(rowParser.parseRow((Object[]) value));
- converter.convert(currentRow);
-
- // null bit set
- nullBitSet.clear();
- for (int i = 0; i < dataFields.length; i++) {
- if (null == currentRow.getObject(i)) {
- nullBitSet.set(i);
- }
- }
- output.nextRow();
- byte[] b = nullBitSet.toByteArray();
- output.writeShort(b.length);
- if (b.length > 0) {
- output.writeBytes(b);
- }
- int dimCount = 0;
- Object columnValue;
-
- // primitive type dimension
- for (; dimCount < isNoDictionaryDimensionColumn.length; dimCount++) {
- columnValue = currentRow.getObject(dimCount);
- if (null != columnValue) {
- if (isNoDictionaryDimensionColumn[dimCount]) {
- byte[] col = (byte[]) columnValue;
- output.writeShort(col.length);
- output.writeBytes(col);
- } else {
- output.writeInt((int) columnValue);
- }
- }
- }
- // complex type dimension
- for (; dimCount < dimensionWithComplexCount; dimCount++) {
- columnValue = currentRow.getObject(dimCount);
- if (null != columnValue) {
- byte[] col = (byte[]) columnValue;
- output.writeShort(col.length);
- output.writeBytes(col);
- }
- }
- // measure
- DataType dataType;
- for (int msrCount = 0; msrCount < measureCount; msrCount++) {
- columnValue = currentRow.getObject(dimCount + msrCount);
- if (null != columnValue) {
- dataType = measureDataTypes[msrCount];
- if (dataType == DataTypes.BOOLEAN) {
- output.writeBoolean((boolean) columnValue);
- } else if (dataType == DataTypes.SHORT) {
- output.writeShort((short) columnValue);
- } else if (dataType == DataTypes.INT) {
- output.writeInt((int) columnValue);
- } else if (dataType == DataTypes.LONG) {
- output.writeLong((long) columnValue);
- } else if (dataType == DataTypes.DOUBLE) {
- output.writeDouble((double) columnValue);
- } else if (DataTypes.isDecimal(dataType)) {
- BigDecimal val = (BigDecimal) columnValue;
- byte[] bigDecimalInBytes = DataTypeUtil.bigDecimalToByte(val);
- output.writeShort(bigDecimalInBytes.length);
- output.writeBytes(bigDecimalInBytes);
- } else {
- String msg =
- "unsupported data type:" + dataFields[dimCount + msrCount].getColumn().getDataType()
- .getName();
- LOGGER.error(msg);
- throw new IOException(msg);
- }
- }
- }
-
- if (output.isFull()) {
- appendBlockletToDataFile();
- }
- }
-
- private void writeFileHeader() throws IOException {
- List<ColumnSchema> wrapperColumnSchemaList = CarbonUtil
- .getColumnSchemaList(carbonTable.getDimensionByTableName(carbonTable.getTableName()),
- carbonTable.getMeasureByTableName(carbonTable.getTableName()));
- int[] dimLensWithComplex = new int[wrapperColumnSchemaList.size()];
- for (int i = 0; i < dimLensWithComplex.length; i++) {
- dimLensWithComplex[i] = Integer.MAX_VALUE;
- }
- int[] dictionaryColumnCardinality =
- CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList);
- List<Integer> cardinality = new ArrayList<>();
- List<org.apache.carbondata.format.ColumnSchema> columnSchemaList = AbstractFactDataWriter
- .getColumnSchemaListAndCardinality(cardinality, dictionaryColumnCardinality,
- wrapperColumnSchemaList);
- FileHeader fileHeader =
- CarbonMetadataUtil.getFileHeader(true, columnSchemaList, System.currentTimeMillis());
- fileHeader.setIs_footer_present(false);
- fileHeader.setIs_splitable(true);
- fileHeader.setSync_marker(CarbonStreamOutputFormat.CARBON_SYNC_MARKER);
- outputStream.write(CarbonUtil.getByteArray(fileHeader));
- }
-
- /**
- * write a blocklet to file
- */
- private void appendBlockletToDataFile() throws IOException {
- if (output.getRowIndex() == -1) {
- return;
- }
- output.apppendBlocklet(outputStream);
- outputStream.flush();
- // reset data
- output.reset();
- }
-
- @Override public void close(TaskAttemptContext context) throws IOException, InterruptedException {
- try {
- // append remain buffer data
- if (!hasException && !isFirstRow) {
- appendBlockletToDataFile();
- converter.finish();
- }
- } finally {
- // close resource
- CarbonUtil.closeStreams(outputStream);
- if (output != null) {
- output.close();
- }
- if (badRecordLogger != null) {
- badRecordLogger.closeStreams();
- }
- }
- }
-
- public String getSegmentDir() {
- return segmentDir;
- }
-
- public String getFileName() {
- return fileName;
- }
-
- public void setHasException(boolean hasException) {
- this.hasException = hasException;
- }
-}
\ No newline at end of file
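On the write path, each value handed to the writer is a raw Object[] row that is parsed, converted and buffered until a blocklet is full. A hedged usage sketch, not part of this commit and assuming the same write/close/setHasException signatures after the move; the task context is expected to carry the configuration prepared as shown earlier:

import org.apache.carbondata.streaming.CarbonStreamOutputFormat;
import org.apache.carbondata.streaming.CarbonStreamRecordWriter;

import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class StreamWriteExample {
  // Writes a batch of raw rows into the current streaming segment.
  public static void writeRows(TaskAttemptContext context, Object[][] rows) throws Exception {
    RecordWriter<Void, Object> writer = new CarbonStreamOutputFormat().getRecordWriter(context);
    try {
      for (Object[] row : rows) {
        writer.write(null, row);          // the key is unused; the value is one unparsed row
      }
    } catch (Exception e) {
      if (writer instanceof CarbonStreamRecordWriter) {
        // skip flushing the partially filled blocklet on close after a failure
        ((CarbonStreamRecordWriter) writer).setHasException(true);
      }
      throw e;
    } finally {
      writer.close(context);              // appends any remaining buffered rows unless flagged
    }
  }
}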
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletReader.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletReader.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletReader.java
deleted file mode 100644
index 1989198..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletReader.java
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.carbondata.core.datastore.compression.Compressor;
-import org.apache.carbondata.core.datastore.compression.CompressorFactory;
-import org.apache.carbondata.core.util.CarbonUtil;
-import org.apache.carbondata.format.BlockletHeader;
-
-/**
- * stream blocklet reader
- */
-public class StreamBlockletReader {
-
- private byte[] buffer;
- private int offset;
- private final byte[] syncMarker;
- private final byte[] syncBuffer;
- private final int syncLen;
- private long pos = 0;
- private final InputStream in;
- private final long limitStart;
- private final long limitEnd;
- private boolean isAlreadySync = false;
- private Compressor compressor = CompressorFactory.getInstance().getCompressor();
- private int rowNums = 0;
- private int rowIndex = 0;
- private boolean isHeaderPresent;
-
- StreamBlockletReader(byte[] syncMarker, InputStream in, long limit, boolean isHeaderPresent) {
- this.syncMarker = syncMarker;
- syncLen = syncMarker.length;
- syncBuffer = new byte[syncLen];
- this.in = in;
- limitStart = limit;
- limitEnd = limitStart + syncLen;
- this.isHeaderPresent = isHeaderPresent;
- }
-
- private void ensureCapacity(int capacity) {
- if (buffer == null || capacity > buffer.length) {
- buffer = new byte[capacity];
- }
- }
-
- /**
- * find the first position of sync_marker in input stream
- */
- private boolean sync() throws IOException {
- if (!readBytesFromStream(syncBuffer, 0, syncLen)) {
- return false;
- }
- boolean skipHeader = false;
- for (int i = 0; i < limitStart; i++) {
- int j = 0;
- for (; j < syncLen; j++) {
- if (syncMarker[j] != syncBuffer[(i + j) % syncLen]) break;
- }
- if (syncLen == j) {
- if (isHeaderPresent) {
- if (skipHeader) {
- return true;
- } else {
- skipHeader = true;
- }
- } else {
- return true;
- }
- }
- int value = in.read();
- if (-1 == value) {
- return false;
- }
- syncBuffer[i % syncLen] = (byte) value;
- pos++;
- }
- return false;
- }
-
- BlockletHeader readBlockletHeader() throws IOException {
- int len = readIntFromStream();
- byte[] b = new byte[len];
- if (!readBytesFromStream(b, 0, len)) {
- throw new EOFException("Failed to read blocklet header");
- }
- BlockletHeader header = CarbonUtil.readBlockletHeader(b);
- rowNums = header.getBlocklet_info().getNum_rows();
- rowIndex = 0;
- return header;
- }
-
- void readBlockletData(BlockletHeader header) throws IOException {
- ensureCapacity(header.getBlocklet_length());
- offset = 0;
- int len = readIntFromStream();
- byte[] b = new byte[len];
- if (!readBytesFromStream(b, 0, len)) {
- throw new EOFException("Failed to read blocklet data");
- }
- compressor.rawUncompress(b, buffer);
- }
-
- void skipBlockletData(boolean reset) throws IOException {
- int len = readIntFromStream();
- skip(len);
- pos += len;
- if (reset) {
- this.rowNums = 0;
- this.rowIndex = 0;
- }
- }
-
- private void skip(int len) throws IOException {
- long remaining = len;
- do {
- long skipLen = in.skip(remaining);
- remaining -= skipLen;
- } while (remaining > 0);
- }
-
- /**
- * find the next blocklet
- */
- boolean nextBlocklet() throws IOException {
- if (pos >= limitStart) {
- return false;
- }
- if (isAlreadySync) {
- if (!readBytesFromStream(syncBuffer, 0, syncLen)) {
- return false;
- }
- } else {
- isAlreadySync = true;
- if (!sync()) {
- return false;
- }
- }
-
- return pos < limitEnd;
- }
-
- boolean hasNext() throws IOException {
- return rowIndex < rowNums;
- }
-
- void nextRow() {
- rowIndex++;
- }
-
- int readIntFromStream() throws IOException {
- int ch1 = in.read();
- int ch2 = in.read();
- int ch3 = in.read();
- int ch4 = in.read();
- if ((ch1 | ch2 | ch3 | ch4) < 0) throw new EOFException();
- pos += 4;
- return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
- }
-
- /**
- * Reads <code>len</code> bytes of data from the input stream into
- * an array of bytes.
- * @return <code>true</code> if reading data successfully, or
- * <code>false</code> if there is no more data because the end of the stream has been reached.
- */
- boolean readBytesFromStream(byte[] b, int offset, int len) throws IOException {
- int readLen = in.read(b, offset, len);
- if (readLen < 0) {
- return false;
- }
- pos += readLen;
- if (readLen < len) {
- return readBytesFromStream(b, offset + readLen, len - readLen);
- } else {
- return true;
- }
- }
-
- boolean readBoolean() {
- return (buffer[offset++]) != 0;
- }
-
- short readShort() {
- short v = (short) ((buffer[offset + 1] & 255) +
- ((buffer[offset]) << 8));
- offset += 2;
- return v;
- }
-
- byte[] copy(int len) {
- byte[] b = new byte[len];
- System.arraycopy(buffer, offset, b, 0, len);
- return b;
- }
-
- int readInt() {
- int v = ((buffer[offset + 3] & 255) +
- ((buffer[offset + 2] & 255) << 8) +
- ((buffer[offset + 1] & 255) << 16) +
- ((buffer[offset]) << 24));
- offset += 4;
- return v;
- }
-
- long readLong() {
- long v = ((long)(buffer[offset + 7] & 255)) +
- ((long) (buffer[offset + 6] & 255) << 8) +
- ((long) (buffer[offset + 5] & 255) << 16) +
- ((long) (buffer[offset + 4] & 255) << 24) +
- ((long) (buffer[offset + 3] & 255) << 32) +
- ((long) (buffer[offset + 2] & 255) << 40) +
- ((long) (buffer[offset + 1] & 255) << 48) +
- ((long) (buffer[offset]) << 56);
- offset += 8;
- return v;
- }
-
- double readDouble() {
- return Double.longBitsToDouble(readLong());
- }
-
- byte[] readBytes(int len) {
- byte[] b = new byte[len];
- System.arraycopy(buffer, offset, b, 0, len);
- offset += len;
- return b;
- }
-
- void skipBytes(int len) {
- offset += len;
- }
-
- int getRowNums() {
- return rowNums;
- }
-
- void close() {
- CarbonUtil.closeStreams(in);
- }
-}
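
The sync() routine above locates the first blocklet boundary of a split by scanning the input stream for the sync marker. A simplified standalone sketch of the same idea, using a rolling window instead of the modular buffer used above (header skipping and split limits are omitted):

import java.io.{ByteArrayInputStream, InputStream}

object SyncMarkerSketch {
  // returns the stream position just after the first occurrence of marker, or -1
  def findAfterMarker(in: InputStream, marker: Array[Byte]): Long = {
    val window = new Array[Byte](marker.length)
    var filled = 0
    var pos = 0L
    var b = in.read()
    while (b != -1) {
      // shift the window left by one byte and append the new byte
      System.arraycopy(window, 1, window, 0, marker.length - 1)
      window(marker.length - 1) = b.toByte
      filled = math.min(filled + 1, marker.length)
      pos += 1
      if (filled == marker.length && java.util.Arrays.equals(window, marker)) {
        return pos
      }
      b = in.read()
    }
    -1L
  }

  def main(args: Array[String]): Unit = {
    val marker = Array[Byte](0x0F, 0x0E)
    val data = Array[Byte](1, 2, 0x0F, 0x0E, 9, 9)
    println(findAfterMarker(new ByteArrayInputStream(data), marker)) // 4
  }
}
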
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletWriter.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletWriter.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletWriter.java
deleted file mode 100644
index a0328b3..0000000
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/streaming/StreamBlockletWriter.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.hadoop.streaming;
-
-import java.io.DataOutputStream;
-import java.io.IOException;
-
-import org.apache.carbondata.core.datastore.compression.Compressor;
-import org.apache.carbondata.core.datastore.compression.CompressorFactory;
-import org.apache.carbondata.core.util.CarbonUtil;
-import org.apache.carbondata.format.BlockletHeader;
-import org.apache.carbondata.format.BlockletInfo;
-import org.apache.carbondata.format.MutationType;
-
-/**
- * stream blocklet writer
- */
-public class StreamBlockletWriter {
- private byte[] buffer;
- private int maxSize;
- private int maxRowNum;
- private int rowSize;
- private int count = 0;
- private int rowIndex = -1;
- private Compressor compressor = CompressorFactory.getInstance().getCompressor();
-
- StreamBlockletWriter(int maxSize, int maxRowNum, int rowSize) {
- buffer = new byte[maxSize];
- this.maxSize = maxSize;
- this.maxRowNum = maxRowNum;
- this.rowSize = rowSize;
- }
-
- private void ensureCapacity(int space) {
- int newcount = space + count;
- if (newcount > buffer.length) {
- byte[] newbuf = new byte[Math.max(newcount, buffer.length + rowSize)];
- System.arraycopy(buffer, 0, newbuf, 0, count);
- buffer = newbuf;
- }
- }
-
- void reset() {
- count = 0;
- rowIndex = -1;
- }
-
- byte[] getBytes() {
- return buffer;
- }
-
- int getCount() {
- return count;
- }
-
- int getRowIndex() {
- return rowIndex;
- }
-
- void nextRow() {
- rowIndex++;
- }
-
- boolean isFull() {
- return rowIndex == maxRowNum || count >= maxSize;
- }
-
- void writeBoolean(boolean val) {
- ensureCapacity(1);
- buffer[count] = (byte) (val ? 1 : 0);
- count += 1;
- }
-
- void writeShort(int val) {
- ensureCapacity(2);
- buffer[count + 1] = (byte) (val);
- buffer[count] = (byte) (val >>> 8);
- count += 2;
- }
-
- void writeInt(int val) {
- ensureCapacity(4);
- buffer[count + 3] = (byte) (val);
- buffer[count + 2] = (byte) (val >>> 8);
- buffer[count + 1] = (byte) (val >>> 16);
- buffer[count] = (byte) (val >>> 24);
- count += 4;
- }
-
- void writeLong(long val) {
- ensureCapacity(8);
- buffer[count + 7] = (byte) (val);
- buffer[count + 6] = (byte) (val >>> 8);
- buffer[count + 5] = (byte) (val >>> 16);
- buffer[count + 4] = (byte) (val >>> 24);
- buffer[count + 3] = (byte) (val >>> 32);
- buffer[count + 2] = (byte) (val >>> 40);
- buffer[count + 1] = (byte) (val >>> 48);
- buffer[count] = (byte) (val >>> 56);
- count += 8;
- }
-
- void writeDouble(double val) {
- writeLong(Double.doubleToLongBits(val));
- }
-
- void writeBytes(byte[] b) {
- writeBytes(b, 0, b.length);
- }
-
- void writeBytes(byte[] b, int off, int len) {
- ensureCapacity(len);
- System.arraycopy(b, off, buffer, count, len);
- count += len;
- }
-
- void apppendBlocklet(DataOutputStream outputStream) throws IOException {
- outputStream.write(CarbonStreamOutputFormat.CARBON_SYNC_MARKER);
-
- BlockletInfo blockletInfo = new BlockletInfo();
- blockletInfo.setNum_rows(getRowIndex() + 1);
- BlockletHeader blockletHeader = new BlockletHeader();
- blockletHeader.setBlocklet_length(getCount());
- blockletHeader.setMutation(MutationType.INSERT);
- blockletHeader.setBlocklet_info(blockletInfo);
- byte[] headerBytes = CarbonUtil.getByteArray(blockletHeader);
- outputStream.writeInt(headerBytes.length);
- outputStream.write(headerBytes);
-
- byte[] compressed = compressor.compressByte(getBytes(), getCount());
- outputStream.writeInt(compressed.length);
- outputStream.write(compressed);
- }
-
- void close() {
- }
-}
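
apppendBlocklet above fixes the on-disk record layout of a streaming blocklet: the sync marker, a 4-byte length plus the Thrift-serialized BlockletHeader, then a 4-byte length plus the compressed row data. A minimal sketch of that layout, with placeholder byte arrays standing in for CARBON_SYNC_MARKER, the Thrift header and the compressor output:

import java.io.{ByteArrayOutputStream, DataOutputStream}

object BlockletLayoutSketch {
  def appendBlocklet(out: DataOutputStream,
      syncMarker: Array[Byte],
      headerBytes: Array[Byte],
      compressedData: Array[Byte]): Unit = {
    out.write(syncMarker)                 // blocklet boundary
    out.writeInt(headerBytes.length)      // length-prefixed blocklet header
    out.write(headerBytes)
    out.writeInt(compressedData.length)   // length-prefixed compressed body
    out.write(compressedData)
  }

  def main(args: Array[String]): Unit = {
    val buffer = new ByteArrayOutputStream()
    appendBlocklet(new DataOutputStream(buffer),
      Array[Byte](0x0F, 0x0E), Array[Byte](1, 2, 3), Array[Byte](4, 5))
    println(s"record size = ${buffer.size()} bytes") // 2 + 4 + 3 + 4 + 2 = 15
  }
}
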
[3/5] carbondata git commit: [CARBONDATA-2165] Remove spark in carbon-hadoop module
Posted by ja...@apache.org.
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/StreamHandoffRDD.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/StreamHandoffRDD.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/StreamHandoffRDD.scala
new file mode 100644
index 0000000..bf5f660
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/StreamHandoffRDD.scala
@@ -0,0 +1,435 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.spark.rdd
+
+import java.text.SimpleDateFormat
+import java.util
+import java.util.Date
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptID, TaskType}
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.datamap.Segment
+import org.apache.carbondata.core.datastore.block.SegmentProperties
+import org.apache.carbondata.core.datastore.impl.FileFactory
+import org.apache.carbondata.core.locks.{CarbonLockFactory, LockUsage}
+import org.apache.carbondata.core.metadata.CarbonMetadata
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+import org.apache.carbondata.core.scan.result.iterator.RawResultIterator
+import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager}
+import org.apache.carbondata.core.util.CarbonUtil
+import org.apache.carbondata.core.util.path.CarbonTablePath
+import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
+import org.apache.carbondata.hadoop.{CarbonInputSplit, CarbonProjection}
+import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
+import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostStatusUpdateEvent, LoadTablePreStatusUpdateEvent}
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel
+import org.apache.carbondata.processing.merger.{CompactionResultSortProcessor, CompactionType}
+import org.apache.carbondata.processing.util.CarbonLoaderUtil
+import org.apache.carbondata.spark.{HandoffResult, HandoffResultImpl}
+import org.apache.carbondata.spark.util.CommonUtil
+import org.apache.carbondata.streaming.{CarbonStreamInputFormat, CarbonStreamRecordReader}
+
+
+/**
+ * partition of the handoff segment
+ */
+class HandoffPartition(
+ val rddId: Int,
+ val idx: Int,
+ @transient val inputSplit: CarbonInputSplit
+) extends Partition {
+
+ val split = new SerializableWritable[CarbonInputSplit](inputSplit)
+
+ override val index: Int = idx
+
+ override def hashCode(): Int = 41 * (41 + rddId) + idx
+}
+
+/**
+ * wrap the record reader of the handoff segment into a RawResultIterator
+ */
+class StreamingRawResultIterator(
+ recordReader: CarbonStreamRecordReader
+) extends RawResultIterator(null, null, null) {
+
+ override def hasNext: Boolean = {
+ recordReader.nextKeyValue()
+ }
+
+ override def next(): Array[Object] = {
+ val rowTmp = recordReader
+ .getCurrentValue
+ .asInstanceOf[GenericInternalRow]
+ .values
+ .asInstanceOf[Array[Object]]
+ val row = new Array[Object](rowTmp.length)
+ System.arraycopy(rowTmp, 0, row, 0, rowTmp.length)
+ row
+ }
+}
+
+/**
+ * execute streaming segment handoff
+ */
+class StreamHandoffRDD[K, V](
+ sc: SparkContext,
+ result: HandoffResult[K, V],
+ carbonLoadModel: CarbonLoadModel,
+ handOffSegmentId: String
+) extends CarbonRDD[(K, V)](sc, Nil, sc.hadoopConfiguration) {
+
+ private val jobTrackerId: String = {
+ val formatter = new SimpleDateFormat("yyyyMMddHHmm")
+ formatter.format(new Date())
+ }
+
+ override def internalCompute(
+ split: Partition,
+ context: TaskContext
+ ): Iterator[(K, V)] = {
+ carbonLoadModel.setTaskNo("" + split.index)
+ val carbonTable = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable
+ CarbonMetadata.getInstance().addCarbonTable(carbonTable)
+ // the input iterator is using raw row
+ val iteratorList = prepareInputIterator(split, carbonTable)
+
+ CommonUtil.setTempStoreLocation(split.index, carbonLoadModel, true, false)
+ // use CompactionResultSortProcessor to sort data and write to columnar files
+ val processor = prepareHandoffProcessor(carbonTable)
+ val status = processor.execute(iteratorList)
+
+ new Iterator[(K, V)] {
+ private var finished = false
+
+ override def hasNext: Boolean = {
+ !finished
+ }
+
+ override def next(): (K, V) = {
+ finished = true
+ result.getKey("" + split.index, status)
+ }
+ }
+ }
+
+ /**
+ * prepare the input iterator based on CarbonStreamRecordReader
+ */
+ private def prepareInputIterator(
+ split: Partition,
+ carbonTable: CarbonTable
+ ): util.ArrayList[RawResultIterator] = {
+ val inputSplit = split.asInstanceOf[HandoffPartition].split.value
+ val attemptId = new TaskAttemptID(jobTrackerId, id, TaskType.MAP, split.index, 0)
+ val hadoopConf = new Configuration()
+ CarbonInputFormat.setDatabaseName(hadoopConf, carbonTable.getDatabaseName)
+ CarbonInputFormat.setTableName(hadoopConf, carbonTable.getTableName)
+ CarbonInputFormat.setTablePath(hadoopConf, carbonTable.getTablePath)
+ val projection = new CarbonProjection
+ val dataFields = carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName)
+ (0 until dataFields.size()).foreach { index =>
+ projection.addColumn(dataFields.get(index).getColName)
+ }
+ CarbonInputFormat.setColumnProjection(hadoopConf, projection)
+ CarbonInputFormat.setTableInfo(hadoopConf, carbonTable.getTableInfo)
+ val attemptContext = new TaskAttemptContextImpl(hadoopConf, attemptId)
+ val format = new CarbonTableInputFormat[Array[Object]]()
+ val model = format.createQueryModel(inputSplit, attemptContext)
+ val inputFormat = new CarbonStreamInputFormat
+ val streamReader = inputFormat.createRecordReader(inputSplit, attemptContext)
+ .asInstanceOf[CarbonStreamRecordReader]
+ streamReader.setVectorReader(false)
+ streamReader.setQueryModel(model)
+ streamReader.setUseRawRow(true)
+ streamReader.initialize(inputSplit, attemptContext)
+ val iteratorList = new util.ArrayList[RawResultIterator](1)
+ iteratorList.add(new StreamingRawResultIterator(streamReader))
+ iteratorList
+ }
+
+ private def prepareHandoffProcessor(
+ carbonTable: CarbonTable
+ ): CompactionResultSortProcessor = {
+ val wrapperColumnSchemaList = CarbonUtil.getColumnSchemaList(
+ carbonTable.getDimensionByTableName(carbonTable.getTableName),
+ carbonTable.getMeasureByTableName(carbonTable.getTableName))
+ val dimLensWithComplex =
+ (0 until wrapperColumnSchemaList.size()).map(_ => Integer.MAX_VALUE).toArray
+ val dictionaryColumnCardinality =
+ CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList)
+ val segmentProperties =
+ new SegmentProperties(wrapperColumnSchemaList, dictionaryColumnCardinality)
+
+ new CompactionResultSortProcessor(
+ carbonLoadModel,
+ carbonTable,
+ segmentProperties,
+ CompactionType.STREAMING,
+ carbonTable.getTableName,
+ null
+ )
+ }
+
+ /**
+ * get the partitions of the handoff segment
+ */
+ override protected def getPartitions: Array[Partition] = {
+ val job = Job.getInstance(FileFactory.getConfiguration)
+ val inputFormat = new CarbonTableInputFormat[Array[Object]]()
+ val segmentList = new util.ArrayList[Segment](1)
+ segmentList.add(Segment.toSegment(handOffSegmentId))
+ val splits = inputFormat.getSplitsOfStreaming(
+ job,
+ carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getAbsoluteTableIdentifier,
+ segmentList
+ )
+
+ (0 until splits.size()).map { index =>
+ new HandoffPartition(id, index, splits.get(index).asInstanceOf[CarbonInputSplit])
+ }.toArray[Partition]
+ }
+}
+
+object StreamHandoffRDD {
+
+ private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+
+ def iterateStreamingHandoff(
+ carbonLoadModel: CarbonLoadModel,
+ sparkSession: SparkSession
+ ): Unit = {
+ val carbonTable = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable
+ val identifier = carbonTable.getAbsoluteTableIdentifier
+ var continueHandoff = false
+ // require handoff lock on table
+ val lock = CarbonLockFactory.getCarbonLockObj(identifier, LockUsage.HANDOFF_LOCK)
+ try {
+ if (lock.lockWithRetries()) {
+ LOGGER.info("Acquired the handoff lock for table" +
+ s" ${ carbonTable.getDatabaseName }.${ carbonTable.getTableName }")
+ // handoff streaming segment one by one
+ do {
+ val segmentStatusManager = new SegmentStatusManager(identifier)
+ var loadMetadataDetails: Array[LoadMetadataDetails] = null
+ // lock table to read table status file
+ val statusLock = segmentStatusManager.getTableStatusLock
+ try {
+ if (statusLock.lockWithRetries()) {
+ loadMetadataDetails = SegmentStatusManager.readLoadMetadata(
+ CarbonTablePath.getMetadataPath(identifier.getTablePath))
+ }
+ } finally {
+ if (null != statusLock) {
+ statusLock.unlock()
+ }
+ }
+ if (null != loadMetadataDetails) {
+ val streamSegments =
+ loadMetadataDetails.filter(_.getSegmentStatus == SegmentStatus.STREAMING_FINISH)
+
+ continueHandoff = streamSegments.length > 0
+ if (continueHandoff) {
+ // handoff a streaming segment
+ val loadMetadataDetail = streamSegments(0)
+ executeStreamingHandoff(
+ carbonLoadModel,
+ sparkSession,
+ loadMetadataDetail.getLoadName
+ )
+ }
+ } else {
+ continueHandoff = false
+ }
+ } while (continueHandoff)
+ }
+ } finally {
+ if (null != lock) {
+ lock.unlock()
+ }
+ }
+ }
+
+ /**
+ * start new thread to execute stream segment handoff
+ */
+ def startStreamingHandoffThread(
+ carbonLoadModel: CarbonLoadModel,
+ sparkSession: SparkSession,
+ isDDL: Boolean
+ ): Unit = {
+ if (isDDL) {
+ iterateStreamingHandoff(carbonLoadModel, sparkSession)
+ } else {
+ // start a new thread to execute streaming segment handoff
+ val handoffThread = new Thread() {
+ override def run(): Unit = {
+ iterateStreamingHandoff(carbonLoadModel, sparkSession)
+ }
+ }
+ handoffThread.start()
+ }
+ }
+
+ /**
+ * invoke StreamHandoffRDD to handoff a streaming segment to a columnar segment
+ */
+ def executeStreamingHandoff(
+ carbonLoadModel: CarbonLoadModel,
+ sparkSession: SparkSession,
+ handoffSegmenId: String
+ ): Unit = {
+ var loadStatus = SegmentStatus.SUCCESS
+ var errorMessage: String = "Handoff failure"
+ try {
+ // generate new columnar segment
+ val newMetaEntry = new LoadMetadataDetails
+ carbonLoadModel.setFactTimeStamp(System.currentTimeMillis())
+ CarbonLoaderUtil.populateNewLoadMetaEntry(
+ newMetaEntry,
+ SegmentStatus.INSERT_IN_PROGRESS,
+ carbonLoadModel.getFactTimeStamp,
+ false)
+ val operationContext = new OperationContext()
+ val loadTablePreStatusUpdateEvent: LoadTablePreStatusUpdateEvent =
+ new LoadTablePreStatusUpdateEvent(
+ carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCarbonTableIdentifier,
+ carbonLoadModel)
+ OperationListenerBus.getInstance().fireEvent(loadTablePreStatusUpdateEvent, operationContext)
+
+ CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, carbonLoadModel, true, false)
+ val loadTablePostStatusUpdateEvent: LoadTablePostStatusUpdateEvent =
+ new LoadTablePostStatusUpdateEvent(carbonLoadModel)
+ OperationListenerBus.getInstance()
+ .fireEvent(loadTablePostStatusUpdateEvent, operationContext)
+ // convert a streaming segment to columnar segment
+ val status = new StreamHandoffRDD(
+ sparkSession.sparkContext,
+ new HandoffResultImpl(),
+ carbonLoadModel,
+ handoffSegmenId).collect()
+
+ status.foreach { x =>
+ if (!x._2) {
+ loadStatus = SegmentStatus.LOAD_FAILURE
+ }
+ }
+ } catch {
+ case ex: Exception =>
+ loadStatus = SegmentStatus.LOAD_FAILURE
+ errorMessage = errorMessage + ": " + ex.getCause.getMessage
+ LOGGER.error(errorMessage)
+ LOGGER.error(ex, s"Handoff failed on streaming segment $handoffSegmenId")
+ }
+
+ if (loadStatus == SegmentStatus.LOAD_FAILURE) {
+ CarbonLoaderUtil.updateTableStatusForFailure(carbonLoadModel)
+ LOGGER.info("********starting clean up**********")
+ CarbonLoaderUtil.deleteSegment(carbonLoadModel, carbonLoadModel.getSegmentId.toInt)
+ LOGGER.info("********clean up done**********")
+ LOGGER.audit(s"Handoff is failed for " +
+ s"${ carbonLoadModel.getDatabaseName }.${ carbonLoadModel.getTableName }")
+ LOGGER.error("Cannot write load metadata file as handoff failed")
+ throw new Exception(errorMessage)
+ }
+
+ if (loadStatus == SegmentStatus.SUCCESS) {
+ val done = updateLoadMetadata(handoffSegmenId, carbonLoadModel)
+ if (!done) {
+ val errorMessage = "Handoff failed due to failure in table status updation."
+ LOGGER.audit("Handoff is failed for " +
+ s"${ carbonLoadModel.getDatabaseName }.${ carbonLoadModel.getTableName }")
+ LOGGER.error("Handoff failed due to failure in table status updation.")
+ throw new Exception(errorMessage)
+ }
+ done
+ }
+
+ }
+
+ /**
+ * update streaming segment and new columnar segment
+ */
+ private def updateLoadMetadata(
+ handoffSegmentId: String,
+ loadModel: CarbonLoadModel
+ ): Boolean = {
+ var status = false
+ val metaDataFilepath = loadModel.getCarbonDataLoadSchema.getCarbonTable.getMetadataPath
+ val identifier = loadModel.getCarbonDataLoadSchema.getCarbonTable.getAbsoluteTableIdentifier
+ val metadataPath = CarbonTablePath.getMetadataPath(identifier.getTablePath)
+ val fileType = FileFactory.getFileType(metadataPath)
+ if (!FileFactory.isFileExist(metadataPath, fileType)) {
+ FileFactory.mkdirs(metadataPath, fileType)
+ }
+ val tableStatusPath = CarbonTablePath.getTableStatusFilePath(identifier.getTablePath)
+ val segmentStatusManager = new SegmentStatusManager(identifier)
+ val carbonLock = segmentStatusManager.getTableStatusLock
+ try {
+ if (carbonLock.lockWithRetries()) {
+ LOGGER.info(
+ "Acquired lock for table" + loadModel.getDatabaseName() + "." + loadModel.getTableName()
+ + " for table status updation")
+ val listOfLoadFolderDetailsArray =
+ SegmentStatusManager.readLoadMetadata(metaDataFilepath)
+
+ // update new columnar segment to success status
+ val newSegment =
+ listOfLoadFolderDetailsArray.find(_.getLoadName.equals(loadModel.getSegmentId))
+ if (newSegment.isEmpty) {
+ throw new Exception("Failed to update table status for new segment")
+ } else {
+ newSegment.get.setSegmentStatus(SegmentStatus.SUCCESS)
+ newSegment.get.setLoadEndTime(System.currentTimeMillis())
+ }
+
+ // update streaming segment to compacted status
+ val streamSegment =
+ listOfLoadFolderDetailsArray.find(_.getLoadName.equals(handoffSegmentId))
+ if (streamSegment.isEmpty) {
+ throw new Exception("Failed to update table status for streaming segment")
+ } else {
+ streamSegment.get.setSegmentStatus(SegmentStatus.COMPACTED)
+ streamSegment.get.setMergedLoadName(loadModel.getSegmentId)
+ }
+
+ // refresh table status file
+ SegmentStatusManager.writeLoadDetailsIntoFile(tableStatusPath, listOfLoadFolderDetailsArray)
+ status = true
+ } else {
+ LOGGER.error("Not able to acquire the lock for Table status updation for table " + loadModel
+ .getDatabaseName() + "." + loadModel.getTableName())
+ }
+ } finally {
+ if (carbonLock.unlock()) {
+ LOGGER.info("Table unlocked successfully after table status updation" +
+ loadModel.getDatabaseName() + "." + loadModel.getTableName())
+ } else {
+ LOGGER.error("Unable to unlock Table lock for table" + loadModel.getDatabaseName() +
+ "." + loadModel.getTableName() + " during table status updation")
+ }
+ }
+ status
+ }
+}
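
A hedged usage sketch for the handoff entry points above, assuming a CarbonLoadModel for the stream table has already been built elsewhere (for example by the load command or the stream sink): a DDL code path can hand off synchronously, while a post-load path can run it on a background thread.

import org.apache.spark.sql.SparkSession
import org.apache.carbondata.processing.loading.model.CarbonLoadModel
import org.apache.carbondata.spark.rdd.StreamHandoffRDD

object HandoffUsageSketch {
  def handOff(spark: SparkSession, loadModel: CarbonLoadModel, fromDDL: Boolean): Unit = {
    if (fromDDL) {
      // blocks until no STREAMING_FINISH segment is left
      StreamHandoffRDD.iterateStreamingHandoff(loadModel, spark)
    } else {
      // returns immediately, handoff runs in a new thread
      StreamHandoffRDD.startStreamingHandoffThread(loadModel, spark, isDDL = false)
    }
  }
}
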
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
index 3250a53..5f55ef3 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
@@ -47,6 +47,7 @@ import org.apache.carbondata.processing.exception.DataLoadingException
import org.apache.carbondata.processing.loading.FailureCauses
import org.apache.carbondata.processing.loading.exception.CarbonDataLoadingException
import org.apache.carbondata.processing.util.CarbonDataProcessorUtil
+import org.apache.carbondata.streaming.parser.FieldConverter
object CarbonScalaUtil {
@@ -121,55 +122,8 @@ object CarbonScalaUtil {
timeStampFormat: SimpleDateFormat,
dateFormat: SimpleDateFormat,
level: Int = 1): String = {
- if (value == null) {
- serializationNullFormat
- } else {
- value match {
- case s: String => if (s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
- throw new Exception("Dataload failed, String length cannot exceed " +
- CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters")
- } else {
- s
- }
- case d: java.math.BigDecimal => d.toPlainString
- case i: java.lang.Integer => i.toString
- case d: java.lang.Double => d.toString
- case t: java.sql.Timestamp => timeStampFormat format t
- case d: java.sql.Date => dateFormat format d
- case b: java.lang.Boolean => b.toString
- case s: java.lang.Short => s.toString
- case f: java.lang.Float => f.toString
- case bs: Array[Byte] => new String(bs,
- Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET))
- case s: scala.collection.Seq[Any] =>
- val delimiter = if (level == 1) {
- delimiterLevel1
- } else {
- delimiterLevel2
- }
- val builder = new StringBuilder()
- s.foreach { x =>
- builder.append(getString(x, serializationNullFormat, delimiterLevel1,
- delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter)
- }
- builder.substring(0, builder.length - delimiter.length())
- case m: scala.collection.Map[Any, Any] =>
- throw new Exception("Unsupported data type: Map")
- case r: org.apache.spark.sql.Row =>
- val delimiter = if (level == 1) {
- delimiterLevel1
- } else {
- delimiterLevel2
- }
- val builder = new StringBuilder()
- for (i <- 0 until r.length) {
- builder.append(getString(r(i), serializationNullFormat, delimiterLevel1,
- delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter)
- }
- builder.substring(0, builder.length - delimiter.length())
- case other => other.toString
- }
- }
+ FieldConverter.objectToString(value, serializationNullFormat, delimiterLevel1,
+ delimiterLevel2, timeStampFormat, dateFormat, level)
}
/**
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
new file mode 100644
index 0000000..a99a1e8
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
+
+class CarbonSparkStreamingListener extends SparkListener {
+
+ /**
+ * When Spark Streaming App stops, remove all locks for stream table.
+ */
+ override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
+ CarbonStreamSparkStreaming.cleanAllLockAfterStop()
+ }
+}
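
To release the stream-table locks when a Spark Streaming application stops, the listener above would typically be registered on the SparkContext. A minimal sketch (the registration point is assumed, it is not part of this patch):

import org.apache.spark.sql.SparkSession
import org.apache.carbondata.streaming.CarbonSparkStreamingListener

object ListenerRegistrationSketch {
  def register(spark: SparkSession): Unit = {
    // on application end the listener calls CarbonStreamSparkStreaming.cleanAllLockAfterStop()
    spark.sparkContext.addSparkListener(new CarbonSparkStreamingListener)
  }
}
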
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
new file mode 100644
index 0000000..28f04b1
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import java.util
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
+import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, Sink}
+import org.apache.spark.streaming.Time
+
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.locks.{CarbonLockFactory, ICarbonLock, LockUsage}
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+
+/**
+ * Interface used to write stream data to stream table
+ * when integrating with Spark Streaming.
+ *
+ * NOTE: Current integration with Spark Streaming is an alpha feature.
+ */
+class CarbonStreamSparkStreamingWriter(val sparkSession: SparkSession,
+ val carbonTable: CarbonTable,
+ val configuration: Configuration) {
+
+ private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+
+ private var isInitialize: Boolean = false
+
+ private var lock: ICarbonLock = null
+ private var carbonAppendableStreamSink: Sink = null
+
+ /**
+ * Acquire the lock for the stream table
+ */
+ def lockStreamTable(): Unit = {
+ lock = CarbonLockFactory.getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier,
+ LockUsage.STREAMING_LOCK)
+ if (lock.lockWithRetries()) {
+ LOGGER.info("Acquired the lock for stream table: " +
+ carbonTable.getDatabaseName + "." +
+ carbonTable.getTableName)
+ } else {
+ LOGGER.error("Not able to acquire the lock for stream table:" +
+ carbonTable.getDatabaseName + "." + carbonTable.getTableName)
+ throw new InterruptedException(
+ "Not able to acquire the lock for stream table: " + carbonTable.getDatabaseName + "." +
+ carbonTable.getTableName)
+ }
+ }
+
+ /**
+ * unlock for stream table
+ */
+ def unLockStreamTable(): Unit = {
+ if (null != lock) {
+ lock.unlock()
+ LOGGER.info("unlock for stream table: " +
+ carbonTable.getDatabaseName + "." +
+ carbonTable.getTableName)
+ }
+ }
+
+ def initialize(): Unit = {
+ carbonAppendableStreamSink = StreamSinkFactory.createStreamTableSink(
+ sparkSession,
+ configuration,
+ carbonTable,
+ extraOptions.toMap).asInstanceOf[CarbonAppendableStreamSink]
+
+ lockStreamTable()
+
+ isInitialize = true
+ }
+
+ def writeStreamData(dataFrame: DataFrame, time: Time): Unit = {
+ if (!isInitialize) {
+ initialize()
+ }
+ carbonAppendableStreamSink.addBatch(time.milliseconds, dataFrame)
+ }
+
+ private val extraOptions = new scala.collection.mutable.HashMap[String, String]
+ private var mode: SaveMode = SaveMode.ErrorIfExists
+
+ this.option("dbName", carbonTable.getDatabaseName)
+ this.option("tableName", carbonTable.getTableName)
+
+ /**
+ * Specifies the behavior when data or table already exists. Options include:
+ * - `SaveMode.Overwrite`: overwrite the existing data.
+ * - `SaveMode.Append`: append the data.
+ * - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
+ * - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
+ */
+ def mode(saveMode: SaveMode): CarbonStreamSparkStreamingWriter = {
+ if (mode == SaveMode.ErrorIfExists) {
+ mode = saveMode
+ }
+ this
+ }
+
+ /**
+ * Specifies the behavior when data or table already exists. Options include:
+ * - `overwrite`: overwrite the existing data.
+ * - `append`: append the data.
+ * - `ignore`: ignore the operation (i.e. no-op).
+ * - `error or default`: default option, throw an exception at runtime.
+ */
+ def mode(saveMode: String): CarbonStreamSparkStreamingWriter = {
+ if (mode == SaveMode.ErrorIfExists) {
+ mode = saveMode.toLowerCase(util.Locale.ROOT) match {
+ case "overwrite" => SaveMode.Overwrite
+ case "append" => SaveMode.Append
+ case "ignore" => SaveMode.Ignore
+ case "error" | "default" => SaveMode.ErrorIfExists
+ case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
+ "Accepted save modes are 'overwrite', 'append', 'ignore', 'error'.")
+ }
+ }
+ this
+ }
+
+ /**
+ * Adds an output option
+ */
+ def option(key: String, value: String): CarbonStreamSparkStreamingWriter = {
+ if (!extraOptions.contains(key)) {
+ extraOptions += (key -> value)
+ }
+ this
+ }
+
+ /**
+ * Adds an output option
+ */
+ def option(key: String, value: Boolean): CarbonStreamSparkStreamingWriter =
+ option(key, value.toString)
+
+ /**
+ * Adds an output option
+ */
+ def option(key: String, value: Long): CarbonStreamSparkStreamingWriter =
+ option(key, value.toString)
+
+ /**
+ * Adds an output option
+ */
+ def option(key: String, value: Double): CarbonStreamSparkStreamingWriter =
+ option(key, value.toString)
+}
+
+object CarbonStreamSparkStreaming {
+
+ @transient private val tableMap =
+ new util.HashMap[String, CarbonStreamSparkStreamingWriter]()
+
+ def getTableMap: util.Map[String, CarbonStreamSparkStreamingWriter] = tableMap
+
+ /**
+ * remove all stream lock.
+ */
+ def cleanAllLockAfterStop(): Unit = {
+ tableMap.asScala.values.foreach { writer => writer.unLockStreamTable() }
+ tableMap.clear()
+ }
+}
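
A hedged usage sketch for DStream ingestion through the writer above, assuming the CarbonTable and a Hadoop Configuration for an existing stream table are available; the option key shown is illustrative only. The writer buffers options, acquires the streaming lock on first write, and appends every micro-batch through the stream sink.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.streaming.CarbonStreamSparkStreamingWriter

object SparkStreamingIngestSketch {
  case class Record(id: Int, name: String)

  def ingest(spark: SparkSession,
      carbonTable: CarbonTable,
      hadoopConf: Configuration,
      stream: DStream[Record]): Unit = {
    val writer = new CarbonStreamSparkStreamingWriter(spark, carbonTable, hadoopConf)
      .mode("append")
      .option("BAD_RECORDS_ACTION", "force") // illustrative option only
    stream.foreachRDD { (rdd, time) =>
      import spark.implicits._
      // each micro-batch is appended to the current streaming segment
      writer.writeStreamData(rdd.toDF(), time)
    }
  }
}
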
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamingQueryListener.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamingQueryListener.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamingQueryListener.scala
new file mode 100644
index 0000000..6d83fad
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/CarbonStreamingQueryListener.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import java.util
+import java.util.UUID
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
+import org.apache.spark.sql.streaming.StreamingQueryListener
+
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.locks.{CarbonLockFactory, ICarbonLock, LockUsage}
+
+class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {
+
+ private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+
+ private val cache = new util.HashMap[UUID, ICarbonLock]()
+
+ override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
+ val streamQuery = spark.streams.get(event.id)
+ val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
+ // adapt spark 2.1
+ streamQuery.asInstanceOf[StreamExecution]
+ } else {
+ // adapt spark 2.2 and later version
+ val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
+ val method = clazz.getMethod("streamingQuery")
+ method.invoke(streamQuery).asInstanceOf[StreamExecution]
+ }
+ if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
+ LOGGER.info("Carbon streaming query started: " + event.id)
+ val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
+ val carbonTable = sink.carbonTable
+ val lock = CarbonLockFactory.getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier,
+ LockUsage.STREAMING_LOCK)
+ if (lock.lockWithRetries()) {
+ LOGGER.info("Acquired the lock for stream table: " + carbonTable.getDatabaseName + "." +
+ carbonTable.getTableName)
+ cache.put(event.id, lock)
+ } else {
+ LOGGER.error("Not able to acquire the lock for stream table:" +
+ carbonTable.getDatabaseName + "." + carbonTable.getTableName)
+ throw new InterruptedException(
+ "Not able to acquire the lock for stream table: " + carbonTable.getDatabaseName + "." +
+ carbonTable.getTableName)
+ }
+ }
+ }
+
+ override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
+ }
+
+ override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
+ val lock = cache.remove(event.id)
+ if (null != lock) {
+ LOGGER.info("Carbon streaming query: " + event.id)
+ lock.unlock()
+ }
+ }
+}
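
A minimal sketch of wiring the listener above into a Structured Streaming application (the registration point is assumed, it is not part of this patch):

import org.apache.spark.sql.SparkSession
import org.apache.carbondata.streaming.CarbonStreamingQueryListener

object QueryListenerSketch {
  def register(spark: SparkSession): Unit = {
    // the listener takes the STREAMING_LOCK when a query against a Carbon stream
    // sink starts and releases it when the query terminates
    spark.streams.addListener(new CarbonStreamingQueryListener(spark))
  }
}
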
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
new file mode 100644
index 0000000..bc7b042
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/streaming/StreamSinkFactory.scala
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, Sink}
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.core.datastore.impl.FileFactory
+import org.apache.carbondata.core.dictionary.server.{DictionaryServer, NonSecureDictionaryServer}
+import org.apache.carbondata.core.dictionary.service.NonSecureDictionaryServiceProvider
+import org.apache.carbondata.core.metadata.encoder.Encoding
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+import org.apache.carbondata.core.util.CarbonProperties
+import org.apache.carbondata.core.util.path.CarbonTablePath
+import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
+import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostExecutionEvent, LoadTablePreExecutionEvent}
+import org.apache.carbondata.processing.loading.model.{CarbonLoadModel, CarbonLoadModelBuilder, LoadOption}
+import org.apache.carbondata.spark.dictionary.provider.SecureDictionaryServiceProvider
+import org.apache.carbondata.spark.dictionary.server.SecureDictionaryServer
+import org.apache.carbondata.streaming.segment.StreamSegment
+
+/**
+ * Stream sink factory
+ */
+object StreamSinkFactory {
+
+ def createStreamTableSink(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ carbonTable: CarbonTable,
+ parameters: Map[String, String]): Sink = {
+ validateParameters(parameters)
+
+ // build load model
+ val carbonLoadModel = buildCarbonLoadModelForStream(
+ sparkSession,
+ hadoopConf,
+ carbonTable,
+ parameters,
+ "")
+ // fire pre event before streaming is started
+ // in case of streaming options and optionsFinal can be same
+ val operationContext = new OperationContext
+ val loadTablePreExecutionEvent = new LoadTablePreExecutionEvent(
+ carbonTable.getCarbonTableIdentifier,
+ carbonLoadModel,
+ carbonLoadModel.getFactFilePath,
+ false,
+ parameters.asJava,
+ parameters.asJava,
+ false
+ )
+ OperationListenerBus.getInstance().fireEvent(loadTablePreExecutionEvent, operationContext)
+ // prepare the stream segment
+ val segmentId = getStreamSegmentId(carbonTable)
+ carbonLoadModel.setSegmentId(segmentId)
+
+ // start server if necessary
+ val server = startDictionaryServer(
+ sparkSession,
+ carbonTable,
+ carbonLoadModel)
+ if (server.isDefined) {
+ carbonLoadModel.setUseOnePass(true)
+ } else {
+ carbonLoadModel.setUseOnePass(false)
+ }
+ // default is carbon appended stream sink
+ val carbonAppendableStreamSink = new CarbonAppendableStreamSink(
+ sparkSession,
+ carbonTable,
+ segmentId,
+ parameters,
+ carbonLoadModel,
+ server)
+
+ // fire post event before streaming is started
+ val loadTablePostExecutionEvent = new LoadTablePostExecutionEvent(
+ carbonTable.getCarbonTableIdentifier,
+ carbonLoadModel
+ )
+ OperationListenerBus.getInstance().fireEvent(loadTablePostExecutionEvent, operationContext)
+ carbonAppendableStreamSink
+ }
+
+ private def validateParameters(parameters: Map[String, String]): Unit = {
+ val segmentSize = parameters.get(CarbonCommonConstants.HANDOFF_SIZE)
+ if (segmentSize.isDefined) {
+ try {
+ val value = java.lang.Long.parseLong(segmentSize.get)
+ if (value < CarbonCommonConstants.HANDOFF_SIZE_MIN) {
+ new CarbonStreamException(CarbonCommonConstants.HANDOFF_SIZE +
+ "should be bigger than or equal " +
+ CarbonCommonConstants.HANDOFF_SIZE_MIN)
+ }
+ } catch {
+ case _: NumberFormatException =>
+ new CarbonStreamException(CarbonCommonConstants.HANDOFF_SIZE +
+ s" $segmentSize is an illegal number")
+ }
+ }
+ }
+
+ /**
+ * get current stream segment id
+ * @return
+ */
+ private def getStreamSegmentId(carbonTable: CarbonTable): String = {
+ val segmentId = StreamSegment.open(carbonTable)
+ val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId)
+ val fileType = FileFactory.getFileType(segmentDir)
+ if (!FileFactory.isFileExist(segmentDir, fileType)) {
+ // Create the segment directory; when the hive metastore is enabled, the first load may
+ // not have created the table folder yet.
+ FileFactory.mkdirs(segmentDir, fileType)
+ }
+ if (FileFactory.isFileExist(segmentDir, fileType)) {
+ // recover fault
+ StreamSegment.recoverSegmentIfRequired(segmentDir)
+ } else {
+ FileFactory.mkdirs(segmentDir, fileType)
+ }
+ segmentId
+ }
+
+ def startDictionaryServer(
+ sparkSession: SparkSession,
+ carbonTable: CarbonTable,
+ carbonLoadModel: CarbonLoadModel): Option[DictionaryServer] = {
+ // start dictionary server when one-pass load is used and a dimension with DICTIONARY
+ // encoding is present.
+ val allDimensions = carbonTable.getAllDimensions.asScala.toList
+ val createDictionary = allDimensions.exists {
+ carbonDimension => carbonDimension.hasEncoding(Encoding.DICTIONARY) &&
+ !carbonDimension.hasEncoding(Encoding.DIRECT_DICTIONARY)
+ }
+ val carbonSecureModeDictServer = CarbonProperties.getInstance.
+ getProperty(CarbonCommonConstants.CARBON_SECURE_DICTIONARY_SERVER,
+ CarbonCommonConstants.CARBON_SECURE_DICTIONARY_SERVER_DEFAULT)
+
+ val sparkConf = sparkSession.sqlContext.sparkContext.getConf
+ val sparkDriverHost = sparkSession.sqlContext.sparkContext.
+ getConf.get("spark.driver.host")
+
+ val server: Option[DictionaryServer] = if (createDictionary) {
+ if (sparkConf.get("spark.authenticate", "false").equalsIgnoreCase("true") &&
+ carbonSecureModeDictServer.toBoolean) {
+ val dictionaryServer = SecureDictionaryServer.getInstance(sparkConf,
+ sparkDriverHost.toString, carbonLoadModel.getDictionaryServerPort, carbonTable)
+ carbonLoadModel.setDictionaryServerPort(dictionaryServer.getPort)
+ carbonLoadModel.setDictionaryServerHost(dictionaryServer.getHost)
+ carbonLoadModel.setDictionaryServerSecretKey(dictionaryServer.getSecretKey)
+ carbonLoadModel.setDictionaryEncryptServerSecure(dictionaryServer.isEncryptSecureServer)
+ carbonLoadModel.setDictionaryServiceProvider(new SecureDictionaryServiceProvider())
+ sparkSession.sparkContext.addSparkListener(new SparkListener() {
+ override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd) {
+ dictionaryServer.shutdown()
+ }
+ })
+ Some(dictionaryServer)
+ } else {
+ val dictionaryServer = NonSecureDictionaryServer
+ .getInstance(carbonLoadModel.getDictionaryServerPort, carbonTable)
+ carbonLoadModel.setDictionaryServerPort(dictionaryServer.getPort)
+ carbonLoadModel.setDictionaryServerHost(dictionaryServer.getHost)
+ carbonLoadModel.setDictionaryEncryptServerSecure(false)
+ carbonLoadModel
+ .setDictionaryServiceProvider(new NonSecureDictionaryServiceProvider(dictionaryServer
+ .getPort))
+ sparkSession.sparkContext.addSparkListener(new SparkListener() {
+ override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd) {
+ dictionaryServer.shutdown()
+ }
+ })
+ Some(dictionaryServer)
+ }
+ } else {
+ None
+ }
+ server
+ }
+
+ private def buildCarbonLoadModelForStream(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ carbonTable: CarbonTable,
+ parameters: Map[String, String],
+ segmentId: String): CarbonLoadModel = {
+ val carbonProperty: CarbonProperties = CarbonProperties.getInstance()
+ carbonProperty.addProperty("zookeeper.enable.lock", "false")
+ val optionsFinal = LoadOption.fillOptionWithDefaultValue(parameters.asJava)
+ optionsFinal.put("sort_scope", "no_sort")
+ if (parameters.get("fileheader").isEmpty) {
+ optionsFinal.put("fileheader", carbonTable.getCreateOrderColumn(carbonTable.getTableName)
+ .asScala.map(_.getColName).mkString(","))
+ }
+ val carbonLoadModel = new CarbonLoadModel()
+ new CarbonLoadModelBuilder(carbonTable).build(
+ parameters.asJava,
+ optionsFinal,
+ carbonLoadModel,
+ hadoopConf)
+ carbonLoadModel.setSegmentId(segmentId)
+ // stream should use one pass
+ val dictionaryServerPort = parameters.getOrElse(
+ CarbonCommonConstants.DICTIONARY_SERVER_PORT,
+ carbonProperty.getProperty(
+ CarbonCommonConstants.DICTIONARY_SERVER_PORT,
+ CarbonCommonConstants.DICTIONARY_SERVER_PORT_DEFAULT))
+ val sparkDriverHost = sparkSession.sqlContext.sparkContext.
+ getConf.get("spark.driver.host")
+ carbonLoadModel.setDictionaryServerHost(sparkDriverHost)
+ carbonLoadModel.setDictionaryServerPort(dictionaryServerPort.toInt)
+ carbonLoadModel
+ }
+}
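
Note that validateParameters above only constructs the CarbonStreamException instances and never throws them, so an out-of-range or malformed handoff size passes silently. A hedged sketch of what the check presumably intends:

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.streaming.CarbonStreamException

object HandoffSizeValidationSketch {
  def validateHandoffSize(parameters: Map[String, String]): Unit = {
    parameters.get(CarbonCommonConstants.HANDOFF_SIZE).foreach { raw =>
      val value =
        try {
          java.lang.Long.parseLong(raw)
        } catch {
          case _: NumberFormatException =>
            throw new CarbonStreamException(
              s"${CarbonCommonConstants.HANDOFF_SIZE} $raw is an illegal number")
        }
      if (value < CarbonCommonConstants.HANDOFF_SIZE_MIN) {
        throw new CarbonStreamException(
          s"${CarbonCommonConstants.HANDOFF_SIZE} should be bigger than or equal to " +
          CarbonCommonConstants.HANDOFF_SIZE_MIN)
      }
    }
  }
}
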
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala b/integration/spark-common/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
new file mode 100644
index 0000000..402bc4b
--- /dev/null
+++ b/integration/spark-common/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
@@ -0,0 +1,362 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import java.util.Date
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.spark.TaskContext
+import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.{SerializableConfiguration, Utils}
+
+import org.apache.carbondata.common.CarbonIterator
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.core.datastore.impl.FileFactory
+import org.apache.carbondata.core.dictionary.server.DictionaryServer
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+import org.apache.carbondata.core.stats.QueryStatistic
+import org.apache.carbondata.core.util.CarbonProperties
+import org.apache.carbondata.core.util.path.CarbonTablePath
+import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
+import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
+import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants
+import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostExecutionEvent, LoadTablePreExecutionEvent}
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel
+import org.apache.carbondata.spark.rdd.StreamHandoffRDD
+import org.apache.carbondata.streaming.{CarbonStreamException, CarbonStreamOutputFormat}
+import org.apache.carbondata.streaming.parser.CarbonStreamParser
+import org.apache.carbondata.streaming.segment.StreamSegment
+
+/**
+ * An implementation of the stream sink: it persists each batch to disk by appending the
+ * batch data to data files.
+ */
+class CarbonAppendableStreamSink(
+ sparkSession: SparkSession,
+ val carbonTable: CarbonTable,
+ var currentSegmentId: String,
+ parameters: Map[String, String],
+ carbonLoadModel: CarbonLoadModel,
+ server: Option[DictionaryServer]) extends Sink {
+
+ private val fileLogPath = CarbonTablePath.getStreamingLogDir(carbonTable.getTablePath)
+ private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, fileLogPath)
+ // prepare configuration
+ private val hadoopConf = {
+ val conf = sparkSession.sessionState.newHadoopConf()
+ // put all parameters into hadoopConf
+ parameters.foreach { entry =>
+ conf.set(entry._1, entry._2)
+ }
+ // properties below will be used for default CarbonStreamParser
+ conf.set("carbon_complex_delimiter_level_1",
+ carbonLoadModel.getComplexDelimiterLevel1)
+ conf.set("carbon_complex_delimiter_level_2",
+ carbonLoadModel.getComplexDelimiterLevel2)
+ conf.set(
+ DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT,
+ carbonLoadModel.getSerializationNullFormat().split(",")(1))
+ conf.set(
+ CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
+ carbonLoadModel.getTimestampformat())
+ conf.set(
+ CarbonCommonConstants.CARBON_DATE_FORMAT,
+ carbonLoadModel.getDateFormat())
+ conf
+ }
+ // segment max size (bytes)
+ private val segmentMaxSize = hadoopConf.getLong(
+ CarbonCommonConstants.HANDOFF_SIZE,
+ CarbonProperties.getInstance().getHandoffSize
+ )
+
+ // auto handoff
+ private val enableAutoHandoff = hadoopConf.getBoolean(
+ CarbonCommonConstants.ENABLE_AUTO_HANDOFF,
+ CarbonProperties.getInstance().isEnableAutoHandoff
+ )
+
+ override def addBatch(batchId: Long, data: DataFrame): Unit = {
+ if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
+ CarbonAppendableStreamSink.LOGGER.info(s"Skipping already committed batch $batchId")
+ } else {
+
+ val statistic = new QueryStatistic()
+
+ // fire pre event on every batch add
+ // in case of streaming, options and optionsFinal can be the same
+ val operationContext = new OperationContext
+ val loadTablePreExecutionEvent = new LoadTablePreExecutionEvent(
+ carbonTable.getCarbonTableIdentifier,
+ carbonLoadModel,
+ carbonLoadModel.getFactFilePath,
+ false,
+ parameters.asJava,
+ parameters.asJava,
+ false
+ )
+ OperationListenerBus.getInstance().fireEvent(loadTablePreExecutionEvent, operationContext)
+ checkOrHandOffSegment()
+
+ // committer will record how this spark job commit its output
+ val committer = FileCommitProtocol.instantiate(
+ className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
+ jobId = batchId.toString,
+ outputPath = fileLogPath,
+ isAppend = false)
+
+ committer match {
+ case manifestCommitter: ManifestFileCommitProtocol =>
+ manifestCommitter.setupManifestOptions(fileLog, batchId)
+ case _ => // Do nothing
+ }
+
+ CarbonAppendableStreamSink.writeDataFileJob(
+ sparkSession,
+ carbonTable,
+ parameters,
+ batchId,
+ currentSegmentId,
+ data.queryExecution,
+ committer,
+ hadoopConf,
+ carbonLoadModel,
+ server)
+ // fire post event on every batch add
+ val loadTablePostExecutionEvent = new LoadTablePostExecutionEvent(
+ carbonTable.getCarbonTableIdentifier,
+ carbonLoadModel
+ )
+ OperationListenerBus.getInstance().fireEvent(loadTablePostExecutionEvent, operationContext)
+
+ statistic.addStatistics(s"add batch: $batchId", System.currentTimeMillis())
+ CarbonAppendableStreamSink.LOGGER.info(
+ s"${statistic.getMessage}, taken time(ms): ${statistic.getTimeTaken}")
+ }
+ }
+
+ /**
+ * If the directory size of the current segment exceeds the threshold, close it and hand off to a new segment
+ */
+ private def checkOrHandOffSegment(): Unit = {
+ val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, currentSegmentId)
+ val fileType = FileFactory.getFileType(segmentDir)
+ if (segmentMaxSize <= StreamSegment.size(segmentDir)) {
+ val newSegmentId = StreamSegment.close(carbonTable, currentSegmentId)
+ currentSegmentId = newSegmentId
+ val newSegmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, currentSegmentId)
+ FileFactory.mkdirs(newSegmentDir, fileType)
+
+ // TODO trigger hand off operation
+ if (enableAutoHandoff) {
+ StreamHandoffRDD.startStreamingHandoffThread(
+ carbonLoadModel,
+ sparkSession,
+ false)
+ }
+ }
+ }
+}
+
+object CarbonAppendableStreamSink {
+
+ private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+
+ /**
+ * Packages the Hadoop configuration so that it can be passed from the driver side to the executor side
+ */
+ case class WriteDataFileJobDescription(
+ serializableHadoopConf: SerializableConfiguration,
+ batchId: Long,
+ segmentId: String)
+
+ /**
+ * Run a Spark job to append the newly arrived data directly to the existing row format
+ * data files.
+ * If a task fails, Spark retries the task and Carbon recovers by truncating the
+ * HDFS file (see StreamSegment.tryRecoverFromTaskFault).
+ * If the job fails, every file in the stream segment is truncated if
+ * necessary (see StreamSegment.tryRecoverFromJobFault).
+ */
+ def writeDataFileJob(
+ sparkSession: SparkSession,
+ carbonTable: CarbonTable,
+ parameters: Map[String, String],
+ batchId: Long,
+ segmentId: String,
+ queryExecution: QueryExecution,
+ committer: FileCommitProtocol,
+ hadoopConf: Configuration,
+ carbonLoadModel: CarbonLoadModel,
+ server: Option[DictionaryServer]): Unit = {
+
+ // create job
+ val job = Job.getInstance(hadoopConf)
+ job.setOutputKeyClass(classOf[Void])
+ job.setOutputValueClass(classOf[InternalRow])
+ val jobId = CarbonInputFormatUtil.getJobId(new Date, batchId.toInt)
+ job.setJobID(jobId)
+
+ val description = WriteDataFileJobDescription(
+ serializableHadoopConf = new SerializableConfiguration(job.getConfiguration),
+ batchId,
+ segmentId
+ )
+
+ // run write data file job
+ SQLExecution.withNewExecutionId(sparkSession, queryExecution) {
+ var result: Array[TaskCommitMessage] = null
+ try {
+ committer.setupJob(job)
+ // initialize dictionary server
+ if (server.isDefined) {
+ server.get.initializeDictionaryGenerator(carbonTable)
+ }
+
+ val rowSchema = queryExecution.analyzed.schema
+ // write data file
+ result = sparkSession.sparkContext.runJob(queryExecution.toRdd,
+ (taskContext: TaskContext, iterator: Iterator[InternalRow]) => {
+ writeDataFileTask(
+ description,
+ carbonLoadModel,
+ sparkStageId = taskContext.stageId(),
+ sparkPartitionId = taskContext.partitionId(),
+ sparkAttemptNumber = taskContext.attemptNumber(),
+ committer,
+ iterator,
+ rowSchema
+ )
+ })
+
+ // write dictionary
+ if (server.isDefined) {
+ try {
+ server.get.writeTableDictionary(carbonTable.getCarbonTableIdentifier.getTableId)
+ } catch {
+ case _: Exception =>
+ LOGGER.error(
+ s"Error while writing dictionary file for ${carbonTable.getTableUniqueName}")
+ throw new Exception(
+ "Streaming ingest failed due to error while writing dictionary file")
+ }
+ }
+
+ // update data file info in index file
+ StreamSegment.updateIndexFile(
+ CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId))
+
+ } catch {
+ // catch fault of executor side
+ case t: Throwable =>
+ val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentId)
+ StreamSegment.recoverSegmentIfRequired(segmentDir)
+ LOGGER.error(t, s"Aborting job ${ job.getJobID }.")
+ committer.abortJob(job)
+ throw new CarbonStreamException("Job failed to write data file", t)
+ }
+ committer.commitJob(job, result)
+ LOGGER.info(s"Job ${ job.getJobID } committed.")
+ }
+ }
+
+ /**
+ * execute a task for each partition to write a data file
+ */
+ def writeDataFileTask(
+ description: WriteDataFileJobDescription,
+ carbonLoadModel: CarbonLoadModel,
+ sparkStageId: Int,
+ sparkPartitionId: Int,
+ sparkAttemptNumber: Int,
+ committer: FileCommitProtocol,
+ iterator: Iterator[InternalRow],
+ rowSchema: StructType
+ ): TaskCommitMessage = {
+
+ val jobId = CarbonInputFormatUtil.getJobId(new Date, sparkStageId)
+ val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId)
+ val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber)
+
+ // Set up the attempt context required to use in the output committer.
+ val taskAttemptContext: TaskAttemptContext = {
+ // Set up the configuration object
+ val hadoopConf = description.serializableHadoopConf.value
+ CarbonStreamOutputFormat.setSegmentId(hadoopConf, description.segmentId)
+ hadoopConf.set("mapred.job.id", jobId.toString)
+ hadoopConf.set("mapred.tip.id", taskAttemptId.getTaskID.toString)
+ hadoopConf.set("mapred.task.id", taskAttemptId.toString)
+ hadoopConf.setBoolean("mapred.task.is.map", true)
+ hadoopConf.setInt("mapred.task.partition", 0)
+ new TaskAttemptContextImpl(hadoopConf, taskAttemptId)
+ }
+
+ committer.setupTask(taskAttemptContext)
+
+ try {
+ Utils.tryWithSafeFinallyAndFailureCallbacks(block = {
+
+ val parserName = taskAttemptContext.getConfiguration.get(
+ CarbonStreamParser.CARBON_STREAM_PARSER,
+ CarbonStreamParser.CARBON_STREAM_PARSER_DEFAULT)
+
+ val streamParser =
+ Class.forName(parserName).newInstance.asInstanceOf[CarbonStreamParser]
+ streamParser.initialize(taskAttemptContext.getConfiguration, rowSchema)
+
+ StreamSegment.appendBatchData(new InputIterator(iterator, streamParser),
+ taskAttemptContext, carbonLoadModel)
+ })(catchBlock = {
+ committer.abortTask(taskAttemptContext)
+ LOGGER.error(s"Job $jobId aborted.")
+ })
+ committer.commitTask(taskAttemptContext)
+ } catch {
+ case t: Throwable =>
+ throw new CarbonStreamException("Task failed while writing rows", t)
+ }
+ }
+
+ /**
+ * Convert a Spark iterator to a Carbon iterator, so that the Java modules can use it.
+ */
+ class InputIterator(rddIter: Iterator[InternalRow], streamParser: CarbonStreamParser)
+ extends CarbonIterator[Array[Object]] {
+
+ override def hasNext: Boolean = rddIter.hasNext
+
+ override def next: Array[Object] = {
+ streamParser.parserRow(rddIter.next())
+ }
+
+ override def close(): Unit = {
+ streamParser.close()
+ }
+ }
+
+}
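Note: the sink above is driven by Structured Streaming's per-batch callback (addBatch). A minimal, hedged driver-side sketch of how such a query might be started is given below; the socket source, checkpoint path and the option keys ("dbName", "tableName") are illustrative assumptions following the usual CarbonData streaming examples, not part of this patch.

  import org.apache.spark.sql.SparkSession

  object StreamIngestSketch {
    def main(args: Array[String]): Unit = {
      // a plain session for illustration; a real setup would typically go through CarbonSession
      val spark = SparkSession.builder().appName("carbon-stream-ingest-sketch").getOrCreate()

      // any streaming source works; a socket source is used here purely for illustration
      val lines = spark.readStream
        .format("socket")
        .option("host", "localhost")
        .option("port", "9099")
        .load()

      // "carbondata" is assumed to resolve to the stream sink above; options and paths are illustrative
      val query = lines.writeStream
        .format("carbondata")
        .option("checkpointLocation", "/tmp/carbon_ckpt")
        .option("dbName", "default")
        .option("tableName", "stream_table")
        .start()

      query.awaitTermination()
    }
  }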
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark-common/src/main/scala/org/apache/spark/sql/test/TestQueryExecutor.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common/src/main/scala/org/apache/spark/sql/test/TestQueryExecutor.scala b/integration/spark-common/src/main/scala/org/apache/spark/sql/test/TestQueryExecutor.scala
index f582145..34f901f 100644
--- a/integration/spark-common/src/main/scala/org/apache/spark/sql/test/TestQueryExecutor.scala
+++ b/integration/spark-common/src/main/scala/org/apache/spark/sql/test/TestQueryExecutor.scala
@@ -124,7 +124,9 @@ object TestQueryExecutor {
TestQueryExecutor.projectPath + "/processing/target",
TestQueryExecutor.projectPath + "/integration/spark-common/target",
TestQueryExecutor.projectPath + "/integration/spark2/target",
- TestQueryExecutor.projectPath + "/integration/spark-common/target/jars")
+ TestQueryExecutor.projectPath + "/integration/spark-common/target/jars",
+ TestQueryExecutor.projectPath + "/streaming/target")
+
lazy val jars = {
val jarsLocal = new ArrayBuffer[String]()
modules.foreach { path =>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark2/pom.xml
----------------------------------------------------------------------
diff --git a/integration/spark2/pom.xml b/integration/spark2/pom.xml
index 13b3d8d..e4593be 100644
--- a/integration/spark2/pom.xml
+++ b/integration/spark2/pom.xml
@@ -36,7 +36,7 @@
<dependencies>
<dependency>
<groupId>org.apache.carbondata</groupId>
- <artifactId>carbondata-streaming</artifactId>
+ <artifactId>carbondata-spark-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSession.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSession.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSession.scala
index f6bdff6..1038fcf 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSession.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSession.scala
@@ -27,7 +27,6 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
import org.apache.spark.sql.SparkSession.Builder
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.{Command, LocalRelation, Union}
-import org.apache.spark.sql.execution.streaming.CarbonStreamingQueryListener
import org.apache.spark.sql.hive.execution.command.CarbonSetCommand
import org.apache.spark.sql.internal.{SessionState, SharedState}
import org.apache.spark.sql.profiler.{Profiler, SQLStart}
@@ -36,6 +35,7 @@ import org.apache.spark.util.{CarbonReflectionUtils, Utils}
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.{CarbonProperties, CarbonSessionInfo, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
+import org.apache.carbondata.streaming.CarbonStreamingQueryListener
/**
* Session implementation for {org.apache.spark.sql.SparkSession}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
index 5183b02..e60a583 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
@@ -46,8 +46,7 @@ import org.apache.carbondata.events._
import org.apache.carbondata.processing.loading.events.LoadEvents.LoadMetadataEvent
import org.apache.carbondata.processing.loading.model.{CarbonDataLoadSchema, CarbonLoadModel}
import org.apache.carbondata.processing.merger.{CarbonDataMergerUtil, CompactionType}
-import org.apache.carbondata.spark.rdd.CarbonDataRDDFactory
-import org.apache.carbondata.streaming.StreamHandoffRDD
+import org.apache.carbondata.spark.rdd.{CarbonDataRDDFactory, StreamHandoffRDD}
import org.apache.carbondata.streaming.segment.StreamSegment
/**
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/pom.xml
----------------------------------------------------------------------
diff --git a/streaming/pom.xml b/streaming/pom.xml
index 01affec..b8c447d 100644
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@@ -22,11 +22,16 @@
<dependencies>
<dependency>
<groupId>org.apache.carbondata</groupId>
- <artifactId>carbondata-spark-common</artifactId>
+ <artifactId>carbondata-hadoop</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
+ <artifactId>spark-sql_${scala.binary.version}</artifactId>
+ <version>${spark.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>${spark.deps.scope}</scope>
@@ -44,7 +49,7 @@
</dependencies>
<build>
- <testSourceDirectory>src/test/scala</testSourceDirectory>
+ <testSourceDirectory>src/test/java</testSourceDirectory>
<resources>
<resource>
<directory>src/resources</directory>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamInputFormat.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamInputFormat.java b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamInputFormat.java
new file mode 100644
index 0000000..66d89c8
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamInputFormat.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.IOException;
+
+import org.apache.carbondata.core.cache.Cache;
+import org.apache.carbondata.core.cache.dictionary.Dictionary;
+import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.encoder.Encoding;
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
+import org.apache.carbondata.core.scan.complextypes.ArrayQueryType;
+import org.apache.carbondata.core.scan.complextypes.PrimitiveQueryType;
+import org.apache.carbondata.core.scan.complextypes.StructQueryType;
+import org.apache.carbondata.core.scan.filter.GenericQueryType;
+import org.apache.carbondata.core.util.CarbonUtil;
+
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+/**
+ * Stream input format
+ */
+public class CarbonStreamInputFormat extends FileInputFormat<Void, Object> {
+
+ public static final String READ_BUFFER_SIZE = "carbon.stream.read.buffer.size";
+ public static final String READ_BUFFER_SIZE_DEFAULT = "65536";
+
+ @Override public RecordReader<Void, Object> createRecordReader(InputSplit split,
+ TaskAttemptContext context) throws IOException, InterruptedException {
+ return new CarbonStreamRecordReader();
+ }
+
+ public static GenericQueryType[] getComplexDimensions(CarbonTable carbontable,
+ CarbonColumn[] carbonColumns, Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache)
+ throws IOException {
+ GenericQueryType[] queryTypes = new GenericQueryType[carbonColumns.length];
+ for (int i = 0; i < carbonColumns.length; i++) {
+ if (carbonColumns[i].isComplex()) {
+ if (DataTypes.isArrayType(carbonColumns[i].getDataType())) {
+ queryTypes[i] = new ArrayQueryType(carbonColumns[i].getColName(),
+ carbonColumns[i].getColName(), i);
+ } else if (DataTypes.isStructType(carbonColumns[i].getDataType())) {
+ queryTypes[i] = new StructQueryType(carbonColumns[i].getColName(),
+ carbonColumns[i].getColName(), i);
+ } else {
+ throw new UnsupportedOperationException(
+ carbonColumns[i].getDataType().getName() + " is not supported");
+ }
+
+ fillChildren(carbontable, queryTypes[i], (CarbonDimension) carbonColumns[i], i, cache);
+ }
+ }
+
+ return queryTypes;
+ }
+
+ private static void fillChildren(CarbonTable carbontable, GenericQueryType parentQueryType,
+ CarbonDimension dimension, int parentBlockIndex,
+ Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache) throws IOException {
+ for (int i = 0; i < dimension.getNumberOfChild(); i++) {
+ CarbonDimension child = dimension.getListOfChildDimensions().get(i);
+ DataType dataType = child.getDataType();
+ GenericQueryType queryType = null;
+ if (DataTypes.isArrayType(dataType)) {
+ queryType =
+ new ArrayQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex);
+
+ } else if (DataTypes.isStructType(dataType)) {
+ queryType =
+ new StructQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex);
+ parentQueryType.addChildren(queryType);
+ } else {
+ boolean isDirectDictionary =
+ CarbonUtil.hasEncoding(child.getEncoder(), Encoding.DIRECT_DICTIONARY);
+ String dictionaryPath = carbontable.getTableInfo().getFactTable().getTableProperties()
+ .get(CarbonCommonConstants.DICTIONARY_PATH);
+ DictionaryColumnUniqueIdentifier dictionaryIdentifier =
+ new DictionaryColumnUniqueIdentifier(carbontable.getAbsoluteTableIdentifier(),
+ child.getColumnIdentifier(), child.getDataType(), dictionaryPath);
+
+ queryType =
+ new PrimitiveQueryType(child.getColName(), dimension.getColName(), ++parentBlockIndex,
+ child.getDataType(), 4, cache.get(dictionaryIdentifier),
+ isDirectDictionary);
+ }
+ parentQueryType.addChildren(queryType);
+ if (child.getNumberOfChild() > 0) {
+ fillChildren(carbontable, queryType, child, parentBlockIndex, cache);
+ }
+ }
+ }
+}
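For context, CarbonStreamInputFormat only constructs a CarbonStreamRecordReader; the caller initializes it with the split and task context and then drives the standard Hadoop RecordReader loop. A hedged sketch of that executor-side loop follows (split and context construction is omitted because it depends on the enclosing RDD code):

  import org.apache.hadoop.mapreduce.{InputSplit, TaskAttemptContext}

  import org.apache.carbondata.streaming.CarbonStreamInputFormat

  object StreamReadSketch {
    // split and context are assumed to come from the enclosing Hadoop/Spark task
    def readStreamSplit(split: InputSplit, context: TaskAttemptContext): Long = {
      val format = new CarbonStreamInputFormat()
      val reader = format.createRecordReader(split, context)
      reader.initialize(split, context)
      var rowCount = 0L
      try {
        while (reader.nextKeyValue()) {
          // an InternalRow in row mode, or a ColumnarBatch when the vector reader is enabled
          val value = reader.getCurrentValue
          rowCount += 1
        }
      } finally {
        reader.close()
      }
      rowCount
    }
  }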
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamOutputFormat.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamOutputFormat.java b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamOutputFormat.java
new file mode 100644
index 0000000..f9f0d76
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamOutputFormat.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.hadoop.util.ObjectSerializationUtil;
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/**
+ * Stream output format
+ */
+public class CarbonStreamOutputFormat extends FileOutputFormat<Void, Object> {
+
+ static final byte[] CARBON_SYNC_MARKER =
+ "@carbondata_sync".getBytes(Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
+
+ public static final String CARBON_ENCODER_ROW_BUFFER_SIZE = "carbon.stream.row.buffer.size";
+
+ public static final int CARBON_ENCODER_ROW_BUFFER_SIZE_DEFAULT = 1024;
+
+ public static final String CARBON_STREAM_BLOCKLET_ROW_NUMS = "carbon.stream.blocklet.row.nums";
+
+ public static final int CARBON_STREAM_BLOCKLET_ROW_NUMS_DEFAULT = 32000;
+
+ public static final String CARBON_STREAM_CACHE_SIZE = "carbon.stream.cache.size";
+
+ public static final int CARBON_STREAM_CACHE_SIZE_DEFAULT = 32 * 1024 * 1024;
+
+ private static final String LOAD_MODEL = "mapreduce.output.carbon.load.model";
+
+ private static final String SEGMENT_ID = "carbon.segment.id";
+
+ @Override public RecordWriter<Void, Object> getRecordWriter(TaskAttemptContext job)
+ throws IOException, InterruptedException {
+ return new CarbonStreamRecordWriter(job);
+ }
+
+ public static void setCarbonLoadModel(Configuration hadoopConf, CarbonLoadModel carbonLoadModel)
+ throws IOException {
+ if (carbonLoadModel != null) {
+ hadoopConf.set(LOAD_MODEL, ObjectSerializationUtil.convertObjectToString(carbonLoadModel));
+ }
+ }
+
+ public static CarbonLoadModel getCarbonLoadModel(Configuration hadoopConf) throws IOException {
+ String value = hadoopConf.get(LOAD_MODEL);
+ if (value == null) {
+ return null;
+ } else {
+ return (CarbonLoadModel) ObjectSerializationUtil.convertStringToObject(value);
+ }
+ }
+
+ public static void setSegmentId(Configuration hadoopConf, String segmentId) throws IOException {
+ if (segmentId != null) {
+ hadoopConf.set(SEGMENT_ID, segmentId);
+ }
+ }
+
+ public static String getSegmentId(Configuration hadoopConf) throws IOException {
+ return hadoopConf.get(SEGMENT_ID);
+ }
+
+}
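The two setters above are the driver-to-executor handshake: the load model and the target segment travel through the Hadoop configuration and are read back by the record writer. A minimal sketch, assuming a CarbonLoadModel has already been built elsewhere (e.g. by StreamSinkFactory):

  import org.apache.hadoop.conf.Configuration

  import org.apache.carbondata.processing.loading.model.CarbonLoadModel
  import org.apache.carbondata.streaming.CarbonStreamOutputFormat

  object StreamOutputConfigSketch {
    // loadModel and segmentId are assumed to be prepared by the caller
    def configure(conf: Configuration, loadModel: CarbonLoadModel, segmentId: String): Unit = {
      CarbonStreamOutputFormat.setCarbonLoadModel(conf, loadModel) // serialized into the configuration
      CarbonStreamOutputFormat.setSegmentId(conf, segmentId)       // e.g. the current streaming segment id

      // the executor side reads the same values back before creating the record writer
      val restoredModel = CarbonStreamOutputFormat.getCarbonLoadModel(conf)
      val restoredSegment = CarbonStreamOutputFormat.getSegmentId(conf)
      require(restoredModel != null && restoredSegment == segmentId)
    }
  }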
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordReader.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordReader.java b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordReader.java
new file mode 100644
index 0000000..69d2a3b
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordReader.java
@@ -0,0 +1,761 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.carbondata.core.cache.Cache;
+import org.apache.carbondata.core.cache.CacheProvider;
+import org.apache.carbondata.core.cache.CacheType;
+import org.apache.carbondata.core.cache.dictionary.Dictionary;
+import org.apache.carbondata.core.cache.dictionary.DictionaryColumnUniqueIdentifier;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.datastore.block.SegmentProperties;
+import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryGenerator;
+import org.apache.carbondata.core.keygenerator.directdictionary.DirectDictionaryKeyGeneratorFactory;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.encoder.Encoding;
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
+import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
+import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.reader.CarbonHeaderReader;
+import org.apache.carbondata.core.scan.expression.exception.FilterUnsupportedException;
+import org.apache.carbondata.core.scan.filter.FilterUtil;
+import org.apache.carbondata.core.scan.filter.GenericQueryType;
+import org.apache.carbondata.core.scan.filter.executer.FilterExecuter;
+import org.apache.carbondata.core.scan.filter.intf.RowImpl;
+import org.apache.carbondata.core.scan.filter.intf.RowIntf;
+import org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf;
+import org.apache.carbondata.core.scan.model.QueryModel;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.core.util.DataTypeUtil;
+import org.apache.carbondata.format.BlockletHeader;
+import org.apache.carbondata.format.FileHeader;
+import org.apache.carbondata.hadoop.CarbonInputSplit;
+import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
+import org.apache.carbondata.hadoop.InputMetricsStats;
+import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
+import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.spark.memory.MemoryMode;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
+import org.apache.spark.sql.execution.vectorized.ColumnVector;
+import org.apache.spark.sql.execution.vectorized.ColumnarBatch;
+import org.apache.spark.sql.types.CalendarIntervalType;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.types.DecimalType;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.unsafe.types.CalendarInterval;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/**
+ * Stream record reader
+ */
+public class CarbonStreamRecordReader extends RecordReader<Void, Object> {
+ // vector reader
+ private boolean isVectorReader;
+
+ // metadata
+ private CarbonTable carbonTable;
+ private CarbonColumn[] storageColumns;
+ private boolean[] isRequired;
+ private DataType[] measureDataTypes;
+ private int dimensionCount;
+ private int measureCount;
+
+ // input
+ private FileSplit fileSplit;
+ private Configuration hadoopConf;
+ private StreamBlockletReader input;
+ private boolean isFirstRow = true;
+ private QueryModel model;
+
+ // decode data
+ private BitSet allNonNull;
+ private boolean[] isNoDictColumn;
+ private DirectDictionaryGenerator[] directDictionaryGenerators;
+ private CacheProvider cacheProvider;
+ private Cache<DictionaryColumnUniqueIdentifier, Dictionary> cache;
+ private GenericQueryType[] queryTypes;
+
+ // vectorized reader
+ private StructType outputSchema;
+ private ColumnarBatch columnarBatch;
+ private boolean isFinished = false;
+
+ // filter
+ private FilterExecuter filter;
+ private boolean[] isFilterRequired;
+ private Object[] filterValues;
+ private RowIntf filterRow;
+ private int[] filterMap;
+
+ // output
+ private CarbonColumn[] projection;
+ private boolean[] isProjectionRequired;
+ private int[] projectionMap;
+ private Object[] outputValues;
+ private InternalRow outputRow;
+
+ // empty project, null filter
+ private boolean skipScanData;
+
+ // return raw row for handoff
+ private boolean useRawRow = false;
+
+ // InputMetricsStats
+ private InputMetricsStats inputMetricsStats;
+
+ @Override public void initialize(InputSplit split, TaskAttemptContext context)
+ throws IOException, InterruptedException {
+ // input
+ if (split instanceof CarbonInputSplit) {
+ fileSplit = (CarbonInputSplit) split;
+ } else if (split instanceof CarbonMultiBlockSplit) {
+ fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
+ } else {
+ fileSplit = (FileSplit) split;
+ }
+
+ // metadata
+ hadoopConf = context.getConfiguration();
+ if (model == null) {
+ CarbonTableInputFormat format = new CarbonTableInputFormat<Object>();
+ model = format.createQueryModel(split, context);
+ }
+ carbonTable = model.getTable();
+ List<CarbonDimension> dimensions =
+ carbonTable.getDimensionByTableName(carbonTable.getTableName());
+ dimensionCount = dimensions.size();
+ List<CarbonMeasure> measures =
+ carbonTable.getMeasureByTableName(carbonTable.getTableName());
+ measureCount = measures.size();
+ List<CarbonColumn> carbonColumnList =
+ carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName());
+ storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
+ isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
+ directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
+ for (int i = 0; i < storageColumns.length; i++) {
+ if (storageColumns[i].hasEncoding(Encoding.DIRECT_DICTIONARY)) {
+ directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory
+ .getDirectDictionaryGenerator(storageColumns[i].getDataType());
+ }
+ }
+ measureDataTypes = new DataType[measureCount];
+ for (int i = 0; i < measureCount; i++) {
+ measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
+ }
+
+ // decode data
+ allNonNull = new BitSet(storageColumns.length);
+ projection = model.getProjectionColumns();
+
+ isRequired = new boolean[storageColumns.length];
+ boolean[] isFilterDimensions = model.getIsFilterDimensions();
+ boolean[] isFilterMeasures = model.getIsFilterMeasures();
+ isFilterRequired = new boolean[storageColumns.length];
+ filterMap = new int[storageColumns.length];
+ for (int i = 0; i < storageColumns.length; i++) {
+ if (storageColumns[i].isDimension()) {
+ if (isFilterDimensions[storageColumns[i].getOrdinal()]) {
+ isRequired[i] = true;
+ isFilterRequired[i] = true;
+ filterMap[i] = storageColumns[i].getOrdinal();
+ }
+ } else {
+ if (isFilterMeasures[storageColumns[i].getOrdinal()]) {
+ isRequired[i] = true;
+ isFilterRequired[i] = true;
+ filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
+ }
+ }
+ }
+
+ isProjectionRequired = new boolean[storageColumns.length];
+ projectionMap = new int[storageColumns.length];
+ for (int i = 0; i < storageColumns.length; i++) {
+ for (int j = 0; j < projection.length; j++) {
+ if (storageColumns[i].getColName().equals(projection[j].getColName())) {
+ isRequired[i] = true;
+ isProjectionRequired[i] = true;
+ projectionMap[i] = j;
+ break;
+ }
+ }
+ }
+
+ // initialize filter
+ if (null != model.getFilterExpressionResolverTree()) {
+ initializeFilter();
+ } else if (projection.length == 0) {
+ skipScanData = true;
+ }
+
+ }
+
+ private void initializeFilter() {
+
+ List<ColumnSchema> wrapperColumnSchemaList = CarbonUtil
+ .getColumnSchemaList(carbonTable.getDimensionByTableName(carbonTable.getTableName()),
+ carbonTable.getMeasureByTableName(carbonTable.getTableName()));
+ int[] dimLensWithComplex = new int[wrapperColumnSchemaList.size()];
+ for (int i = 0; i < dimLensWithComplex.length; i++) {
+ dimLensWithComplex[i] = Integer.MAX_VALUE;
+ }
+
+ int[] dictionaryColumnCardinality =
+ CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList);
+ SegmentProperties segmentProperties =
+ new SegmentProperties(wrapperColumnSchemaList, dictionaryColumnCardinality);
+ Map<Integer, GenericQueryType> complexDimensionInfoMap = new HashMap<>();
+
+ FilterResolverIntf resolverIntf = model.getFilterExpressionResolverTree();
+ filter = FilterUtil.getFilterExecuterTree(resolverIntf, segmentProperties,
+ complexDimensionInfoMap);
+ // for row filter, we need update column index
+ FilterUtil.updateIndexOfColumnExpression(resolverIntf.getFilterExpression(),
+ carbonTable.getDimensionOrdinalMax());
+
+ }
+
+ public void setQueryModel(QueryModel model) {
+ this.model = model;
+ }
+
+ private byte[] getSyncMarker(String filePath) throws IOException {
+ CarbonHeaderReader headerReader = new CarbonHeaderReader(filePath);
+ FileHeader header = headerReader.readHeader();
+ return header.getSync_marker();
+ }
+
+ public void setUseRawRow(boolean useRawRow) {
+ this.useRawRow = useRawRow;
+ }
+
+ private void initializeAtFirstRow() throws IOException {
+ filterValues = new Object[carbonTable.getDimensionOrdinalMax() + measureCount];
+ filterRow = new RowImpl();
+ filterRow.setValues(filterValues);
+
+ outputValues = new Object[projection.length];
+ outputRow = new GenericInternalRow(outputValues);
+
+ Path file = fileSplit.getPath();
+
+ byte[] syncMarker = getSyncMarker(file.toString());
+
+ FileSystem fs = file.getFileSystem(hadoopConf);
+
+ int bufferSize = Integer.parseInt(hadoopConf.get(CarbonStreamInputFormat.READ_BUFFER_SIZE,
+ CarbonStreamInputFormat.READ_BUFFER_SIZE_DEFAULT));
+
+ FSDataInputStream fileIn = fs.open(file, bufferSize);
+ fileIn.seek(fileSplit.getStart());
+ input = new StreamBlockletReader(syncMarker, fileIn, fileSplit.getLength(),
+ fileSplit.getStart() == 0);
+
+ cacheProvider = CacheProvider.getInstance();
+ cache = cacheProvider.createCache(CacheType.FORWARD_DICTIONARY);
+ queryTypes = CarbonStreamInputFormat.getComplexDimensions(carbonTable, storageColumns, cache);
+
+ outputSchema = new StructType((StructField[])
+ DataTypeUtil.getDataTypeConverter().convertCarbonSchemaToSparkSchema(projection));
+ }
+
+ @Override public boolean nextKeyValue() throws IOException, InterruptedException {
+ if (isFirstRow) {
+ isFirstRow = false;
+ initializeAtFirstRow();
+ }
+ if (isFinished) {
+ return false;
+ }
+
+ if (isVectorReader) {
+ return nextColumnarBatch();
+ }
+
+ return nextRow();
+ }
+
+ /**
+ * for vector reader, check next columnar batch
+ */
+ private boolean nextColumnarBatch() throws IOException {
+ boolean hasNext;
+ boolean scanMore = false;
+ do {
+ // move to the next blocklet
+ hasNext = input.nextBlocklet();
+ if (hasNext) {
+ // read blocklet header
+ BlockletHeader header = input.readBlockletHeader();
+ if (isScanRequired(header)) {
+ scanMore = !scanBlockletAndFillVector(header);
+ } else {
+ input.skipBlockletData(true);
+ scanMore = true;
+ }
+ } else {
+ isFinished = true;
+ scanMore = false;
+ }
+ } while (scanMore);
+ return hasNext;
+ }
+
+ /**
+ * check next Row
+ */
+ private boolean nextRow() throws IOException {
+ // read row one by one
+ try {
+ boolean hasNext;
+ boolean scanMore = false;
+ do {
+ hasNext = input.hasNext();
+ if (hasNext) {
+ if (skipScanData) {
+ input.nextRow();
+ scanMore = false;
+ } else {
+ if (useRawRow) {
+ // read the raw row for streaming handoff, which does not require decoding it
+ readRawRowFromStream();
+ } else {
+ readRowFromStream();
+ }
+ if (null != filter) {
+ scanMore = !filter.applyFilter(filterRow, carbonTable.getDimensionOrdinalMax());
+ } else {
+ scanMore = false;
+ }
+ }
+ } else {
+ if (input.nextBlocklet()) {
+ BlockletHeader header = input.readBlockletHeader();
+ if (isScanRequired(header)) {
+ if (skipScanData) {
+ input.skipBlockletData(false);
+ } else {
+ input.readBlockletData(header);
+ }
+ } else {
+ input.skipBlockletData(true);
+ }
+ scanMore = true;
+ } else {
+ isFinished = true;
+ scanMore = false;
+ }
+ }
+ } while (scanMore);
+ return hasNext;
+ } catch (FilterUnsupportedException e) {
+ throw new IOException("Failed to filter row in detail reader", e);
+ }
+ }
+
+ @Override public Void getCurrentKey() throws IOException, InterruptedException {
+ return null;
+ }
+
+ @Override public Object getCurrentValue() throws IOException, InterruptedException {
+ if (isVectorReader) {
+ int value = columnarBatch.numValidRows();
+ if (inputMetricsStats != null) {
+ inputMetricsStats.incrementRecordRead((long) value);
+ }
+
+ return columnarBatch;
+ }
+
+ if (inputMetricsStats != null) {
+ inputMetricsStats.incrementRecordRead(1L);
+ }
+
+ return outputRow;
+ }
+
+ private boolean isScanRequired(BlockletHeader header) {
+ // TODO: implement min-max index based pruning; currently every blocklet is scanned
+ if (null == filter) {
+ return true;
+ }
+ return true;
+ }
+
+ private boolean scanBlockletAndFillVector(BlockletHeader header) throws IOException {
+ // if filter is null and output projection is empty, use the row number of blocklet header
+ if (skipScanData) {
+ int rowNums = header.getBlocklet_info().getNum_rows();
+ columnarBatch = ColumnarBatch.allocate(outputSchema, MemoryMode.OFF_HEAP, rowNums);
+ columnarBatch.setNumRows(rowNums);
+ input.skipBlockletData(true);
+ return rowNums > 0;
+ }
+
+ input.readBlockletData(header);
+ columnarBatch = ColumnarBatch.allocate(outputSchema, MemoryMode.OFF_HEAP, input.getRowNums());
+ int rowNum = 0;
+ if (null == filter) {
+ while (input.hasNext()) {
+ readRowFromStream();
+ putRowToColumnBatch(rowNum++);
+ }
+ } else {
+ try {
+ while (input.hasNext()) {
+ readRowFromStream();
+ if (filter.applyFilter(filterRow, carbonTable.getDimensionOrdinalMax())) {
+ putRowToColumnBatch(rowNum++);
+ }
+ }
+ } catch (FilterUnsupportedException e) {
+ throw new IOException("Failed to filter row in vector reader", e);
+ }
+ }
+ columnarBatch.setNumRows(rowNum);
+ return rowNum > 0;
+ }
+
+ private void readRowFromStream() {
+ input.nextRow();
+ short nullLen = input.readShort();
+ BitSet nullBitSet = allNonNull;
+ if (nullLen > 0) {
+ nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
+ }
+ int colCount = 0;
+ // primitive type dimension
+ for (; colCount < isNoDictColumn.length; colCount++) {
+ if (nullBitSet.get(colCount)) {
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = null;
+ }
+ } else {
+ if (isNoDictColumn[colCount]) {
+ int v = input.readShort();
+ if (isRequired[colCount]) {
+ byte[] b = input.readBytes(v);
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = b;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] =
+ DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(b,
+ storageColumns[colCount].getDataType());
+ }
+ } else {
+ input.skipBytes(v);
+ }
+ } else if (null != directDictionaryGenerators[colCount]) {
+ if (isRequired[colCount]) {
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = input.copy(4);
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] =
+ directDictionaryGenerators[colCount].getValueFromSurrogate(input.readInt());
+ } else {
+ input.skipBytes(4);
+ }
+ } else {
+ input.skipBytes(4);
+ }
+ } else {
+ if (isRequired[colCount]) {
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = input.copy(4);
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = input.readInt();
+ } else {
+ input.skipBytes(4);
+ }
+ } else {
+ input.skipBytes(4);
+ }
+ }
+ }
+ }
+ // complex type dimension
+ for (; colCount < dimensionCount; colCount++) {
+ if (nullBitSet.get(colCount)) {
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = null;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = null;
+ }
+ } else {
+ short v = input.readShort();
+ if (isRequired[colCount]) {
+ byte[] b = input.readBytes(v);
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = b;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = queryTypes[colCount]
+ .getDataBasedOnDataTypeFromSurrogates(ByteBuffer.wrap(b));
+ }
+ } else {
+ input.skipBytes(v);
+ }
+ }
+ }
+ // measure
+ DataType dataType;
+ for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
+ if (nullBitSet.get(colCount)) {
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = null;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = null;
+ }
+ } else {
+ dataType = measureDataTypes[msrCount];
+ if (dataType == DataTypes.BOOLEAN) {
+ if (isRequired[colCount]) {
+ boolean v = input.readBoolean();
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = v;
+ }
+ } else {
+ input.skipBytes(1);
+ }
+ } else if (dataType == DataTypes.SHORT) {
+ if (isRequired[colCount]) {
+ short v = input.readShort();
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = v;
+ }
+ } else {
+ input.skipBytes(2);
+ }
+ } else if (dataType == DataTypes.INT) {
+ if (isRequired[colCount]) {
+ int v = input.readInt();
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = v;
+ }
+ } else {
+ input.skipBytes(4);
+ }
+ } else if (dataType == DataTypes.LONG) {
+ if (isRequired[colCount]) {
+ long v = input.readLong();
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = v;
+ }
+ } else {
+ input.skipBytes(8);
+ }
+ } else if (dataType == DataTypes.DOUBLE) {
+ if (isRequired[colCount]) {
+ double v = input.readDouble();
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] = v;
+ }
+ } else {
+ input.skipBytes(8);
+ }
+ } else if (DataTypes.isDecimal(dataType)) {
+ int len = input.readShort();
+ if (isRequired[colCount]) {
+ BigDecimal v = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
+ if (isFilterRequired[colCount]) {
+ filterValues[filterMap[colCount]] = v;
+ }
+ if (isProjectionRequired[colCount]) {
+ outputValues[projectionMap[colCount]] =
+ DataTypeUtil.getDataTypeConverter().convertFromBigDecimalToDecimal(v);
+ }
+ } else {
+ input.skipBytes(len);
+ }
+ }
+ }
+ }
+ }
+
+ private void readRawRowFromStream() {
+ input.nextRow();
+ short nullLen = input.readShort();
+ BitSet nullBitSet = allNonNull;
+ if (nullLen > 0) {
+ nullBitSet = BitSet.valueOf(input.readBytes(nullLen));
+ }
+ int colCount = 0;
+ // primitive type dimension
+ for (; colCount < isNoDictColumn.length; colCount++) {
+ if (nullBitSet.get(colCount)) {
+ outputValues[colCount] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
+ } else {
+ if (isNoDictColumn[colCount]) {
+ int v = input.readShort();
+ outputValues[colCount] = input.readBytes(v);
+ } else {
+ outputValues[colCount] = input.readInt();
+ }
+ }
+ }
+ // complex type dimension
+ for (; colCount < dimensionCount; colCount++) {
+ if (nullBitSet.get(colCount)) {
+ outputValues[colCount] = null;
+ } else {
+ short v = input.readShort();
+ outputValues[colCount] = input.readBytes(v);
+ }
+ }
+ // measure
+ DataType dataType;
+ for (int msrCount = 0; msrCount < measureCount; msrCount++, colCount++) {
+ if (nullBitSet.get(colCount)) {
+ outputValues[colCount] = null;
+ } else {
+ dataType = measureDataTypes[msrCount];
+ if (dataType == DataTypes.BOOLEAN) {
+ outputValues[colCount] = input.readBoolean();
+ } else if (dataType == DataTypes.SHORT) {
+ outputValues[colCount] = input.readShort();
+ } else if (dataType == DataTypes.INT) {
+ outputValues[colCount] = input.readInt();
+ } else if (dataType == DataTypes.LONG) {
+ outputValues[colCount] = input.readLong();
+ } else if (dataType == DataTypes.DOUBLE) {
+ outputValues[colCount] = input.readDouble();
+ } else if (DataTypes.isDecimal(dataType)) {
+ int len = input.readShort();
+ outputValues[colCount] = DataTypeUtil.byteToBigDecimal(input.readBytes(len));
+ }
+ }
+ }
+ }
+
+ private void putRowToColumnBatch(int rowId) {
+ for (int i = 0; i < projection.length; i++) {
+ Object value = outputValues[i];
+ ColumnVector col = columnarBatch.column(i);
+ org.apache.spark.sql.types.DataType t = col.dataType();
+ if (null == value) {
+ col.putNull(rowId);
+ } else {
+ if (t == org.apache.spark.sql.types.DataTypes.BooleanType) {
+ col.putBoolean(rowId, (boolean)value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.ByteType) {
+ col.putByte(rowId, (byte) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.ShortType) {
+ col.putShort(rowId, (short) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.IntegerType) {
+ col.putInt(rowId, (int) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.LongType) {
+ col.putLong(rowId, (long) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.FloatType) {
+ col.putFloat(rowId, (float) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.DoubleType) {
+ col.putDouble(rowId, (double) value);
+ } else if (t == org.apache.spark.sql.types.DataTypes.StringType) {
+ UTF8String v = (UTF8String) value;
+ col.putByteArray(rowId, v.getBytes());
+ } else if (t instanceof org.apache.spark.sql.types.DecimalType) {
+ DecimalType dt = (DecimalType)t;
+ Decimal d = Decimal.fromDecimal(value);
+ if (dt.precision() <= Decimal.MAX_INT_DIGITS()) {
+ col.putInt(rowId, (int)d.toUnscaledLong());
+ } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) {
+ col.putLong(rowId, d.toUnscaledLong());
+ } else {
+ final BigInteger integer = d.toJavaBigDecimal().unscaledValue();
+ byte[] bytes = integer.toByteArray();
+ col.putByteArray(rowId, bytes, 0, bytes.length);
+ }
+ } else if (t instanceof CalendarIntervalType) {
+ CalendarInterval c = (CalendarInterval) value;
+ col.getChildColumn(0).putInt(rowId, c.months);
+ col.getChildColumn(1).putLong(rowId, c.microseconds);
+ } else if (t instanceof org.apache.spark.sql.types.DateType) {
+ col.putInt(rowId, (int) value);
+ } else if (t instanceof org.apache.spark.sql.types.TimestampType) {
+ col.putLong(rowId, (long) value);
+ }
+ }
+ }
+ }
+
+ @Override public float getProgress() throws IOException, InterruptedException {
+ return 0;
+ }
+
+ public void setVectorReader(boolean isVectorReader) {
+ this.isVectorReader = isVectorReader;
+ }
+
+ public void setInputMetricsStats(InputMetricsStats inputMetricsStats) {
+ this.inputMetricsStats = inputMetricsStats;
+ }
+
+ @Override public void close() throws IOException {
+ if (null != input) {
+ input.close();
+ }
+ if (null != columnarBatch) {
+ columnarBatch.close();
+ }
+ }
+}
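The decoding above follows a fixed stream row layout: a short null-bitset length, the bitset bytes, then the dimensions (no-dictionary columns as a short length plus bytes, dictionary columns as a 4-byte surrogate), and finally the measures encoded by data type. The simplified sketch below decodes that layout from a plain DataInputStream; it ignores complex dimensions, decimals and the reader's filter/projection bookkeeping, so it illustrates the byte layout only and is not the reader used by this patch.

  import java.io.DataInputStream
  import java.util.BitSet

  object StreamRowLayoutSketch {
    // isNoDictColumn covers the primitive dimensions; measureTypes names the measure data types
    def decodeRow(in: DataInputStream,
                  isNoDictColumn: Array[Boolean],
                  measureTypes: Array[String]): Array[Any] = {
      val values = new Array[Any](isNoDictColumn.length + measureTypes.length)

      // 1. null bitset: a short length followed by the bitset bytes (length 0 means "all non-null")
      val nullLen = in.readShort().toInt
      val nullBits =
        if (nullLen > 0) { val b = new Array[Byte](nullLen); in.readFully(b); BitSet.valueOf(b) }
        else new BitSet()

      // 2. primitive dimensions: no-dict columns as (short length, bytes), dict columns as int surrogates
      var col = 0
      while (col < isNoDictColumn.length) {
        if (!nullBits.get(col)) {
          if (isNoDictColumn(col)) {
            val len = in.readShort().toInt
            val bytes = new Array[Byte](len); in.readFully(bytes)
            values(col) = bytes
          } else {
            values(col) = in.readInt()
          }
        }
        col += 1
      }

      // 3. measures, encoded according to their data type
      measureTypes.zipWithIndex.foreach { case (t, i) =>
        if (!nullBits.get(col + i)) {
          values(col + i) = t match {
            case "BOOLEAN" => in.readBoolean()
            case "SHORT"   => in.readShort()
            case "INT"     => in.readInt()
            case "LONG"    => in.readLong()
            case "DOUBLE"  => in.readDouble()
            case other     => sys.error(s"unsupported measure type in this sketch: $other")
          }
        }
      }
      values
    }
  }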
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordWriter.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordWriter.java b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordWriter.java
new file mode 100644
index 0000000..4e555d3
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/CarbonStreamRecordWriter.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+
+import org.apache.carbondata.common.logging.LogService;
+import org.apache.carbondata.common.logging.LogServiceFactory;
+import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.datastore.row.CarbonRow;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
+import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.util.CarbonMetadataUtil;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.core.util.DataTypeUtil;
+import org.apache.carbondata.core.util.path.CarbonTablePath;
+import org.apache.carbondata.format.FileHeader;
+import org.apache.carbondata.processing.loading.BadRecordsLogger;
+import org.apache.carbondata.processing.loading.BadRecordsLoggerProvider;
+import org.apache.carbondata.processing.loading.CarbonDataLoadConfiguration;
+import org.apache.carbondata.processing.loading.DataField;
+import org.apache.carbondata.processing.loading.DataLoadProcessBuilder;
+import org.apache.carbondata.processing.loading.converter.RowConverter;
+import org.apache.carbondata.processing.loading.converter.impl.RowConverterImpl;
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
+import org.apache.carbondata.processing.loading.parser.RowParser;
+import org.apache.carbondata.processing.loading.parser.impl.RowParserImpl;
+import org.apache.carbondata.processing.store.writer.AbstractFactDataWriter;
+import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskID;
+
+/**
+ * Stream record writer
+ */
+public class CarbonStreamRecordWriter extends RecordWriter<Void, Object> {
+
+ private static final LogService LOGGER =
+ LogServiceFactory.getLogService(CarbonStreamRecordWriter.class.getName());
+
+ // basic info
+ private Configuration hadoopConf;
+ private CarbonLoadModel carbonLoadModel;
+ private CarbonDataLoadConfiguration configuration;
+ private CarbonTable carbonTable;
+ private int maxRowNums;
+ private int maxCacheSize;
+
+ // parser and converter
+ private RowParser rowParser;
+ private BadRecordsLogger badRecordLogger;
+ private RowConverter converter;
+ private CarbonRow currentRow = new CarbonRow(null);
+
+ // encoder
+ private DataField[] dataFields;
+ private BitSet nullBitSet;
+ private boolean[] isNoDictionaryDimensionColumn;
+ private int dimensionWithComplexCount;
+ private int measureCount;
+ private DataType[] measureDataTypes;
+ private StreamBlockletWriter output = null;
+
+ // data write
+ private String segmentDir;
+ private String fileName;
+ private DataOutputStream outputStream;
+ private boolean isFirstRow = true;
+ private boolean hasException = false;
+
+ CarbonStreamRecordWriter(TaskAttemptContext job) throws IOException {
+ initialize(job);
+ }
+
+ public CarbonStreamRecordWriter(TaskAttemptContext job, CarbonLoadModel carbonLoadModel)
+ throws IOException {
+ this.carbonLoadModel = carbonLoadModel;
+ initialize(job);
+ }
+
+ private void initialize(TaskAttemptContext job) throws IOException {
+ // set basic information
+ hadoopConf = job.getConfiguration();
+ if (carbonLoadModel == null) {
+ carbonLoadModel = CarbonStreamOutputFormat.getCarbonLoadModel(hadoopConf);
+ if (carbonLoadModel == null) {
+ throw new IOException(
+ "CarbonStreamRecordWriter require configuration: mapreduce.output.carbon.load.model");
+ }
+ }
+ String segmentId = CarbonStreamOutputFormat.getSegmentId(hadoopConf);
+ carbonLoadModel.setSegmentId(segmentId);
+ carbonTable = carbonLoadModel.getCarbonDataLoadSchema().getCarbonTable();
+ long taskNo = TaskID.forName(hadoopConf.get("mapred.tip.id")).getId();
+ carbonLoadModel.setTaskNo("" + taskNo);
+ configuration = DataLoadProcessBuilder.createConfiguration(carbonLoadModel);
+ maxRowNums = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_STREAM_BLOCKLET_ROW_NUMS,
+ CarbonStreamOutputFormat.CARBON_STREAM_BLOCKLET_ROW_NUMS_DEFAULT) - 1;
+ maxCacheSize = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_STREAM_CACHE_SIZE,
+ CarbonStreamOutputFormat.CARBON_STREAM_CACHE_SIZE_DEFAULT);
+
+ segmentDir = CarbonTablePath.getSegmentPath(
+ carbonTable.getAbsoluteTableIdentifier().getTablePath(), segmentId);
+ fileName = CarbonTablePath.getCarbonDataFileName(0, taskNo, 0, 0, "0");
+ }
+
+ private void initializeAtFirstRow() throws IOException, InterruptedException {
+
+ // initialize metadata
+ isNoDictionaryDimensionColumn =
+ CarbonDataProcessorUtil.getNoDictionaryMapping(configuration.getDataFields());
+ dimensionWithComplexCount = configuration.getDimensionCount();
+ measureCount = configuration.getMeasureCount();
+ dataFields = configuration.getDataFields();
+ measureDataTypes = new DataType[measureCount];
+ for (int i = 0; i < measureCount; i++) {
+ measureDataTypes[i] =
+ dataFields[dimensionWithComplexCount + i].getColumn().getDataType();
+ }
+
+ // initialize parser and converter
+ rowParser = new RowParserImpl(dataFields, configuration);
+ badRecordLogger = BadRecordsLoggerProvider.createBadRecordLogger(configuration);
+ converter = new RowConverterImpl(configuration.getDataFields(), configuration, badRecordLogger);
+ configuration.setCardinalityFinder(converter);
+ converter.initialize();
+
+ // initialize encoder
+ nullBitSet = new BitSet(dataFields.length);
+ int rowBufferSize = hadoopConf.getInt(CarbonStreamOutputFormat.CARBON_ENCODER_ROW_BUFFER_SIZE,
+ CarbonStreamOutputFormat.CARBON_ENCODER_ROW_BUFFER_SIZE_DEFAULT);
+ output = new StreamBlockletWriter(maxCacheSize, maxRowNums, rowBufferSize);
+
+ // initialize data writer
+ String filePath = segmentDir + File.separator + fileName;
+ FileFactory.FileType fileType = FileFactory.getFileType(filePath);
+ CarbonFile carbonFile = FileFactory.getCarbonFile(filePath, fileType);
+ if (carbonFile.exists()) {
+ // if the file already exists, use the append API
+ outputStream = FileFactory.getDataOutputStreamUsingAppend(filePath, fileType);
+ } else {
+ // if the file does not exist, use the create API and write the file header
+ outputStream = FileFactory.getDataOutputStream(filePath, fileType);
+ writeFileHeader();
+ }
+
+ isFirstRow = false;
+ }
+
+ @Override public void write(Void key, Object value) throws IOException, InterruptedException {
+ if (isFirstRow) {
+ initializeAtFirstRow();
+ }
+
+ // parse and convert row
+ currentRow.setData(rowParser.parseRow((Object[]) value));
+ converter.convert(currentRow);
+
+ // null bit set
+ nullBitSet.clear();
+ for (int i = 0; i < dataFields.length; i++) {
+ if (null == currentRow.getObject(i)) {
+ nullBitSet.set(i);
+ }
+ }
+ output.nextRow();
+ byte[] b = nullBitSet.toByteArray();
+ output.writeShort(b.length);
+ if (b.length > 0) {
+ output.writeBytes(b);
+ }
+ int dimCount = 0;
+ Object columnValue;
+
+ // primitive type dimension
+ for (; dimCount < isNoDictionaryDimensionColumn.length; dimCount++) {
+ columnValue = currentRow.getObject(dimCount);
+ if (null != columnValue) {
+ if (isNoDictionaryDimensionColumn[dimCount]) {
+ byte[] col = (byte[]) columnValue;
+ output.writeShort(col.length);
+ output.writeBytes(col);
+ } else {
+ output.writeInt((int) columnValue);
+ }
+ }
+ }
+ // complex type dimension
+ for (; dimCount < dimensionWithComplexCount; dimCount++) {
+ columnValue = currentRow.getObject(dimCount);
+ if (null != columnValue) {
+ byte[] col = (byte[]) columnValue;
+ output.writeShort(col.length);
+ output.writeBytes(col);
+ }
+ }
+ // measure
+ DataType dataType;
+ for (int msrCount = 0; msrCount < measureCount; msrCount++) {
+ columnValue = currentRow.getObject(dimCount + msrCount);
+ if (null != columnValue) {
+ dataType = measureDataTypes[msrCount];
+ if (dataType == DataTypes.BOOLEAN) {
+ output.writeBoolean((boolean) columnValue);
+ } else if (dataType == DataTypes.SHORT) {
+ output.writeShort((short) columnValue);
+ } else if (dataType == DataTypes.INT) {
+ output.writeInt((int) columnValue);
+ } else if (dataType == DataTypes.LONG) {
+ output.writeLong((long) columnValue);
+ } else if (dataType == DataTypes.DOUBLE) {
+ output.writeDouble((double) columnValue);
+ } else if (DataTypes.isDecimal(dataType)) {
+ BigDecimal val = (BigDecimal) columnValue;
+ byte[] bigDecimalInBytes = DataTypeUtil.bigDecimalToByte(val);
+ output.writeShort(bigDecimalInBytes.length);
+ output.writeBytes(bigDecimalInBytes);
+ } else {
+ String msg =
+ "unsupported data type:" + dataFields[dimCount + msrCount].getColumn().getDataType()
+ .getName();
+ LOGGER.error(msg);
+ throw new IOException(msg);
+ }
+ }
+ }
+
+ if (output.isFull()) {
+ appendBlockletToDataFile();
+ }
+ }
+
+ private void writeFileHeader() throws IOException {
+ List<ColumnSchema> wrapperColumnSchemaList = CarbonUtil
+ .getColumnSchemaList(carbonTable.getDimensionByTableName(carbonTable.getTableName()),
+ carbonTable.getMeasureByTableName(carbonTable.getTableName()));
+ int[] dimLensWithComplex = new int[wrapperColumnSchemaList.size()];
+ for (int i = 0; i < dimLensWithComplex.length; i++) {
+ dimLensWithComplex[i] = Integer.MAX_VALUE;
+ }
+ int[] dictionaryColumnCardinality =
+ CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList);
+ List<Integer> cardinality = new ArrayList<>();
+ List<org.apache.carbondata.format.ColumnSchema> columnSchemaList = AbstractFactDataWriter
+ .getColumnSchemaListAndCardinality(cardinality, dictionaryColumnCardinality,
+ wrapperColumnSchemaList);
+ FileHeader fileHeader =
+ CarbonMetadataUtil.getFileHeader(true, columnSchemaList, System.currentTimeMillis());
+ fileHeader.setIs_footer_present(false);
+ fileHeader.setIs_splitable(true);
+ fileHeader.setSync_marker(CarbonStreamOutputFormat.CARBON_SYNC_MARKER);
+ outputStream.write(CarbonUtil.getByteArray(fileHeader));
+ }
+
+ /**
+ * write a blocklet to file
+ */
+ private void appendBlockletToDataFile() throws IOException {
+ if (output.getRowIndex() == -1) {
+ return;
+ }
+ output.apppendBlocklet(outputStream);
+ outputStream.flush();
+ // reset data
+ output.reset();
+ }
+
+ @Override public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+ try {
+ // append the remaining buffered data
+ if (!hasException && !isFirstRow) {
+ appendBlockletToDataFile();
+ converter.finish();
+ }
+ } finally {
+ // close resource
+ CarbonUtil.closeStreams(outputStream);
+ if (output != null) {
+ output.close();
+ }
+ if (badRecordLogger != null) {
+ badRecordLogger.closeStreams();
+ }
+ }
+ }
+
+ public String getSegmentDir() {
+ return segmentDir;
+ }
+
+ public String getFileName() {
+ return fileName;
+ }
+
+ public void setHasException(boolean hasException) {
+ this.hasException = hasException;
+ }
+}
\ No newline at end of file
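
For orientation, write() above encodes each row in a fixed order: a length-prefixed null bitset, then the dimensions in schema order (dictionary columns as int surrogate keys, no-dictionary and complex columns as length-prefixed byte arrays), and finally the typed measures (nulls are skipped; the bitset records them). Below is a minimal standalone sketch of that layout, using java.io.DataOutputStream in place of StreamBlockletWriter; the column names and values are purely illustrative:

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.BitSet;

    public class RowLayoutSketch {
      public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);

        // example row: one dictionary dimension, one no-dictionary dimension, one null measure
        int dictSurrogate = 3;
        byte[] noDictValue = "berlin".getBytes(StandardCharsets.UTF_8);
        Long measure = null;

        // 1. null bitset over all columns, written as a short length plus the bitset bytes
        BitSet nullBitSet = new BitSet(3);
        if (measure == null) {
          nullBitSet.set(2);
        }
        byte[] b = nullBitSet.toByteArray();
        out.writeShort(b.length);
        out.write(b);

        // 2. dictionary dimension: the encoded surrogate key as an int
        out.writeInt(dictSurrogate);

        // 3. no-dictionary dimension: a short length followed by the raw bytes
        out.writeShort(noDictValue.length);
        out.write(noDictValue);

        // 4. measures: written only when not null, using the width of their data type
        if (measure != null) {
          out.writeLong(measure);
        }

        System.out.println("encoded row size in bytes: " + bytes.size());
      }
    }

The encoded rows accumulate in StreamBlockletWriter and are flushed to the segment data file as one blocklet once the row-count or cache-size limit is reached.
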
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletReader.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletReader.java b/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletReader.java
new file mode 100644
index 0000000..43fe6ed
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletReader.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.carbondata.core.datastore.compression.Compressor;
+import org.apache.carbondata.core.datastore.compression.CompressorFactory;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.format.BlockletHeader;
+
+/**
+ * stream blocklet reader
+ */
+public class StreamBlockletReader {
+
+ private byte[] buffer;
+ private int offset;
+ private final byte[] syncMarker;
+ private final byte[] syncBuffer;
+ private final int syncLen;
+ private long pos = 0;
+ private final InputStream in;
+ private final long limitStart;
+ private final long limitEnd;
+ private boolean isAlreadySync = false;
+ private Compressor compressor = CompressorFactory.getInstance().getCompressor();
+ private int rowNums = 0;
+ private int rowIndex = 0;
+ private boolean isHeaderPresent;
+
+ StreamBlockletReader(byte[] syncMarker, InputStream in, long limit, boolean isHeaderPresent) {
+ this.syncMarker = syncMarker;
+ syncLen = syncMarker.length;
+ syncBuffer = new byte[syncLen];
+ this.in = in;
+ limitStart = limit;
+ limitEnd = limitStart + syncLen;
+ this.isHeaderPresent = isHeaderPresent;
+ }
+
+ private void ensureCapacity(int capacity) {
+ if (buffer == null || capacity > buffer.length) {
+ buffer = new byte[capacity];
+ }
+ }
+
+ /**
+ * find the first occurrence of the sync marker in the input stream
+ */
+ private boolean sync() throws IOException {
+ if (!readBytesFromStream(syncBuffer, 0, syncLen)) {
+ return false;
+ }
+ boolean skipHeader = false;
+ for (int i = 0; i < limitStart; i++) {
+ int j = 0;
+ for (; j < syncLen; j++) {
+ if (syncMarker[j] != syncBuffer[(i + j) % syncLen]) break;
+ }
+ if (syncLen == j) {
+ if (isHeaderPresent) {
+ if (skipHeader) {
+ return true;
+ } else {
+ skipHeader = true;
+ }
+ } else {
+ return true;
+ }
+ }
+ int value = in.read();
+ if (-1 == value) {
+ return false;
+ }
+ syncBuffer[i % syncLen] = (byte) value;
+ pos++;
+ }
+ return false;
+ }
+
+ BlockletHeader readBlockletHeader() throws IOException {
+ int len = readIntFromStream();
+ byte[] b = new byte[len];
+ if (!readBytesFromStream(b, 0, len)) {
+ throw new EOFException("Failed to read blocklet header");
+ }
+ BlockletHeader header = CarbonUtil.readBlockletHeader(b);
+ rowNums = header.getBlocklet_info().getNum_rows();
+ rowIndex = 0;
+ return header;
+ }
+
+ void readBlockletData(BlockletHeader header) throws IOException {
+ ensureCapacity(header.getBlocklet_length());
+ offset = 0;
+ int len = readIntFromStream();
+ byte[] b = new byte[len];
+ if (!readBytesFromStream(b, 0, len)) {
+ throw new EOFException("Failed to read blocklet data");
+ }
+ compressor.rawUncompress(b, buffer);
+ }
+
+ void skipBlockletData(boolean reset) throws IOException {
+ int len = readIntFromStream();
+ skip(len);
+ pos += len;
+ if (reset) {
+ this.rowNums = 0;
+ this.rowIndex = 0;
+ }
+ }
+
+ private void skip(int len) throws IOException {
+ long remaining = len;
+ do {
+ long skipLen = in.skip(remaining);
+ remaining -= skipLen;
+ } while (remaining > 0);
+ }
+
+ /**
+ * find the next blocklet
+ */
+ boolean nextBlocklet() throws IOException {
+ if (pos >= limitStart) {
+ return false;
+ }
+ if (isAlreadySync) {
+ if (!readBytesFromStream(syncBuffer, 0, syncLen)) {
+ return false;
+ }
+ } else {
+ isAlreadySync = true;
+ if (!sync()) {
+ return false;
+ }
+ }
+
+ return pos < limitEnd;
+ }
+
+ boolean hasNext() throws IOException {
+ return rowIndex < rowNums;
+ }
+
+ void nextRow() {
+ rowIndex++;
+ }
+
+ int readIntFromStream() throws IOException {
+ int ch1 = in.read();
+ int ch2 = in.read();
+ int ch3 = in.read();
+ int ch4 = in.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) throw new EOFException();
+ pos += 4;
+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
+ }
+
+ /**
+ * Reads <code>len</code> bytes of data from the input stream into
+ * an array of bytes.
+ * @return <code>true</code> if the data was read successfully, or
+ * <code>false</code> if there is no more data because the end of the stream has been reached.
+ */
+ boolean readBytesFromStream(byte[] b, int offset, int len) throws IOException {
+ int readLen = in.read(b, offset, len);
+ if (readLen < 0) {
+ return false;
+ }
+ pos += readLen;
+ if (readLen < len) {
+ return readBytesFromStream(b, offset + readLen, len - readLen);
+ } else {
+ return true;
+ }
+ }
+
+ boolean readBoolean() {
+ return (buffer[offset++]) != 0;
+ }
+
+ short readShort() {
+ short v = (short) ((buffer[offset + 1] & 255) +
+ ((buffer[offset]) << 8));
+ offset += 2;
+ return v;
+ }
+
+ byte[] copy(int len) {
+ byte[] b = new byte[len];
+ System.arraycopy(buffer, offset, b, 0, len);
+ return b;
+ }
+
+ int readInt() {
+ int v = ((buffer[offset + 3] & 255) +
+ ((buffer[offset + 2] & 255) << 8) +
+ ((buffer[offset + 1] & 255) << 16) +
+ ((buffer[offset]) << 24));
+ offset += 4;
+ return v;
+ }
+
+ long readLong() {
+ long v = ((long)(buffer[offset + 7] & 255)) +
+ ((long) (buffer[offset + 6] & 255) << 8) +
+ ((long) (buffer[offset + 5] & 255) << 16) +
+ ((long) (buffer[offset + 4] & 255) << 24) +
+ ((long) (buffer[offset + 3] & 255) << 32) +
+ ((long) (buffer[offset + 2] & 255) << 40) +
+ ((long) (buffer[offset + 1] & 255) << 48) +
+ ((long) (buffer[offset]) << 56);
+ offset += 8;
+ return v;
+ }
+
+ double readDouble() {
+ return Double.longBitsToDouble(readLong());
+ }
+
+ byte[] readBytes(int len) {
+ byte[] b = new byte[len];
+ System.arraycopy(buffer, offset, b, 0, len);
+ offset += len;
+ return b;
+ }
+
+ void skipBytes(int len) {
+ offset += len;
+ }
+
+ int getRowNums() {
+ return rowNums;
+ }
+
+ void close() {
+ CarbonUtil.closeStreams(in);
+ }
+}
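
The fixed-width decoders above (readShort/readInt/readLong) read big-endian values from the uncompressed blocklet buffer, mirroring the byte order used by StreamBlockletWriter. A quick standalone check of that round trip, assuming nothing beyond the JDK:

    import java.nio.ByteBuffer;

    public class ByteOrderSketch {
      public static void main(String[] args) {
        int val = 123456789;

        // encode the way StreamBlockletWriter.writeInt does (most significant byte first)
        byte[] buffer = new byte[4];
        buffer[3] = (byte) (val);
        buffer[2] = (byte) (val >>> 8);
        buffer[1] = (byte) (val >>> 16);
        buffer[0] = (byte) (val >>> 24);

        // decode the way StreamBlockletReader.readInt does
        int decoded = ((buffer[3] & 255) +
            ((buffer[2] & 255) << 8) +
            ((buffer[1] & 255) << 16) +
            ((buffer[0]) << 24));

        // both agree with the JDK's big-endian default
        System.out.println(decoded == val);                          // true
        System.out.println(ByteBuffer.wrap(buffer).getInt() == val); // true
      }
    }
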
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletWriter.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletWriter.java b/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletWriter.java
new file mode 100644
index 0000000..509e2aa
--- /dev/null
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/StreamBlockletWriter.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.carbondata.core.datastore.compression.Compressor;
+import org.apache.carbondata.core.datastore.compression.CompressorFactory;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.format.BlockletHeader;
+import org.apache.carbondata.format.BlockletInfo;
+import org.apache.carbondata.format.MutationType;
+
+/**
+ * stream blocklet writer
+ */
+public class StreamBlockletWriter {
+ private byte[] buffer;
+ private int maxSize;
+ private int maxRowNum;
+ private int rowSize;
+ private int count = 0;
+ private int rowIndex = -1;
+ private Compressor compressor = CompressorFactory.getInstance().getCompressor();
+
+ StreamBlockletWriter(int maxSize, int maxRowNum, int rowSize) {
+ buffer = new byte[maxSize];
+ this.maxSize = maxSize;
+ this.maxRowNum = maxRowNum;
+ this.rowSize = rowSize;
+ }
+
+ private void ensureCapacity(int space) {
+ int newcount = space + count;
+ if (newcount > buffer.length) {
+ byte[] newbuf = new byte[Math.max(newcount, buffer.length + rowSize)];
+ System.arraycopy(buffer, 0, newbuf, 0, count);
+ buffer = newbuf;
+ }
+ }
+
+ void reset() {
+ count = 0;
+ rowIndex = -1;
+ }
+
+ byte[] getBytes() {
+ return buffer;
+ }
+
+ int getCount() {
+ return count;
+ }
+
+ int getRowIndex() {
+ return rowIndex;
+ }
+
+ void nextRow() {
+ rowIndex++;
+ }
+
+ boolean isFull() {
+ return rowIndex == maxRowNum || count >= maxSize;
+ }
+
+ void writeBoolean(boolean val) {
+ ensureCapacity(1);
+ buffer[count] = (byte) (val ? 1 : 0);
+ count += 1;
+ }
+
+ void writeShort(int val) {
+ ensureCapacity(2);
+ buffer[count + 1] = (byte) (val);
+ buffer[count] = (byte) (val >>> 8);
+ count += 2;
+ }
+
+ void writeInt(int val) {
+ ensureCapacity(4);
+ buffer[count + 3] = (byte) (val);
+ buffer[count + 2] = (byte) (val >>> 8);
+ buffer[count + 1] = (byte) (val >>> 16);
+ buffer[count] = (byte) (val >>> 24);
+ count += 4;
+ }
+
+ void writeLong(long val) {
+ ensureCapacity(8);
+ buffer[count + 7] = (byte) (val);
+ buffer[count + 6] = (byte) (val >>> 8);
+ buffer[count + 5] = (byte) (val >>> 16);
+ buffer[count + 4] = (byte) (val >>> 24);
+ buffer[count + 3] = (byte) (val >>> 32);
+ buffer[count + 2] = (byte) (val >>> 40);
+ buffer[count + 1] = (byte) (val >>> 48);
+ buffer[count] = (byte) (val >>> 56);
+ count += 8;
+ }
+
+ void writeDouble(double val) {
+ writeLong(Double.doubleToLongBits(val));
+ }
+
+ void writeBytes(byte[] b) {
+ writeBytes(b, 0, b.length);
+ }
+
+ void writeBytes(byte[] b, int off, int len) {
+ ensureCapacity(len);
+ System.arraycopy(b, off, buffer, count, len);
+ count += len;
+ }
+
+ void apppendBlocklet(DataOutputStream outputStream) throws IOException {
+ outputStream.write(CarbonStreamOutputFormat.CARBON_SYNC_MARKER);
+
+ BlockletInfo blockletInfo = new BlockletInfo();
+ blockletInfo.setNum_rows(getRowIndex() + 1);
+ BlockletHeader blockletHeader = new BlockletHeader();
+ blockletHeader.setBlocklet_length(getCount());
+ blockletHeader.setMutation(MutationType.INSERT);
+ blockletHeader.setBlocklet_info(blockletInfo);
+ byte[] headerBytes = CarbonUtil.getByteArray(blockletHeader);
+ outputStream.writeInt(headerBytes.length);
+ outputStream.write(headerBytes);
+
+ byte[] compressed = compressor.compressByte(getBytes(), getCount());
+ outputStream.writeInt(compressed.length);
+ outputStream.write(compressed);
+ }
+
+ void close() {
+ }
+}
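
Each blocklet appended to the stream data file is framed as: the sync marker, a 4-byte length plus the Thrift-encoded BlockletHeader, then a 4-byte length plus the compressed row buffer; StreamBlockletReader.nextBlocklet(), readBlockletHeader() and readBlockletData() consume exactly this framing. The sketch below writes the same framing with placeholder byte arrays so the layout is explicit; the marker and payload contents are invented for illustration and no Thrift or compressor is involved:

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    public class BlockletFramingSketch {
      public static void main(String[] args) throws IOException {
        // placeholders standing in for CARBON_SYNC_MARKER, the Thrift-serialized
        // BlockletHeader and the compressed row buffer
        byte[] syncMarker = "sync-marker-placeholder".getBytes(StandardCharsets.UTF_8);
        byte[] headerBytes = "blocklet-header-placeholder".getBytes(StandardCharsets.UTF_8);
        byte[] compressedData = "compressed-rows-placeholder".getBytes(StandardCharsets.UTF_8);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        out.write(syncMarker);               // 1. sync marker, used by the reader to locate blocklets
        out.writeInt(headerBytes.length);    // 2. header length
        out.write(headerBytes);              //    followed by the BlockletHeader bytes
        out.writeInt(compressedData.length); // 3. compressed data length
        out.write(compressedData);           //    followed by the compressed row buffer
        out.flush();

        System.out.println("framed blocklet size in bytes: " + bytes.size());
      }
    }
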
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/java/org/apache/carbondata/streaming/segment/StreamSegment.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/segment/StreamSegment.java b/streaming/src/main/java/org/apache/carbondata/streaming/segment/StreamSegment.java
index 8c9889d..9e83924 100644
--- a/streaming/src/main/java/org/apache/carbondata/streaming/segment/StreamSegment.java
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/segment/StreamSegment.java
@@ -42,8 +42,8 @@ import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.core.writer.CarbonIndexFileWriter;
import org.apache.carbondata.format.BlockIndex;
import org.apache.carbondata.format.BlockletIndex;
-import org.apache.carbondata.hadoop.streaming.CarbonStreamRecordWriter;
import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
+import org.apache.carbondata.streaming.CarbonStreamRecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
deleted file mode 100644
index 6d1fa45..0000000
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.streaming
-
-import org.apache.spark.scheduler.SparkListener
-import org.apache.spark.scheduler.SparkListenerApplicationEnd
-
-class CarbonSparkStreamingListener extends SparkListener {
-
- /**
- * When Spark Streaming App stops, remove all locks for stream table.
- */
- override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
- CarbonStreamSparkStreaming.cleanAllLockAfterStop()
- }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
deleted file mode 100644
index 4aa1517..0000000
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.streaming
-
-import java.util
-
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.execution.streaming.CarbonAppendableStreamSink
-import org.apache.spark.sql.execution.streaming.Sink
-import org.apache.spark.sql.SaveMode
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.streaming.Time
-
-import org.apache.carbondata.common.logging.LogServiceFactory
-import org.apache.carbondata.core.locks.{CarbonLockFactory, ICarbonLock, LockUsage}
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable
-
-/**
- * Interface used to write stream data to stream table
- * when integrate with Spark Streaming.
- *
- * NOTE: Current integration with Spark Streaming is an alpha feature.
- */
-class CarbonStreamSparkStreamingWriter(val sparkSession: SparkSession,
- val carbonTable: CarbonTable,
- val configuration: Configuration) {
-
- private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
-
- private var isInitialize: Boolean = false
-
- private var lock: ICarbonLock = null
- private var carbonAppendableStreamSink: Sink = null
-
- /**
- * Acquired the lock for stream table
- */
- def lockStreamTable(): Unit = {
- lock = CarbonLockFactory.getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier,
- LockUsage.STREAMING_LOCK)
- if (lock.lockWithRetries()) {
- LOGGER.info("Acquired the lock for stream table: " +
- carbonTable.getDatabaseName + "." +
- carbonTable.getTableName)
- } else {
- LOGGER.error("Not able to acquire the lock for stream table:" +
- carbonTable.getDatabaseName + "." + carbonTable.getTableName)
- throw new InterruptedException(
- "Not able to acquire the lock for stream table: " + carbonTable.getDatabaseName + "." +
- carbonTable.getTableName)
- }
- }
-
- /**
- * unlock for stream table
- */
- def unLockStreamTable(): Unit = {
- if (null != lock) {
- lock.unlock()
- LOGGER.info("unlock for stream table: " +
- carbonTable.getDatabaseName + "." +
- carbonTable.getTableName)
- }
- }
-
- def initialize(): Unit = {
- carbonAppendableStreamSink = StreamSinkFactory.createStreamTableSink(
- sparkSession,
- configuration,
- carbonTable,
- extraOptions.toMap).asInstanceOf[CarbonAppendableStreamSink]
-
- lockStreamTable()
-
- isInitialize = true
- }
-
- def writeStreamData(dataFrame: DataFrame, time: Time): Unit = {
- if (!isInitialize) {
- initialize()
- }
- carbonAppendableStreamSink.addBatch(time.milliseconds, dataFrame)
- }
-
- private val extraOptions = new scala.collection.mutable.HashMap[String, String]
- private var mode: SaveMode = SaveMode.ErrorIfExists
-
- this.option("dbName", carbonTable.getDatabaseName)
- this.option("tableName", carbonTable.getTableName)
-
- /**
- * Specifies the behavior when data or table already exists. Options include:
- * - `SaveMode.Overwrite`: overwrite the existing data.
- * - `SaveMode.Append`: append the data.
- * - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
- * - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
- */
- def mode(saveMode: SaveMode): CarbonStreamSparkStreamingWriter = {
- if (mode == SaveMode.ErrorIfExists) {
- mode = saveMode
- }
- this
- }
-
- /**
- * Specifies the behavior when data or table already exists. Options include:
- * - `overwrite`: overwrite the existing data.
- * - `append`: append the data.
- * - `ignore`: ignore the operation (i.e. no-op).
- * - `error or default`: default option, throw an exception at runtime.
- */
- def mode(saveMode: String): CarbonStreamSparkStreamingWriter = {
- if (mode == SaveMode.ErrorIfExists) {
- mode = saveMode.toLowerCase(util.Locale.ROOT) match {
- case "overwrite" => SaveMode.Overwrite
- case "append" => SaveMode.Append
- case "ignore" => SaveMode.Ignore
- case "error" | "default" => SaveMode.ErrorIfExists
- case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
- "Accepted save modes are 'overwrite', 'append', 'ignore', 'error'.")
- }
- }
- this
- }
-
- /**
- * Adds an output option
- */
- def option(key: String, value: String): CarbonStreamSparkStreamingWriter = {
- if (!extraOptions.contains(key)) {
- extraOptions += (key -> value)
- }
- this
- }
-
- /**
- * Adds an output option
- */
- def option(key: String, value: Boolean): CarbonStreamSparkStreamingWriter =
- option(key, value.toString)
-
- /**
- * Adds an output option
- */
- def option(key: String, value: Long): CarbonStreamSparkStreamingWriter =
- option(key, value.toString)
-
- /**
- * Adds an output option
- */
- def option(key: String, value: Double): CarbonStreamSparkStreamingWriter =
- option(key, value.toString)
-}
-
-object CarbonStreamSparkStreaming {
-
- @transient private val tableMap =
- new util.HashMap[String, CarbonStreamSparkStreamingWriter]()
-
- def getTableMap: util.Map[String, CarbonStreamSparkStreamingWriter] = tableMap
-
- /**
- * remove all stream lock.
- */
- def cleanAllLockAfterStop(): Unit = {
- tableMap.asScala.values.foreach { writer => writer.unLockStreamTable() }
- tableMap.clear()
- }
-}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/c723947a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
deleted file mode 100644
index 4df04b9..0000000
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.carbondata.streaming
-
-import java.text.SimpleDateFormat
-import java.util
-import java.util.Date
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.mapreduce.{Job, TaskAttemptID, TaskType}
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
-import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext}
-import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
-import org.apache.spark.sql.SparkSession
-
-import org.apache.carbondata.common.logging.LogServiceFactory
-import org.apache.carbondata.core.datamap.Segment
-import org.apache.carbondata.core.datastore.block.SegmentProperties
-import org.apache.carbondata.core.datastore.impl.FileFactory
-import org.apache.carbondata.core.locks.{CarbonLockFactory, LockUsage}
-import org.apache.carbondata.core.metadata.CarbonMetadata
-import org.apache.carbondata.core.metadata.schema.table.CarbonTable
-import org.apache.carbondata.core.scan.result.iterator.RawResultIterator
-import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager}
-import org.apache.carbondata.core.util.CarbonUtil
-import org.apache.carbondata.core.util.path.CarbonTablePath
-import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
-import org.apache.carbondata.hadoop.{CarbonInputSplit, CarbonProjection}
-import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
-import org.apache.carbondata.hadoop.streaming.{CarbonStreamInputFormat, CarbonStreamRecordReader}
-import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostStatusUpdateEvent, LoadTablePreStatusUpdateEvent}
-import org.apache.carbondata.processing.loading.model.CarbonLoadModel
-import org.apache.carbondata.processing.merger.{CompactionResultSortProcessor, CompactionType}
-import org.apache.carbondata.processing.util.CarbonLoaderUtil
-import org.apache.carbondata.spark.{HandoffResult, HandoffResultImpl}
-import org.apache.carbondata.spark.rdd.CarbonRDD
-import org.apache.carbondata.spark.util.CommonUtil
-
-
-/**
- * partition of the handoff segment
- */
-class HandoffPartition(
- val rddId: Int,
- val idx: Int,
- @transient val inputSplit: CarbonInputSplit
-) extends Partition {
-
- val split = new SerializableWritable[CarbonInputSplit](inputSplit)
-
- override val index: Int = idx
-
- override def hashCode(): Int = 41 * (41 + rddId) + idx
-}
-
-/**
- * package the record reader of the handoff segment to RawResultIterator
- */
-class StreamingRawResultIterator(
- recordReader: CarbonStreamRecordReader
-) extends RawResultIterator(null, null, null) {
-
- override def hasNext: Boolean = {
- recordReader.nextKeyValue()
- }
-
- override def next(): Array[Object] = {
- val rowTmp = recordReader
- .getCurrentValue
- .asInstanceOf[GenericInternalRow]
- .values
- .asInstanceOf[Array[Object]]
- val row = new Array[Object](rowTmp.length)
- System.arraycopy(rowTmp, 0, row, 0, rowTmp.length)
- row
- }
-}
-
-/**
- * execute streaming segment handoff
- */
-class StreamHandoffRDD[K, V](
- sc: SparkContext,
- result: HandoffResult[K, V],
- carbonLoadModel: CarbonLoadModel,
- handOffSegmentId: String
-) extends CarbonRDD[(K, V)](sc, Nil, sc.hadoopConfiguration) {
-
- private val jobTrackerId: String = {
- val formatter = new SimpleDateFormat("yyyyMMddHHmm")
- formatter.format(new Date())
- }
-
- override def internalCompute(
- split: Partition,
- context: TaskContext
- ): Iterator[(K, V)] = {
- carbonLoadModel.setTaskNo("" + split.index)
- val carbonTable = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable
- CarbonMetadata.getInstance().addCarbonTable(carbonTable)
- // the input iterator is using raw row
- val iteratorList = prepareInputIterator(split, carbonTable)
-
- CommonUtil.setTempStoreLocation(split.index, carbonLoadModel, true, false)
- // use CompactionResultSortProcessor to sort data dan write to columnar files
- val processor = prepareHandoffProcessor(carbonTable)
- val status = processor.execute(iteratorList)
-
- new Iterator[(K, V)] {
- private var finished = false
-
- override def hasNext: Boolean = {
- !finished
- }
-
- override def next(): (K, V) = {
- finished = true
- result.getKey("" + split.index, status)
- }
- }
- }
-
- /**
- * prepare input iterator by basing CarbonStreamRecordReader
- */
- private def prepareInputIterator(
- split: Partition,
- carbonTable: CarbonTable
- ): util.ArrayList[RawResultIterator] = {
- val inputSplit = split.asInstanceOf[HandoffPartition].split.value
- val attemptId = new TaskAttemptID(jobTrackerId, id, TaskType.MAP, split.index, 0)
- val hadoopConf = new Configuration()
- CarbonInputFormat.setDatabaseName(hadoopConf, carbonTable.getDatabaseName)
- CarbonInputFormat.setTableName(hadoopConf, carbonTable.getTableName)
- CarbonInputFormat.setTablePath(hadoopConf, carbonTable.getTablePath)
- val projection = new CarbonProjection
- val dataFields = carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName)
- (0 until dataFields.size()).foreach { index =>
- projection.addColumn(dataFields.get(index).getColName)
- }
- CarbonInputFormat.setColumnProjection(hadoopConf, projection)
- CarbonInputFormat.setTableInfo(hadoopConf, carbonTable.getTableInfo)
- val attemptContext = new TaskAttemptContextImpl(hadoopConf, attemptId)
- val format = new CarbonTableInputFormat[Array[Object]]()
- val model = format.createQueryModel(inputSplit, attemptContext)
- val inputFormat = new CarbonStreamInputFormat
- val streamReader = inputFormat.createRecordReader(inputSplit, attemptContext)
- .asInstanceOf[CarbonStreamRecordReader]
- streamReader.setVectorReader(false)
- streamReader.setQueryModel(model)
- streamReader.setUseRawRow(true)
- streamReader.initialize(inputSplit, attemptContext)
- val iteratorList = new util.ArrayList[RawResultIterator](1)
- iteratorList.add(new StreamingRawResultIterator(streamReader))
- iteratorList
- }
-
- private def prepareHandoffProcessor(
- carbonTable: CarbonTable
- ): CompactionResultSortProcessor = {
- val wrapperColumnSchemaList = CarbonUtil.getColumnSchemaList(
- carbonTable.getDimensionByTableName(carbonTable.getTableName),
- carbonTable.getMeasureByTableName(carbonTable.getTableName))
- val dimLensWithComplex =
- (0 until wrapperColumnSchemaList.size()).map(_ => Integer.MAX_VALUE).toArray
- val dictionaryColumnCardinality =
- CarbonUtil.getFormattedCardinality(dimLensWithComplex, wrapperColumnSchemaList)
- val segmentProperties =
- new SegmentProperties(wrapperColumnSchemaList, dictionaryColumnCardinality)
-
- new CompactionResultSortProcessor(
- carbonLoadModel,
- carbonTable,
- segmentProperties,
- CompactionType.STREAMING,
- carbonTable.getTableName,
- null
- )
- }
-
- /**
- * get the partitions of the handoff segment
- */
- override protected def getPartitions: Array[Partition] = {
- val job = Job.getInstance(FileFactory.getConfiguration)
- val inputFormat = new CarbonTableInputFormat[Array[Object]]()
- val segmentList = new util.ArrayList[Segment](1)
- segmentList.add(Segment.toSegment(handOffSegmentId))
- val splits = inputFormat.getSplitsOfStreaming(
- job,
- carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getAbsoluteTableIdentifier,
- segmentList
- )
-
- (0 until splits.size()).map { index =>
- new HandoffPartition(id, index, splits.get(index).asInstanceOf[CarbonInputSplit])
- }.toArray[Partition]
- }
-}
-
-object StreamHandoffRDD {
-
- private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
-
- def iterateStreamingHandoff(
- carbonLoadModel: CarbonLoadModel,
- sparkSession: SparkSession
- ): Unit = {
- val carbonTable = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable
- val identifier = carbonTable.getAbsoluteTableIdentifier
- var continueHandoff = false
- // require handoff lock on table
- val lock = CarbonLockFactory.getCarbonLockObj(identifier, LockUsage.HANDOFF_LOCK)
- try {
- if (lock.lockWithRetries()) {
- LOGGER.info("Acquired the handoff lock for table" +
- s" ${ carbonTable.getDatabaseName }.${ carbonTable.getTableName }")
- // handoff streaming segment one by one
- do {
- val segmentStatusManager = new SegmentStatusManager(identifier)
- var loadMetadataDetails: Array[LoadMetadataDetails] = null
- // lock table to read table status file
- val statusLock = segmentStatusManager.getTableStatusLock
- try {
- if (statusLock.lockWithRetries()) {
- loadMetadataDetails = SegmentStatusManager.readLoadMetadata(
- CarbonTablePath.getMetadataPath(identifier.getTablePath))
- }
- } finally {
- if (null != statusLock) {
- statusLock.unlock()
- }
- }
- if (null != loadMetadataDetails) {
- val streamSegments =
- loadMetadataDetails.filter(_.getSegmentStatus == SegmentStatus.STREAMING_FINISH)
-
- continueHandoff = streamSegments.length > 0
- if (continueHandoff) {
- // handoff a streaming segment
- val loadMetadataDetail = streamSegments(0)
- executeStreamingHandoff(
- carbonLoadModel,
- sparkSession,
- loadMetadataDetail.getLoadName
- )
- }
- } else {
- continueHandoff = false
- }
- } while (continueHandoff)
- }
- } finally {
- if (null != lock) {
- lock.unlock()
- }
- }
- }
-
- /**
- * start new thread to execute stream segment handoff
- */
- def startStreamingHandoffThread(
- carbonLoadModel: CarbonLoadModel,
- sparkSession: SparkSession,
- isDDL: Boolean
- ): Unit = {
- if (isDDL) {
- iterateStreamingHandoff(carbonLoadModel, sparkSession)
- } else {
- // start a new thread to execute streaming segment handoff
- val handoffThread = new Thread() {
- override def run(): Unit = {
- iterateStreamingHandoff(carbonLoadModel, sparkSession)
- }
- }
- handoffThread.start()
- }
- }
-
- /**
- * invoke StreamHandoffRDD to handoff a streaming segment to a columnar segment
- */
- def executeStreamingHandoff(
- carbonLoadModel: CarbonLoadModel,
- sparkSession: SparkSession,
- handoffSegmenId: String
- ): Unit = {
- var loadStatus = SegmentStatus.SUCCESS
- var errorMessage: String = "Handoff failure"
- try {
- // generate new columnar segment
- val newMetaEntry = new LoadMetadataDetails
- carbonLoadModel.setFactTimeStamp(System.currentTimeMillis())
- CarbonLoaderUtil.populateNewLoadMetaEntry(
- newMetaEntry,
- SegmentStatus.INSERT_IN_PROGRESS,
- carbonLoadModel.getFactTimeStamp,
- false)
- val operationContext = new OperationContext()
- val loadTablePreStatusUpdateEvent: LoadTablePreStatusUpdateEvent =
- new LoadTablePreStatusUpdateEvent(
- carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCarbonTableIdentifier,
- carbonLoadModel)
- OperationListenerBus.getInstance().fireEvent(loadTablePreStatusUpdateEvent, operationContext)
-
- CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, carbonLoadModel, true, false)
- val loadTablePostStatusUpdateEvent: LoadTablePostStatusUpdateEvent =
- new LoadTablePostStatusUpdateEvent(carbonLoadModel)
- OperationListenerBus.getInstance()
- .fireEvent(loadTablePostStatusUpdateEvent, operationContext)
- // convert a streaming segment to columnar segment
- val status = new StreamHandoffRDD(
- sparkSession.sparkContext,
- new HandoffResultImpl(),
- carbonLoadModel,
- handoffSegmenId).collect()
-
- status.foreach { x =>
- if (!x._2) {
- loadStatus = SegmentStatus.LOAD_FAILURE
- }
- }
- } catch {
- case ex: Exception =>
- loadStatus = SegmentStatus.LOAD_FAILURE
- errorMessage = errorMessage + ": " + ex.getCause.getMessage
- LOGGER.error(errorMessage)
- LOGGER.error(ex, s"Handoff failed on streaming segment $handoffSegmenId")
- }
-
- if (loadStatus == SegmentStatus.LOAD_FAILURE) {
- CarbonLoaderUtil.updateTableStatusForFailure(carbonLoadModel)
- LOGGER.info("********starting clean up**********")
- CarbonLoaderUtil.deleteSegment(carbonLoadModel, carbonLoadModel.getSegmentId.toInt)
- LOGGER.info("********clean up done**********")
- LOGGER.audit(s"Handoff is failed for " +
- s"${ carbonLoadModel.getDatabaseName }.${ carbonLoadModel.getTableName }")
- LOGGER.error("Cannot write load metadata file as handoff failed")
- throw new Exception(errorMessage)
- }
-
- if (loadStatus == SegmentStatus.SUCCESS) {
- val done = updateLoadMetadata(handoffSegmenId, carbonLoadModel)
- if (!done) {
- val errorMessage = "Handoff failed due to failure in table status updation."
- LOGGER.audit("Handoff is failed for " +
- s"${ carbonLoadModel.getDatabaseName }.${ carbonLoadModel.getTableName }")
- LOGGER.error("Handoff failed due to failure in table status updation.")
- throw new Exception(errorMessage)
- }
- done
- }
-
- }
-
- /**
- * update streaming segment and new columnar segment
- */
- private def updateLoadMetadata(
- handoffSegmentId: String,
- loadModel: CarbonLoadModel
- ): Boolean = {
- var status = false
- val metaDataFilepath = loadModel.getCarbonDataLoadSchema.getCarbonTable.getMetadataPath
- val identifier = loadModel.getCarbonDataLoadSchema.getCarbonTable.getAbsoluteTableIdentifier
- val metadataPath = CarbonTablePath.getMetadataPath(identifier.getTablePath)
- val fileType = FileFactory.getFileType(metadataPath)
- if (!FileFactory.isFileExist(metadataPath, fileType)) {
- FileFactory.mkdirs(metadataPath, fileType)
- }
- val tableStatusPath = CarbonTablePath.getTableStatusFilePath(identifier.getTablePath)
- val segmentStatusManager = new SegmentStatusManager(identifier)
- val carbonLock = segmentStatusManager.getTableStatusLock
- try {
- if (carbonLock.lockWithRetries()) {
- LOGGER.info(
- "Acquired lock for table" + loadModel.getDatabaseName() + "." + loadModel.getTableName()
- + " for table status updation")
- val listOfLoadFolderDetailsArray =
- SegmentStatusManager.readLoadMetadata(metaDataFilepath)
-
- // update new columnar segment to success status
- val newSegment =
- listOfLoadFolderDetailsArray.find(_.getLoadName.equals(loadModel.getSegmentId))
- if (newSegment.isEmpty) {
- throw new Exception("Failed to update table status for new segment")
- } else {
- newSegment.get.setSegmentStatus(SegmentStatus.SUCCESS)
- newSegment.get.setLoadEndTime(System.currentTimeMillis())
- }
-
- // update streaming segment to compacted status
- val streamSegment =
- listOfLoadFolderDetailsArray.find(_.getLoadName.equals(handoffSegmentId))
- if (streamSegment.isEmpty) {
- throw new Exception("Failed to update table status for streaming segment")
- } else {
- streamSegment.get.setSegmentStatus(SegmentStatus.COMPACTED)
- streamSegment.get.setMergedLoadName(loadModel.getSegmentId)
- }
-
- // refresh table status file
- SegmentStatusManager.writeLoadDetailsIntoFile(tableStatusPath, listOfLoadFolderDetailsArray)
- status = true
- } else {
- LOGGER.error("Not able to acquire the lock for Table status updation for table " + loadModel
- .getDatabaseName() + "." + loadModel.getTableName())
- }
- } finally {
- if (carbonLock.unlock()) {
- LOGGER.info("Table unlocked successfully after table status updation" +
- loadModel.getDatabaseName() + "." + loadModel.getTableName())
- } else {
- LOGGER.error("Unable to unlock Table lock for table" + loadModel.getDatabaseName() +
- "." + loadModel.getTableName() + " during table status updation")
- }
- }
- status
- }
-}