You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2018/08/07 13:10:05 UTC
[41/50] [abbrv] carbondata git commit: [CARBONDATA-2795] Add documentation for S3

[CARBONDATA-2795] Add documentation for S3

This closes #2576


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/e26a742c
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/e26a742c
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/e26a742c

Branch: refs/heads/external-format
Commit: e26a742cb7031cdf0dde3601a4d21616661f4b73
Parents: 6d6a5b2
Author: kunal642 <ku...@gmail.com>
Authored: Sun Jul 29 21:44:22 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sun Aug 5 16:51:47 2018 +0530

----------------------------------------------------------------------
 docs/configuration-parameters.md                |  7 +-
 docs/data-management-on-carbondata.md           |  8 ++
 docs/datamap/bloomfilter-datamap-guide.md       | 17 ++++
 docs/datamap/datamap-management.md              | 17 ++++
 docs/datamap/lucene-datamap-guide.md            | 17 ++++
 docs/datamap/preaggregate-datamap-guide.md      | 17 ++++
 docs/datamap/timeseries-datamap-guide.md        | 17 ++++
 docs/s3-guide.md                                | 91 ++++++++++++++++++++
 docs/sdk-guide.md                               | 17 ++++
 docs/streaming-guide.md                         | 17 ++++
 .../sql/CarbonDatasourceHadoopRelation.scala    |  2 +
 .../org/apache/spark/sql/CarbonSource.scala     |  2 +
 12 files changed, 228 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/configuration-parameters.md
----------------------------------------------------------------------
diff --git a/docs/configuration-parameters.md b/docs/configuration-parameters.md
index eee85e2..46b8bd0 100644
--- a/docs/configuration-parameters.md
+++ b/docs/configuration-parameters.md
@@ -109,7 +109,12 @@ This section provides the details of all the configurations required for CarbonD
 |---------------------------------------------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | carbon.sort.file.write.buffer.size | 16384 | File write buffer size used during sorting. Minimum allowed buffer size is 10240 byte and Maximum allowed buffer size is 10485760 byte. |
 | carbon.lock.type | LOCALLOCK | This configuration specifies the type of lock to be acquired during concurrent operations on table. There are following types of lock implementation: - LOCALLOCK: Lock is created on local file system as file. This lock is useful when only one spark driver (thrift server) runs on a machine and no other CarbonData spark application is launched concurrently. - HDFSLOCK: Lock is created on HDFS file system as file. This lock is useful when multiple CarbonData spark applications are launched and no ZooKeeper is running on cluster and HDFS supports file based locking. |
-| carbon.lock.path | TABLEPATH | This configuration specifies the path where lock files have to be created. Recommended to configure zookeeper lock type or configure HDFS lock path(to this property) in case of S3 file system as locking is not feasible on S3.
+| carbon.lock.path | TABLEPATH | Locks on the files are used to prevent concurrent operation from modifying the same files. This 
+configuration specifies the path where lock files have to be created. Recommended to configure 
+HDFS lock path(to this property) in case of S3 file system as locking is not feasible on S3. 
+**Note:** If this property is not set to HDFS location for S3 store, then there is a possibility 
+of data corruption because multiple data manipulation calls might try to update the status file 
+and as lock is not acquired before updation data might get overwritten. |
 | carbon.sort.intermediate.files.limit | 20 | Minimum number of intermediate files after which merged sort can be started (minValue = 2, maxValue=50). |
 | carbon.block.meta.size.reserved.percentage | 10 | Space reserved in percentage for writing block meta data in CarbonData file. |
 | carbon.csv.read.buffersize.byte | 1048576 | csv reading buffer size. |

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 41fd513..7cf6123 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -174,6 +174,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
       
       Local Dictionary size = ((memory occupied by each unique value * cardinality of the column) * number of columns)
       
+      **Bad Records Path:**
+      
+      This property is used to specify the location where bad records would be written.
+      
+      ```TBLPROPERTIES('BAD_RECORDS_PATH'='/opt/badrecords'')```
+      
 ### Example:
  
    ```
@@ -775,6 +781,8 @@ Users can specify which columns to include and exclude for local dictionary gene
   * If the IGNORE option is used, then bad records are neither loaded nor written to the separate CSV file.
   * In loaded data, if all records are bad records, the BAD_RECORDS_ACTION is invalid and the load operation fails.
   * The default maximum number of characters per column is 32000. If there are more than 32000 characters in a column, please refer to *String longer than 32000 characters* section.
+  * Since Bad Records Path can be specified in create, load and carbon properties. 
+  Therefore, value specified in load will have the highest priority, and value specified in carbon properties will have the least priority.
 
   Example:
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/datamap/bloomfilter-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/bloomfilter-datamap-guide.md b/docs/datamap/bloomfilter-datamap-guide.md
index ccbcabe..92810f8 100644
--- a/docs/datamap/bloomfilter-datamap-guide.md
+++ b/docs/datamap/bloomfilter-datamap-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData BloomFilter DataMap (Alpha Feature)
 
 * [DataMap Management](#datamap-management)

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/datamap/datamap-management.md
----------------------------------------------------------------------
diff --git a/docs/datamap/datamap-management.md b/docs/datamap/datamap-management.md
index 1695a23..23f1517 100644
--- a/docs/datamap/datamap-management.md
+++ b/docs/datamap/datamap-management.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData DataMap Management
 
 ## Overview

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/datamap/lucene-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/lucene-datamap-guide.md b/docs/datamap/lucene-datamap-guide.md
index 119b609..06cd194 100644
--- a/docs/datamap/lucene-datamap-guide.md
+++ b/docs/datamap/lucene-datamap-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData Lucene DataMap (Alpha Feature)
   
 * [DataMap Management](#datamap-management)

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/datamap/preaggregate-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/preaggregate-datamap-guide.md b/docs/datamap/preaggregate-datamap-guide.md
index d85f527..ff4c28e 100644
--- a/docs/datamap/preaggregate-datamap-guide.md
+++ b/docs/datamap/preaggregate-datamap-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData Pre-aggregate DataMap
   
 * [Quick Example](#quick-example)

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/datamap/timeseries-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/timeseries-datamap-guide.md b/docs/datamap/timeseries-datamap-guide.md
index 15ca3fc..135188d 100644
--- a/docs/datamap/timeseries-datamap-guide.md
+++ b/docs/datamap/timeseries-datamap-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData Timeseries DataMap
 
 * [Timeseries DataMap Introduction](#timeseries-datamap-intoduction)

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/s3-guide.md
----------------------------------------------------------------------
diff --git a/docs/s3-guide.md b/docs/s3-guide.md
new file mode 100644
index 0000000..2f4dfa9
--- /dev/null
+++ b/docs/s3-guide.md
@@ -0,0 +1,91 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
+#S3 Guide (Alpha Feature 1.4.1)
+
+Object storage is the recommended storage format in cloud as it can support storing large data 
+files. S3 APIs are widely used for accessing object stores. This can be 
+used to store or retrieve data on Amazon cloud, Huawei Cloud(OBS) or on any other object
+ stores conforming to S3 API.
+Storing data in cloud is advantageous as there are no restrictions on the size of 
+data and the data can be accessed from anywhere at any time.
+Carbondata can support any Object Storage that conforms to Amazon S3 API.
+Carbondata relies on Hadoop provided S3 filesystem APIs to access Object stores.
+
+#Writing to Object Storage
+
+To store carbondata files onto Object Store, `carbon.storelocation` property will have 
+to be configured with Object Store path in CarbonProperties file. 
+
+For example:
+```
+carbon.storelocation=s3a://mybucket/carbonstore.
+```
+
+If the existing store location cannot be changed or only specific tables need to be stored 
+onto cloud object store, it can be done so by specifying the `location` option in the create 
+table DDL command.
+
+For example:
+
+```
+CREATE TABLE IF NOT EXISTS db1.table1(col1 string, col2 int) STORED AS carbondata LOCATION 's3a://mybucket/carbonstore'
+``` 
+
+For more details on create table, Refer [data-management-on-carbondata](./data-management-on-carbondata.md#create-table)
+
+#Authentication
+
+Authentication properties will have to be configured to store the carbondata files on to S3 location. 
+
+Authentication properties can be set in any of the following ways:
+1. Set authentication properties in core-site.xml, refer 
+[hadoop authentication document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Authentication_properties)
+
+2. Set authentication properties in spark-defaults.conf.
+
+Example
+```
+spark.hadoop.fs.s3a.secret.key=123
+spark.hadoop.fs.s3a.access.key=456
+```
+
+3. Pass authentication properties with spark-submit as configuration.
+
+Example:
+```
+./bin/spark-submit --master yarn --conf spark.hadoop.fs.s3a.secret.key=123 --conf spark.hadoop.fs
+.s3a.access.key=456 --class=
+```  
+
+4. Set authentication properties to hadoop configuration object in sparkContext.
+
+Example:
+```
+sparkSession.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "123")
+sparkSession.sparkContext.hadoopConfiguration.set("fs.s3a.access.key","456")
+```
+
+#Recommendations
+
+1. Object Storage like S3 does not support file leasing mechanism(supported by HDFS) that is 
+required to take locks which ensure consistency between concurrent operations therefore, it is 
+recommended to set the configurable lock path property([carbon.lock.path](https://github.com/apache/carbondata/blob/master/docs/configuration-parameters.md#miscellaneous-configuration))
+ to a HDFS directory.
+2. Concurrent data manipulation operations are not supported. Object stores follow eventual 
+consistency semantics, i.e., any put request might take some time to reflect when trying to list
+.This behaviour causes not to ensure the data read is always consistent or latest.

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/sdk-guide.md
----------------------------------------------------------------------
diff --git a/docs/sdk-guide.md b/docs/sdk-guide.md
index c7bff59..e592aa5 100644
--- a/docs/sdk-guide.md
+++ b/docs/sdk-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # SDK Guide
 In the carbon jars package, there exist a carbondata-store-sdk-x.x.x-SNAPSHOT.jar, including SDK writer and reader.
 # SDK Writer

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/docs/streaming-guide.md
----------------------------------------------------------------------
diff --git a/docs/streaming-guide.md b/docs/streaming-guide.md
index a9284e6..32d24dc 100644
--- a/docs/streaming-guide.md
+++ b/docs/streaming-guide.md
@@ -1,3 +1,20 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more 
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership. 
+    The ASF licenses this file to you under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with 
+    the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software 
+    distributed under the License is distributed on an "AS IS" BASIS, 
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and 
+    limitations under the License.
+-->
+
 # CarbonData Streaming Ingestion
 
 ## Quick example

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDatasourceHadoopRelation.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDatasourceHadoopRelation.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDatasourceHadoopRelation.scala
index 3ce8c8c..b5842a9 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDatasourceHadoopRelation.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDatasourceHadoopRelation.scala
@@ -38,6 +38,7 @@ import org.apache.carbondata.core.metadata.schema.table.CarbonTable
 import org.apache.carbondata.core.scan.expression.Expression
 import org.apache.carbondata.core.scan.expression.logical.AndExpression
 import org.apache.carbondata.hadoop.CarbonProjection
+import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
 import org.apache.carbondata.spark.rdd.{CarbonScanRDD, SparkReadSupport}
 
 case class CarbonDatasourceHadoopRelation(
@@ -55,6 +56,7 @@ case class CarbonDatasourceHadoopRelation(
     caseInsensitiveMap("tablename"))
   lazy val databaseName: String = carbonTable.getDatabaseName
   lazy val tableName: String = carbonTable.getTableName
+  CarbonInputFormatUtil.setS3Configurations(sparkSession.sessionState.newHadoopConf())
   CarbonSession.updateSessionInfoToCurrentThread(sparkSession)
 
   @transient lazy val carbonRelation: CarbonRelation =

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e26a742c/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSource.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSource.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSource.scala
index 0d13d4c..b162294 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSource.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSource.scala
@@ -44,6 +44,7 @@ import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
 import org.apache.carbondata.core.metadata.schema.SchemaEvolutionEntry
 import org.apache.carbondata.core.metadata.schema.table.TableInfo
 import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil}
+import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
 import org.apache.carbondata.spark.CarbonOption
 import org.apache.carbondata.spark.util.CarbonScalaUtil
 import org.apache.carbondata.streaming.{CarbonStreamException, CarbonStreamingQueryListener, StreamSinkFactory}
@@ -328,6 +329,7 @@ object CarbonSource {
         .contains("true")
       tableInfo.setTransactionalTable(isTransactionalTable)
       if (isTransactionalTable && !metaStore.isReadFromHiveMetaStore) {
+        CarbonInputFormatUtil.setS3Configurations(sparkSession.sessionState.newHadoopConf())
         // save to disk
         metaStore.saveToDisk(tableInfo, properties("tablePath"))
         // remove schema string from map as we don't store carbon schema to hive metastore