You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2019/09/28 13:24:34 UTC

[carbondata] branch master updated: [CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data

This is an automated email from the ASF dual-hosted git repository.

jackylk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git


The following commit(s) were added to refs/heads/master by this push:
     new f64f3d7  [CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data
f64f3d7 is described below

commit f64f3d7aef20ccf067969a76ddf2e9c848df2f24
Author: Zhang Zhichao <44...@qq.com>
AuthorDate: Wed Sep 25 15:58:35 2019 +0800

    [CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data
    
    Problem:
    When complex type data requires more than 32000 characters to represent in a csv file, and data is loaded with 'GLOBAL_SORT' from these csv files, it will throw a 'String length cannot exceed 32000 characters' exception.
    
    Cause:
    When 'GLOBAL_SORT' is used to load data from csv files, it reads the files and first stores the data in StringArrayRow, where all values are of type string. When 'CarbonScalaUtil.getString' is called in 'NewRddIterator.next', it checks the length of every value and throws a 'String length cannot exceed 32000 characters' exception, even for complex type data that is stored as more than 32000 characters in the csv files.
    
    Solution:
    In 'FieldConverter.objectToString' (called in 'CarbonScalaUtil.getString'), if the data type of field is complex type, don't check the length.
    
    This closes #3399
---
 .../src/test/resources/complexdata3.csv            | 10 +++++
 .../complexType/TestComplexDataType.scala          | 52 ++++++++++++++++++++++
 .../spark/rdd/NewCarbonDataLoadRDD.scala           |  6 ++-
 .../carbondata/spark/util/CarbonScalaUtil.scala    |  4 +-
 .../streaming/parser/FieldConverter.scala          | 14 +++---
 5 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/integration/spark-common-test/src/test/resources/complexdata3.csv b/integration/spark-common-test/src/test/resources/complexdata3.csv
new file mode 100644
index 0000000..63cd44b
--- /dev/null
+++ b/integration/spark-common-test/src/test/resources/complexdata3.csv
@@ -0,0 +1,10 @@
+e01a1773-bd37-40be-a1de-d7e74837a281	(0551)96116063	886	0031	5	(0551)46819921	853		4	0	1568220618904	50	asp	fk	2745000	1	0	0	0	0	-0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002 [...]
+f72ce5cb-2ea6-423b-8c1f-6dadfd6f52e7	(0551)73382297	853	0031	4	(0551)73382297	49		9	0	1568275177770	1559	asp	fk	5821000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\00 [...]
+e282ecb5-9be8-4a0e-8faf-d10e535ab877	13396633307	49	0031	9	13918448986	1		7	0	1568260253193	1150	asp	fk	3884000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826 [...]
+01e36a06-b4fd-4638-862c-2785f9e4331b	13924865616	82	0031	0	0086(021)60080162	82		6	0	1568293725356	2108	asp	fk	3152000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002 [...]
+a451790d-42f8-48e5-88f4-ba21118e63e6	13326037312	81	0031	8	(0551)17198025	852		2	0	1568294179731	2116	asp	fk	1127000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0 [...]
+9d26e280-4e87-4cbe-a850-5965b7c36a4b	13376907227	44	0031	9	13376907227	82		3	0	1568302365552	2332	asp	fk	2043000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.082 [...]
+c2eabec9-b8a7-405b-80d9-e73692d586f4	0086(021)77426829	81	0031	8	13326037312	44		0	0	1568252700180	945	asp	fk	943000	1	0	0	0	0	-0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002-0 [...]
+04a548aa-a103-4ffd-b72c-81b6cb2ea420	0086(021)77426829	82	0031	2	13924865616	33		0	0	1568249850352	857	asp	fk	2450000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002- [...]
+45c0ded1-c608-4a49-981d-faf720442a59	13378149447	49	0031	8	13376907227	1		5	0	1568289879606	2004	asp	fk	3686000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826 [...]
+cff43f86-ae81-4bbc-90dd-b7de39bdda1b	0086(021)77426829	82	0031	2	0086(021)60080162	886		6	0	1568230183633	329	asp	fk	1615000	1	0	0	0	0	-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.138392 [...]
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
index 9d6b4d1..8ec1420 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
@@ -1127,4 +1127,56 @@ class TestComplexDataType extends QueryTest with BeforeAndAfterAll {
     sql("drop table if exists hive_table")
   }
 
+  test("[CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data") {
+    val tableName = "complexdata3_table"
+    sql(s"drop table if exists ${tableName}")
+    sql(
+      s"""
+         |CREATE TABLE IF NOT EXISTS ${tableName} (
+         | begin_time LONG,
+         | id string,
+         | phone string,
+         | other_phone string,
+         | vtl LONG,
+         | gender string,
+         | lang string,
+         | lang_dec string,
+         | phone_country string,
+         | phone_province string,
+         | phone_city string,
+         | other_phone_country string,
+         | other_phone_province string,
+         | other_phone_city string,
+         | call_type INT,
+         | begin_hhmm INT,
+         | ds string,
+         | voice_flag INT,
+         | dss string,
+         | dur LONG,
+         | modela array < array < FLOAT >>, modelb array < array < FLOAT >>, modela_pk array < array < FLOAT >>, modelb_pk array < array < FLOAT >>, modela_ms array < array < FLOAT >>, modelb_ms array < array < FLOAT >>, tl LONG,
+         | lang_sc FLOAT,
+         | nlp_sc FLOAT,
+         | create_time LONG,
+         | cdr_create_time LONG,
+         | fulltext string,
+         | tag_label string,
+         | tag_memo string,
+         | tag_listen string,
+         | tag_imp string,
+         | prop string,
+         | files string
+         | )
+         | STORED AS carbondata TBLPROPERTIES (
+         | 'SORT_COLUMNS' = 'begin_time,id,phone,other_phone,vtl,gender,lang,lang_dec,phone_country,phone_province,phone_city,other_phone_country,other_phone_province,other_phone_city,call_type,begin_hhmm,ds,voice_flag',
+         | 'SORT_SCOPE' = 'GLOBAL_SORT','LONG_STRING_COLUMNS' = 'fulltext,files')""".stripMargin)
+    sql(s"""LOAD DATA inpath '${resourcesPath}/complexdata3.csv' INTO table ${tableName}
+        options('DELIMITER'='\t','QUOTECHAR'='"','COMMENTCHAR'='#','HEADER'='false',
+                'FILEHEADER'='id,phone,phone_country,phone_province,phone_city,other_phone,other_phone_country,other_phone_province,other_phone_city,call_type,begin_time,begin_hhmm,ds,dss,dur,voice_flag,modela,modelb,modela_pk,modelb_pk,modela_ms,modelb_ms,lang,lang_dec,lang_sc,gender,nlp_sc,tl,vtl,create_time,cdr_create_time,fulltext,tag_label,tag_memo,tag_listen,tag_imp,prop,files',
+                'MULTILINE'='true','ESCAPECHAR'='\','COMPLEX_DELIMITER_LEVEL_1'='\\001','COMPLEX_DELIMITER_LEVEL_2'='\\002',
+                'SINGLE_PASS'='TRUE')""")
+    checkAnswer(sql(s"select count(1) from ${tableName}"), Seq(Row(10)))
+    checkAnswer(sql(s"select modela[0][0], modela_ms[0][1] from ${tableName} where id = 'e01a1773-bd37-40be-a1de-d7e74837a281'"),
+      Seq(Row(0.0, 0.10781755)))
+    sql(s"drop table if exists ${tableName}")
+  }
 }
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
index ac8224e..ce60a55 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
@@ -348,6 +348,9 @@ class NewRddIterator(rddIter: Iterator[Row],
   private val isVarcharTypeMapping =
     carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
       carbonLoadModel.getTableName).asScala.map(_.getDataType == DataTypes.VARCHAR)
+  private val isComplexTypeMapping =
+    carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
+      carbonLoadModel.getTableName).asScala.map(_.isComplex())
   def hasNext: Boolean = rddIter.hasNext
 
   def next: Array[AnyRef] = {
@@ -356,7 +359,8 @@ class NewRddIterator(rddIter: Iterator[Row],
     for (i <- 0 until columns.length) {
       columns(i) = CarbonScalaUtil.getString(row.get(i), serializationNullFormat,
         complexDelimiters, timeStampFormat, dateFormat,
-        isVarcharType = i < isVarcharTypeMapping.size && isVarcharTypeMapping(i))
+        isVarcharType = i < isVarcharTypeMapping.size && isVarcharTypeMapping(i),
+        isComplexType = i < isComplexTypeMapping.size && isComplexTypeMapping(i))
     }
     columns
   }
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
index b0af2ea..d94c5d7 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
@@ -66,9 +66,11 @@ object CarbonScalaUtil {
       timeStampFormat: SimpleDateFormat,
       dateFormat: SimpleDateFormat,
       isVarcharType: Boolean = false,
+      isComplexType: Boolean = false,
       level: Int = 0): String = {
     FieldConverter.objectToString(value, serializationNullFormat, complexDelimiters,
-      timeStampFormat, dateFormat, isVarcharType = isVarcharType, level)
+      timeStampFormat, dateFormat, isVarcharType = isVarcharType, isComplexType = isComplexType,
+      level)
   }
 
   /**
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
index 5c67dfb..0cf244a 100644
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
@@ -42,12 +42,13 @@ object FieldConverter {
       timeStampFormat: SimpleDateFormat,
       dateFormat: SimpleDateFormat,
       isVarcharType: Boolean = false,
+      isComplexType: Boolean = false,
       level: Int = 0): String = {
     if (value == null) {
       serializationNullFormat
     } else {
       value match {
-        case s: String => if (!isVarcharType &&
+        case s: String => if (!isVarcharType && !isComplexType &&
                               s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
           throw new Exception("Dataload failed, String length cannot exceed " +
                               CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters")
@@ -68,23 +69,25 @@ object FieldConverter {
           val delimiter = complexDelimiters.get(level)
           val builder = new StringBuilder()
           s.foreach { x =>
+            val nextLevel = level + 1
             builder.append(objectToString(x, serializationNullFormat, complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 1))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())
         // First convert the 'key' of Map and then append the keyValueDelimiter and then convert
         // the 'value of the map and append delimiter
         case m: scala.collection.Map[_, _] =>
+          val nextLevel = level + 2
           val delimiter = complexDelimiters.get(level)
           val keyValueDelimiter = complexDelimiters.get(level + 1)
           val builder = new StringBuilder()
           m.foreach { x =>
             builder.append(objectToString(x._1, serializationNullFormat, complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 2))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(keyValueDelimiter)
             builder.append(objectToString(x._2, serializationNullFormat, complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 2))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())
@@ -92,8 +95,9 @@ object FieldConverter {
           val delimiter = complexDelimiters.get(level)
           val builder = new StringBuilder()
           for (i <- 0 until r.length) {
+            val nextLevel = level + 1
             builder.append(objectToString(r(i), serializationNullFormat, complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 1))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())