You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2019/09/28 13:24:34 UTC
[carbondata] branch master updated: [CARBONDATA-3527] Fix 'String
length cannot exceed 32000 characters' issue when load data with
'GLOBAL_SORT' from csv files which include big complex type data
This is an automated email from the ASF dual-hosted git repository.
jackylk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git
The following commit(s) were added to refs/heads/master by this push:
new f64f3d7 [CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data
f64f3d7 is described below
commit f64f3d7aef20ccf067969a76ddf2e9c848df2f24
Author: Zhang Zhichao <44...@qq.com>
AuthorDate: Wed Sep 25 15:58:35 2019 +0800
[CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data
Problem:
When complex type data requires more than 32000 characters to represent in a csv file, and data is loaded with 'GLOBAL_SORT' from these csv files, it will throw a 'String length cannot exceed 32000 characters' exception.
Cause:
When 'GLOBAL_SORT' is used to load data from csv files, the files are read and the data is first stored in a StringArrayRow, where every value is held as a string. When 'CarbonScalaUtil.getString' is called in 'NewRddIterator.next', it checks the length of every value and throws the 'String length cannot exceed 32000 characters' exception — even for complex type data, which is legitimately stored as more than 32000 characters in the csv files.
Solution:
In 'FieldConverter.objectToString' (called in 'CarbonScalaUtil.getString'), if the data type of field is complex type, don't check the length.
This closes #3399
---
.../src/test/resources/complexdata3.csv | 10 +++++
.../complexType/TestComplexDataType.scala | 52 ++++++++++++++++++++++
.../spark/rdd/NewCarbonDataLoadRDD.scala | 6 ++-
.../carbondata/spark/util/CarbonScalaUtil.scala | 4 +-
.../streaming/parser/FieldConverter.scala | 14 +++---
5 files changed, 79 insertions(+), 7 deletions(-)
diff --git a/integration/spark-common-test/src/test/resources/complexdata3.csv b/integration/spark-common-test/src/test/resources/complexdata3.csv
new file mode 100644
index 0000000..63cd44b
--- /dev/null
+++ b/integration/spark-common-test/src/test/resources/complexdata3.csv
@@ -0,0 +1,10 @@
+e01a1773-bd37-40be-a1de-d7e74837a281 (0551)96116063 886 0031 5 (0551)46819921 853 4 0 1568220618904 50 asp fk 2745000 1 0 0 0 0 -0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002 [...]
+f72ce5cb-2ea6-423b-8c1f-6dadfd6f52e7 (0551)73382297 853 0031 4 (0551)73382297 49 9 0 1568275177770 1559 asp fk 5821000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\00 [...]
+e282ecb5-9be8-4a0e-8faf-d10e535ab877 13396633307 49 0031 9 13918448986 1 7 0 1568260253193 1150 asp fk 3884000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826 [...]
+01e36a06-b4fd-4638-862c-2785f9e4331b 13924865616 82 0031 0 0086(021)60080162 82 6 0 1568293725356 2108 asp fk 3152000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002 [...]
+a451790d-42f8-48e5-88f4-ba21118e63e6 13326037312 81 0031 8 (0551)17198025 852 2 0 1568294179731 2116 asp fk 1127000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0 [...]
+9d26e280-4e87-4cbe-a850-5965b7c36a4b 13376907227 44 0031 9 13376907227 82 3 0 1568302365552 2332 asp fk 2043000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.082 [...]
+c2eabec9-b8a7-405b-80d9-e73692d586f4 0086(021)77426829 81 0031 8 13326037312 44 0 0 1568252700180 945 asp fk 943000 1 0 0 0 0 -0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002-0 [...]
+04a548aa-a103-4ffd-b72c-81b6cb2ea420 0086(021)77426829 82 0031 2 13924865616 33 0 0 1568249850352 857 asp fk 2450000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002- [...]
+45c0ded1-c608-4a49-981d-faf720442a59 13378149447 49 0031 8 13376907227 1 5 0 1568289879606 2004 asp fk 3686000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826 [...]
+cff43f86-ae81-4bbc-90dd-b7de39bdda1b 0086(021)77426829 82 0031 2 0086(021)60080162 886 6 0 1568230183633 329 asp fk 1615000 1 0 0 0 0 -0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.138392 [...]
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
index 9d6b4d1..8ec1420 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
@@ -1127,4 +1127,56 @@ class TestComplexDataType extends QueryTest with BeforeAndAfterAll {
sql("drop table if exists hive_table")
}
+ test("[CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue when load data with 'GLOBAL_SORT' from csv files which include big complex type data") {
+ val tableName = "complexdata3_table"
+ sql(s"drop table if exists ${tableName}")
+ sql(
+ s"""
+ |CREATE TABLE IF NOT EXISTS ${tableName} (
+ | begin_time LONG,
+ | id string,
+ | phone string,
+ | other_phone string,
+ | vtl LONG,
+ | gender string,
+ | lang string,
+ | lang_dec string,
+ | phone_country string,
+ | phone_province string,
+ | phone_city string,
+ | other_phone_country string,
+ | other_phone_province string,
+ | other_phone_city string,
+ | call_type INT,
+ | begin_hhmm INT,
+ | ds string,
+ | voice_flag INT,
+ | dss string,
+ | dur LONG,
+ | modela array < array < FLOAT >>, modelb array < array < FLOAT >>, modela_pk array < array < FLOAT >>, modelb_pk array < array < FLOAT >>, modela_ms array < array < FLOAT >>, modelb_ms array < array < FLOAT >>, tl LONG,
+ | lang_sc FLOAT,
+ | nlp_sc FLOAT,
+ | create_time LONG,
+ | cdr_create_time LONG,
+ | fulltext string,
+ | tag_label string,
+ | tag_memo string,
+ | tag_listen string,
+ | tag_imp string,
+ | prop string,
+ | files string
+ | )
+ | STORED AS carbondata TBLPROPERTIES (
+ | 'SORT_COLUMNS' = 'begin_time,id,phone,other_phone,vtl,gender,lang,lang_dec,phone_country,phone_province,phone_city,other_phone_country,other_phone_province,other_phone_city,call_type,begin_hhmm,ds,voice_flag',
+ | 'SORT_SCOPE' = 'GLOBAL_SORT','LONG_STRING_COLUMNS' = 'fulltext,files')""".stripMargin)
+ sql(s"""LOAD DATA inpath '${resourcesPath}/complexdata3.csv' INTO table ${tableName}
+ options('DELIMITER'='\t','QUOTECHAR'='"','COMMENTCHAR'='#','HEADER'='false',
+ 'FILEHEADER'='id,phone,phone_country,phone_province,phone_city,other_phone,other_phone_country,other_phone_province,other_phone_city,call_type,begin_time,begin_hhmm,ds,dss,dur,voice_flag,modela,modelb,modela_pk,modelb_pk,modela_ms,modelb_ms,lang,lang_dec,lang_sc,gender,nlp_sc,tl,vtl,create_time,cdr_create_time,fulltext,tag_label,tag_memo,tag_listen,tag_imp,prop,files',
+ 'MULTILINE'='true','ESCAPECHAR'='\','COMPLEX_DELIMITER_LEVEL_1'='\\001','COMPLEX_DELIMITER_LEVEL_2'='\\002',
+ 'SINGLE_PASS'='TRUE')""")
+ checkAnswer(sql(s"select count(1) from ${tableName}"), Seq(Row(10)))
+ checkAnswer(sql(s"select modela[0][0], modela_ms[0][1] from ${tableName} where id = 'e01a1773-bd37-40be-a1de-d7e74837a281'"),
+ Seq(Row(0.0, 0.10781755)))
+ sql(s"drop table if exists ${tableName}")
+ }
}
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
index ac8224e..ce60a55 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
@@ -348,6 +348,9 @@ class NewRddIterator(rddIter: Iterator[Row],
private val isVarcharTypeMapping =
carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
carbonLoadModel.getTableName).asScala.map(_.getDataType == DataTypes.VARCHAR)
+ private val isComplexTypeMapping =
+ carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
+ carbonLoadModel.getTableName).asScala.map(_.isComplex())
def hasNext: Boolean = rddIter.hasNext
def next: Array[AnyRef] = {
@@ -356,7 +359,8 @@ class NewRddIterator(rddIter: Iterator[Row],
for (i <- 0 until columns.length) {
columns(i) = CarbonScalaUtil.getString(row.get(i), serializationNullFormat,
complexDelimiters, timeStampFormat, dateFormat,
- isVarcharType = i < isVarcharTypeMapping.size && isVarcharTypeMapping(i))
+ isVarcharType = i < isVarcharTypeMapping.size && isVarcharTypeMapping(i),
+ isComplexType = i < isComplexTypeMapping.size && isComplexTypeMapping(i))
}
columns
}
diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
index b0af2ea..d94c5d7 100644
--- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
+++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
@@ -66,9 +66,11 @@ object CarbonScalaUtil {
timeStampFormat: SimpleDateFormat,
dateFormat: SimpleDateFormat,
isVarcharType: Boolean = false,
+ isComplexType: Boolean = false,
level: Int = 0): String = {
FieldConverter.objectToString(value, serializationNullFormat, complexDelimiters,
- timeStampFormat, dateFormat, isVarcharType = isVarcharType, level)
+ timeStampFormat, dateFormat, isVarcharType = isVarcharType, isComplexType = isComplexType,
+ level)
}
/**
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
index 5c67dfb..0cf244a 100644
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
@@ -42,12 +42,13 @@ object FieldConverter {
timeStampFormat: SimpleDateFormat,
dateFormat: SimpleDateFormat,
isVarcharType: Boolean = false,
+ isComplexType: Boolean = false,
level: Int = 0): String = {
if (value == null) {
serializationNullFormat
} else {
value match {
- case s: String => if (!isVarcharType &&
+ case s: String => if (!isVarcharType && !isComplexType &&
s.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
throw new Exception("Dataload failed, String length cannot exceed " +
CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters")
@@ -68,23 +69,25 @@ object FieldConverter {
val delimiter = complexDelimiters.get(level)
val builder = new StringBuilder()
s.foreach { x =>
+ val nextLevel = level + 1
builder.append(objectToString(x, serializationNullFormat, complexDelimiters,
- timeStampFormat, dateFormat, isVarcharType, level + 1))
+ timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
.append(delimiter)
}
builder.substring(0, builder.length - delimiter.length())
// First convert the 'key' of Map and then append the keyValueDelimiter and then convert
// the 'value of the map and append delimiter
case m: scala.collection.Map[_, _] =>
+ val nextLevel = level + 2
val delimiter = complexDelimiters.get(level)
val keyValueDelimiter = complexDelimiters.get(level + 1)
val builder = new StringBuilder()
m.foreach { x =>
builder.append(objectToString(x._1, serializationNullFormat, complexDelimiters,
- timeStampFormat, dateFormat, isVarcharType, level + 2))
+ timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
.append(keyValueDelimiter)
builder.append(objectToString(x._2, serializationNullFormat, complexDelimiters,
- timeStampFormat, dateFormat, isVarcharType, level + 2))
+ timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
.append(delimiter)
}
builder.substring(0, builder.length - delimiter.length())
@@ -92,8 +95,9 @@ object FieldConverter {
val delimiter = complexDelimiters.get(level)
val builder = new StringBuilder()
for (i <- 0 until r.length) {
+ val nextLevel = level + 1
builder.append(objectToString(r(i), serializationNullFormat, complexDelimiters,
- timeStampFormat, dateFormat, isVarcharType, level + 1))
+ timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
.append(delimiter)
}
builder.substring(0, builder.length - delimiter.length())