You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@carbondata.apache.org by GitBox <gi...@apache.org> on 2020/08/03 08:59:38 UTC

[GitHub] [carbondata] akashrn5 commented on a change in pull request #3865: [CARBONDATA-3928] Handled the Strings which length is greater than 32000 as a bad record.

akashrn5 commented on a change in pull request #3865:
URL: https://github.com/apache/carbondata/pull/3865#discussion_r464273711



##########
File path: integration/spark/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
##########
@@ -75,11 +75,10 @@ object CarbonScalaUtil {
         carbonLoadModel.getBinaryDecoder)
     } catch {
       case e: Exception =>
-        if (e.getMessage.startsWith(FieldConverter.stringLengthExceedErrorMsg)) {
-          val msg = s"Column ${carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable
-            .getCreateOrderColumn.get(idx).getColName} is too long," +
-            s" consider to use 'long_string_columns' table property."
-          LOGGER.error(msg, e)
+        if (e.getMessage.startsWith(CarbonCommonConstants.STRING_LENGTH_EXCEEDED_MESSAGE)) {
+          val msg = CarbonCommonConstants.STRING_LENGTH_EXCEEDED_MESSAGE.format(row,
+              carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn
+                .get(idx).getColName)

Review comment:
      Please add `LOGGER.error(msg, e)` here so the exception is still logged.

##########
File path: integration/spark/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
##########
@@ -170,21 +175,22 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach {
     sql("drop table if exists load32000bytes")
     sql("create table load32000bytes(name string) STORED AS carbondata")
     sql("insert into table load32000bytes select 'aaa'")
+    checkAnswer(sql("select count(*) from load32000bytes"), Seq(Row(1)))
 
-    assert(intercept[Exception] {
-      sql(s"load data local inpath '$testdata' into table load32000bytes OPTIONS ('FILEHEADER'='name')")
-    }.getMessage.contains("DataLoad failure: Dataload failed, String size cannot exceed 32000 bytes"))
+    // Below load will be inserted as null because Strings greater than 32000 is bad record.
+    sql(s"load data local inpath '$testdata' into table load32000bytes OPTIONS ('FILEHEADER'='name')")
+    checkAnswer(sql("select count(*) from load32000bytes"), Seq(Row(2)))
+    checkAnswer(sql("select * from load32000bytes"), Seq(Row("aaa"), Row(null)))
 
     val source = scala.io.Source.fromFile(testdata, CarbonCommonConstants.DEFAULT_CHARSET)
     val data = source.mkString
 
+    // Insert will throw exception as it is without converter step.
     intercept[Exception] {
       sql(s"insert into load32000bytes values('$data')")
     }
 
-    intercept[Exception] {
-      sql(s"update load32000bytes set(name)= ('$data')").show()
-    }
+    sql(s"update load32000bytes set(name)= ('$data')").show()

Review comment:
      If update no longer throws an exception, there is no need to keep the update command here.

##########
File path: processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/NonDictionaryFieldConverterImpl.java
##########
@@ -82,21 +83,25 @@ public Object convert(Object value, BadRecordLogHolder logHolder)
               .getBytesBasedOnDataTypeForNoDictionaryColumn(dimensionValue, dataType, dateFormat);
           if (dataType == DataTypes.STRING
               && parsedValue.length > CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
-            throw new CarbonDataLoadingException(String.format(
-                "Dataload failed, String size cannot exceed %d bytes,"
-                    + " please consider long string data type",
-                CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT));
+            logHolder.setReason(CarbonCommonConstants.STRING_LENGTH_EXCEEDED_MESSAGE);
+            String badRecordAction = CarbonProperties.getInstance()
+                .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION);
+            if (badRecordAction.equalsIgnoreCase("FORCE")) {

Review comment:
      Do not hardcode `"FORCE"`; use the static constant in all places.

##########
File path: integration/spark/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
##########
@@ -154,13 +154,18 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach {
     sql("CREATE TABLE load32000chardata(dim1 String, dim2 String, mes1 int) STORED AS carbondata")
     sql("CREATE TABLE load32000chardata_dup(dim1 String, dim2 String, mes1 int) STORED AS carbondata")
     sql(s"LOAD DATA LOCAL INPATH '$testdata' into table load32000chardata OPTIONS('FILEHEADER'='dim1,dim2,mes1')")
-    intercept[Exception] {
-      sql("insert into load32000chardata_dup select dim1,concat(load32000chardata.dim2,'aaaa'),mes1 from load32000chardata").show()
-    }
+    checkAnswer(sql("select count(*) from load32000chardata"), Seq(Row(3)))

Review comment:
      I think you have handled this specifically for the bad record action `FORCE`, and the test cases also cover only FORCE. Please handle all of the bad record actions and add a test case for each one.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org