You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ch...@apache.org on 2016/07/20 10:14:02 UTC

[34/50] [abbrv] incubator-carbondata git commit: [CARBONDATA-62] Invalid type for column values to be discarded while generating global dictionary (#830)

[CARBONDATA-62] Invalid type for column values to be discarded while generating global dictionary (#830)

If a value read from the raw data is not valid for its column's datatype, that value is discarded at data-loading time and null is inserted in its place when the data is stored in carbon format.

Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/31d824dd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/31d824dd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/31d824dd

Branch: refs/heads/master
Commit: 31d824ddbd6146a06e7095766bcb315cb803be99
Parents: 13a0df3
Author: manishgupta88 <to...@gmail.com>
Authored: Mon Jul 18 18:18:19 2016 +0530
Committer: Venkata Ramana G <g....@gmail.com>
Committed: Mon Jul 18 18:18:19 2016 +0530

----------------------------------------------------------------------
 .../core/constants/CarbonCommonConstants.java   |  5 +++
 .../org/carbondata/core/util/DataTypeUtil.java  | 37 ++++++++++++++++++
 .../spark/tasks/DictionaryWriterTask.scala      | 40 +++++++++++++++-----
 .../processing/datatypes/PrimitiveDataType.java |  9 ++++-
 .../csvbased/CarbonCSVBasedSeqGenStep.java      |  3 +-
 5 files changed, 80 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/31d824dd/core/src/main/java/org/carbondata/core/constants/CarbonCommonConstants.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/carbondata/core/constants/CarbonCommonConstants.java b/core/src/main/java/org/carbondata/core/constants/CarbonCommonConstants.java
index 5ca24b4..8dbe4dd 100644
--- a/core/src/main/java/org/carbondata/core/constants/CarbonCommonConstants.java
+++ b/core/src/main/java/org/carbondata/core/constants/CarbonCommonConstants.java
@@ -666,6 +666,11 @@ public final class CarbonCommonConstants {
    */
   public static final int INVALID_SURROGATE_KEY = -1;
 
+  /**
+   * surrogate key for MEMBER_DEFAULT_VAL
+   */
+  public static final int MEMBER_DEFAULT_VAL_SURROGATE_KEY = 1;
+
   public static final String INVALID_SEGMENT_ID = "-1";
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/31d824dd/core/src/main/java/org/carbondata/core/util/DataTypeUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/carbondata/core/util/DataTypeUtil.java b/core/src/main/java/org/carbondata/core/util/DataTypeUtil.java
index f14f29e..e6d6422 100644
--- a/core/src/main/java/org/carbondata/core/util/DataTypeUtil.java
+++ b/core/src/main/java/org/carbondata/core/util/DataTypeUtil.java
@@ -189,4 +189,41 @@ public final class DataTypeUtil {
       return null;
     }
   }
+
+  /**
+   * This method checks whether a given string value can be parsed as its datatype
+   *
+   * @param value    value to parse
+   * @param dataType datatype for that value
+   * @return true if the value parses as the datatype (or the type is not validated), else false
+   */
+  public static boolean validateColumnValueForItsDataType(String value, DataType dataType) {
+    try {
+      Object parsedValue = null;
+      // validation is not done for the timestamp datatype because a direct dictionary is
+      // generated for it; no dictionary file is created for a timestamp datatype column
+      switch (dataType) {
+        case DECIMAL:
+          parsedValue = new BigDecimal(value);
+          break;
+        case INT:
+          parsedValue = Integer.parseInt(value);
+          break;
+        case LONG:
+          parsedValue = Long.valueOf(value);
+          break;
+        case DOUBLE:
+          parsedValue = Double.valueOf(value);
+          break;
+        default:
+          return true;
+      }
+      if (null != parsedValue) {
+        return true;
+      }
+      return false;
+    } catch (Exception e) {
+      return false;
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/31d824dd/integration/spark/src/main/scala/org/carbondata/spark/tasks/DictionaryWriterTask.scala
----------------------------------------------------------------------
diff --git a/integration/spark/src/main/scala/org/carbondata/spark/tasks/DictionaryWriterTask.scala b/integration/spark/src/main/scala/org/carbondata/spark/tasks/DictionaryWriterTask.scala
index c526cda..aacf402 100644
--- a/integration/spark/src/main/scala/org/carbondata/spark/tasks/DictionaryWriterTask.scala
+++ b/integration/spark/src/main/scala/org/carbondata/spark/tasks/DictionaryWriterTask.scala
@@ -65,28 +65,48 @@ class DictionaryWriterTask(valuesBuffer: mutable.HashSet[String],
         if (model.dictFileExists(columnIndex)) {
           if (dictionary.getSurrogateKey(values(0)) == CarbonCommonConstants
             .INVALID_SURROGATE_KEY) {
-            writer.write(values(0))
-            distinctValues.add(values(0))
+            val parseSuccess = org.carbondata.core.util.DataTypeUtil
+              .validateColumnValueForItsDataType(values(0),
+                model.primDimensions(columnIndex).getDataType);
+            if (parseSuccess) {
+              writer.write(values(0))
+              distinctValues.add(values(0))
+            }
           }
           for (i <- 1 until values.length) {
             if (preValue != values(i)) {
               if (dictionary.getSurrogateKey(values(i)) ==
                   CarbonCommonConstants.INVALID_SURROGATE_KEY) {
-                writer.write(values(i))
-                distinctValues.add(values(i))
-                preValue = values(i)
+                val parseSuccess = org.carbondata.core.util.DataTypeUtil
+                  .validateColumnValueForItsDataType(values(i),
+                    model.primDimensions(columnIndex).getDataType);
+                if (parseSuccess) {
+                  writer.write(values(i))
+                  distinctValues.add(values(i))
+                  preValue = values(i)
+                }
               }
             }
           }
 
         } else {
-          writer.write(values(0))
-          distinctValues.add(values(0))
+          val parseSuccess = org.carbondata.core.util.DataTypeUtil
+            .validateColumnValueForItsDataType(values(0),
+              model.primDimensions(columnIndex).getDataType);
+          if (parseSuccess) {
+            writer.write(values(0))
+            distinctValues.add(values(0))
+          }
           for (i <- 1 until values.length) {
             if (preValue != values(i)) {
-              writer.write(values(i))
-              distinctValues.add(values(i))
-              preValue = values(i)
+              val parseSuccess = org.carbondata.core.util.DataTypeUtil
+                .validateColumnValueForItsDataType(values(i),
+                  model.primDimensions(columnIndex).getDataType);
+              if (parseSuccess) {
+                writer.write(values(i))
+                distinctValues.add(values(i))
+                preValue = values(i)
+              }
             }
           }
         }

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/31d824dd/processing/src/main/java/org/carbondata/processing/datatypes/PrimitiveDataType.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/carbondata/processing/datatypes/PrimitiveDataType.java b/processing/src/main/java/org/carbondata/processing/datatypes/PrimitiveDataType.java
index 3d1da8c..e63d727 100644
--- a/processing/src/main/java/org/carbondata/processing/datatypes/PrimitiveDataType.java
+++ b/processing/src/main/java/org/carbondata/processing/datatypes/PrimitiveDataType.java
@@ -155,8 +155,13 @@ public class PrimitiveDataType implements GenericDataType {
   public void parseStringAndWriteByteArray(String tableName, String inputString,
       String[] delimiter, int delimiterIndex, DataOutputStream dataOutputStream,
       CarbonCSVBasedDimSurrogateKeyGen surrogateKeyGen) throws KettleException, IOException {
-    dataOutputStream.writeInt(surrogateKeyGen.generateSurrogateKeys(inputString, tableName
-        + CarbonCommonConstants.UNDERSCORE + name, this.getColumnId()));
+    Integer surrogateKey = surrogateKeyGen
+        .generateSurrogateKeys(inputString, tableName + CarbonCommonConstants.UNDERSCORE + name,
+            this.getColumnId());
+    if (surrogateKey == CarbonCommonConstants.INVALID_SURROGATE_KEY) {
+      surrogateKey = CarbonCommonConstants.MEMBER_DEFAULT_VAL_SURROGATE_KEY;
+    }
+    dataOutputStream.writeInt(surrogateKey);
   }
 
   /*

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/31d824dd/processing/src/main/java/org/carbondata/processing/surrogatekeysgenerator/csvbased/CarbonCSVBasedSeqGenStep.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/carbondata/processing/surrogatekeysgenerator/csvbased/CarbonCSVBasedSeqGenStep.java b/processing/src/main/java/org/carbondata/processing/surrogatekeysgenerator/csvbased/CarbonCSVBasedSeqGenStep.java
index 237d793..2978ad2 100644
--- a/processing/src/main/java/org/carbondata/processing/surrogatekeysgenerator/csvbased/CarbonCSVBasedSeqGenStep.java
+++ b/processing/src/main/java/org/carbondata/processing/surrogatekeysgenerator/csvbased/CarbonCSVBasedSeqGenStep.java
@@ -1146,8 +1146,7 @@ public class CarbonCSVBasedSeqGenStep extends BaseStep {
             }
           }
           if (surrogateKeyForHrrchy[0] == CarbonCommonConstants.INVALID_SURROGATE_KEY) {
-            addEntryToBadRecords(r, inputColumnsSize, j, columnName);
-            return null;
+            surrogateKeyForHrrchy[0] = CarbonCommonConstants.MEMBER_DEFAULT_VAL_SURROGATE_KEY;
           }
         }
         for (int k = 0; k < surrogateKeyForHrrchy.length; k++) {