You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2018/08/30 08:13:45 UTC

carbondata git commit: [CARBONDATA-2856][BloomDataMap] Fix bug in bloom index on multiple dictionary columns

Repository: carbondata
Updated Branches:
  refs/heads/master de0f54516 -> 7fc0c6b9f


[CARBONDATA-2856][BloomDataMap] Fix bug in bloom index on multiple dictionary columns

During direct generating bloom index for dictionary and date columns, we
will use MdkKeyGenerator to get the surrogate key. But the
MdkKeyGenerator is for all MDK, so we need to pad fake bytes to mdk.
Finally, the generator will only deal with the real parts of the mdk and
get the final value.

This closes #2635


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/7fc0c6b9
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/7fc0c6b9
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/7fc0c6b9

Branch: refs/heads/master
Commit: 7fc0c6b9f372cda66aa10fdcc0a0cb97ae38ab26
Parents: de0f545
Author: xuchuanyin <xu...@hust.edu.cn>
Authored: Wed Aug 15 10:36:48 2018 +0800
Committer: Jacky Li <ja...@qq.com>
Committed: Thu Aug 30 16:13:02 2018 +0800

----------------------------------------------------------------------
 .../apache/carbondata/core/util/CarbonUtil.java |  2 +-
 .../datamap/bloom/BloomDataMapWriter.java       | 38 +++-----------------
 .../BloomCoarseGrainDataMapFunctionSuite.scala  |  2 +-
 3 files changed, 6 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/7fc0c6b9/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java
index f182239..c247690 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java
@@ -1840,7 +1840,7 @@ public final class CarbonUtil {
         surrogate ^= data[startOffsetOfData + 3] & 0xFF;
         return surrogate;
       default:
-        throw new IllegalArgumentException("Int cannot me more than 4 bytes");
+        throw new IllegalArgumentException("Int cannot be more than 4 bytes");
     }
   }
   /**

http://git-wip-us.apache.org/repos/asf/carbondata/blob/7fc0c6b9/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java
----------------------------------------------------------------------
diff --git a/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java b/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java
index 1d01e66..cad9787 100644
--- a/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java
+++ b/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java
@@ -24,7 +24,6 @@ import java.util.Map;
 import org.apache.carbondata.common.annotations.InterfaceAudience;
 import org.apache.carbondata.core.datamap.Segment;
 import org.apache.carbondata.core.datastore.block.SegmentProperties;
-import org.apache.carbondata.core.keygenerator.KeyGenerator;
 import org.apache.carbondata.core.keygenerator.columnar.ColumnarSplitter;
 import org.apache.carbondata.core.metadata.datatype.DataTypes;
 import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
@@ -43,13 +42,10 @@ import org.apache.commons.collections.Predicate;
  */
 @InterfaceAudience.Internal
 public class BloomDataMapWriter extends AbstractBloomDataMapWriter {
-  private KeyGenerator keyGenerator;
   private ColumnarSplitter columnarSplitter;
   // for the dict/sort/date column, they are encoded in MDK,
   // this maps the index column name to the index in MDK
   private Map<String, Integer> indexCol2MdkIdx;
-  // this gives the reverse map to indexCol2MdkIdx
-  private Map<Integer, String> mdkIdx2IndexCol;
 
   BloomDataMapWriter(String tablePath, String dataMapName, List<CarbonColumn> indexColumns,
       Segment segment, String shardName, SegmentProperties segmentProperties,
@@ -58,10 +54,8 @@ public class BloomDataMapWriter extends AbstractBloomDataMapWriter {
     super(tablePath, dataMapName, indexColumns, segment, shardName, segmentProperties,
         bloomFilterSize, bloomFilterFpp, compressBloom);
 
-    keyGenerator = segmentProperties.getDimensionKeyGenerator();
     columnarSplitter = segmentProperties.getFixedLengthKeySplitter();
     this.indexCol2MdkIdx = new HashMap<>();
-    this.mdkIdx2IndexCol = new HashMap<>();
     int idx = 0;
     for (final CarbonDimension dimension : segmentProperties.getDimensions()) {
       if (!dimension.isGlobalDictionaryEncoding() && !dimension.isDirectDictionaryEncoding()) {
@@ -74,7 +68,6 @@ public class BloomDataMapWriter extends AbstractBloomDataMapWriter {
       });
       if (isExistInIndex) {
         this.indexCol2MdkIdx.put(dimension.getColName(), idx);
-        this.mdkIdx2IndexCol.put(idx, dimension.getColName());
       }
       idx++;
     }
@@ -91,34 +84,11 @@ public class BloomDataMapWriter extends AbstractBloomDataMapWriter {
   @Override
   protected byte[] convertDictionaryValue(int indexColIdx, Object value) {
     // input value from onPageAdded in load process is byte[]
-    byte[] fakeMdkBytes;
-    // this means that we need to pad some fake bytes
-    // to get the whole MDK in corresponding position
-    if (columnarSplitter.getBlockKeySize().length > indexCol2MdkIdx.size()) {
-      int totalSize = 0;
-      for (int size : columnarSplitter.getBlockKeySize()) {
-        totalSize += size;
-      }
-      fakeMdkBytes = new byte[totalSize];
 
-      // put this bytes to corresponding position
-      int thisKeyIdx = indexCol2MdkIdx.get(indexColumns.get(indexColIdx).getColName());
-      int destPos = 0;
-      for (int keyIdx = 0; keyIdx < columnarSplitter.getBlockKeySize().length; keyIdx++) {
-        if (thisKeyIdx == keyIdx) {
-          System.arraycopy(value, 0,
-              fakeMdkBytes, destPos, columnarSplitter.getBlockKeySize()[thisKeyIdx]);
-          break;
-        }
-        destPos += columnarSplitter.getBlockKeySize()[keyIdx];
-      }
-    } else {
-      fakeMdkBytes = (byte[])value;
-    }
-    // for dict columns including dictionary and date columns
-    // decode value to get the surrogate key
-    int surrogateKey = (int) keyGenerator.getKey(fakeMdkBytes,
-        indexCol2MdkIdx.get(indexColumns.get(indexColIdx).getColName()));
+    // for dict columns including dictionary and date columns decode value to get the surrogate key
+    int thisKeyIdx = indexCol2MdkIdx.get(indexColumns.get(indexColIdx).getColName());
+    int surrogateKey = CarbonUtil.getSurrogateInternal((byte[]) value, 0,
+        columnarSplitter.getBlockKeySize()[thisKeyIdx]);
     // store the dictionary key in bloom
     return CarbonUtil.getValueAsBytes(DataTypes.INT, surrogateKey);
   }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/7fc0c6b9/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala
index fd1345c..7ea89a9 100644
--- a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala
+++ b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala
@@ -709,7 +709,7 @@ class BloomCoarseGrainDataMapFunctionSuite  extends QueryTest with BeforeAndAfte
       s"""
          | CREATE TABLE $bloomDMSampleTable(empno string, doj date, salary float)
          | STORED BY 'carbondata'
-         | TBLPROPERTIES('SORT_COLUMNS'='empno,doj', 'DICTIONARY_INCLUDE'='doj')
+         | TBLPROPERTIES('SORT_COLUMNS'='empno,doj', 'DICTIONARY_INCLUDE'='doj,empno')
        """.stripMargin)
     sql(
       s"""