You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by aj...@apache.org on 2020/12/18 12:36:16 UTC

[carbondata] branch master updated: [CARBONDATA-4084] Fixed data corruption issue after fallback of Local dictionary

This is an automated email from the ASF dual-hosted git repository.

ajantha pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git


The following commit(s) were added to refs/heads/master by this push:
     new 297b18f  [CARBONDATA-4084] Fixed data corruption issue after fallback of Local dictionary
297b18f is described below

commit 297b18f68d0a811bf612187ab94efe15211d2679
Author: kumarvishal09 <ku...@gmail.com>
AuthorDate: Fri Dec 18 15:48:58 2020 +0530

    [CARBONDATA-4084] Fixed data corruption issue after fallback of Local dictionary
    
    Why is this PR needed?
    Data is getting corrupted when fallback happens for local dictionary encoded page.
    
    What changes were proposed in this PR?
    In LVByteBufferColumnPage Length getting added inside put method. In case of fallback when reverse lookup is done and encoded page is replaced with actual value[Dictionary values]. As actual Dictionary value is already present in LV format again one more length is getting added[LLV], so during query it's showing extra character for those columns
    
    Does this PR introduce any user interface change?
    No
    
    Is any new testcase added?
    No
    
    This closes #4058
---
 .../datastore/page/DecoderBasedFallbackEncoder.java  | 20 ++++++++++++++++++--
 .../generator/ColumnLocalDictionaryGenerator.java    | 11 +++++++++++
 .../generator/LocalDictionaryGenerator.java          |  6 ++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/DecoderBasedFallbackEncoder.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/DecoderBasedFallbackEncoder.java
index 30d07f7..abb4160 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/DecoderBasedFallbackEncoder.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/DecoderBasedFallbackEncoder.java
@@ -114,8 +114,24 @@ public class DecoderBasedFallbackEncoder implements Callable<FallbackEncodedColu
     for (int i = 0; i < pageSize; i++) {
       int index = reverseInvertedIndex[i] * 3;
       int keyArray = (int) keyGenerator.getKeyArray(bytes, index)[0];
-      actualDataColumnPage
-          .putBytes(rowId++, localDictionaryGenerator.getDictionaryKeyBasedOnValue(keyArray));
+      if (actualDataColumnPage instanceof LVByteBufferColumnPage) {
+        //TODO It's a quick fix, we can fallback to old logic adding Length outside of
+        // LVByteBufferColumnPage. Update the page data with Length before adding to column page.
+        // so it won't be added again when fallback scenario occurs.
+        // Though this is not a bottleneck as after fallback only encoded page will be decoded
+        // and System.array copy is very fast. So for few pages it won't impact much
+        byte[] dictionaryKeyBasedOnValue =
+            localDictionaryGenerator.getDictionaryKeyBasedOnValue(keyArray);
+        byte[] output =
+            new byte[dictionaryKeyBasedOnValue.length - localDictionaryGenerator.getLVLength()];
+        System
+            .arraycopy(dictionaryKeyBasedOnValue, localDictionaryGenerator.getLVLength(), output, 0,
+                output.length);
+        actualDataColumnPage.putBytes(rowId++, output);
+      } else {
+        actualDataColumnPage
+            .putBytes(rowId++, localDictionaryGenerator.getDictionaryKeyBasedOnValue(keyArray));
+      }
     }
 
     // get column spec for existing column page
diff --git a/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/ColumnLocalDictionaryGenerator.java b/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/ColumnLocalDictionaryGenerator.java
index d7df88f..de5f84b 100644
--- a/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/ColumnLocalDictionaryGenerator.java
+++ b/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/ColumnLocalDictionaryGenerator.java
@@ -34,10 +34,16 @@ public class ColumnLocalDictionaryGenerator implements LocalDictionaryGenerator
    */
   private DictionaryStore dictionaryHolder;
 
+  /**
+   * number of bytes used for storing the length of the dictionary value
+   */
+  private int lvLength;
+
   public ColumnLocalDictionaryGenerator(int threshold, int lvLength) {
     // adding 1 to threshold for null value
     int newThreshold = threshold + 1;
     this.dictionaryHolder = new MapBasedDictionaryStore(newThreshold);
+    this.lvLength = lvLength;
     ByteBuffer byteBuffer = ByteBuffer.allocate(
         lvLength + CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY.length);
 
@@ -86,4 +92,9 @@ public class ColumnLocalDictionaryGenerator implements LocalDictionaryGenerator
   public byte[] getDictionaryKeyBasedOnValue(int value) {
     return this.dictionaryHolder.getDictionaryKeyBasedOnValue(value);
   }
+
+  @Override
+  public int getLVLength() {
+    return this.lvLength;
+  }
 }
diff --git a/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/LocalDictionaryGenerator.java b/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/LocalDictionaryGenerator.java
index 35a9f65..05a9473 100644
--- a/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/LocalDictionaryGenerator.java
+++ b/core/src/main/java/org/apache/carbondata/core/localdictionary/generator/LocalDictionaryGenerator.java
@@ -46,4 +46,10 @@ public interface LocalDictionaryGenerator {
    * @return dictionary key based on value
    */
   byte[] getDictionaryKeyBasedOnValue(int value);
+
+  /**
+   * @return number of bytes used for storing the length of the dictionary value
+   * for String type 2 Bytes and Long String 4 bytes
+   */
+  int getLVLength();
 }