You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2022/06/16 00:34:09 UTC

[GitHub] [pinot] Jackie-Jiang commented on a diff in pull request #8872: DataTable V4 implementation with per-column null bitmaps

Jackie-Jiang commented on code in PR #8872:
URL: https://github.com/apache/pinot/pull/8872#discussion_r898573213


##########
pinot-core/src/main/java/org/apache/pinot/core/common/datatable/DataTableImplV4.java:
##########
@@ -19,36 +19,306 @@
 
 package org.apache.pinot.core.common.datatable;
 
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.HashMap;
 import java.util.Map;
 import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.common.datablock.DataBlockUtils;
 import org.apache.pinot.core.common.datablock.RowDataBlock;
 import org.apache.pinot.spi.annotations.InterfaceStability;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
 
 
 /**
- * Datatable V4 Implementation is a wrapper around the Row-based data block.
+ * Datatable V4 implementation is a wrapper around the Row-based data block.
+ * The layout of serialized V4 datatable looks like:
+ * +-----------------------------------------------+
+ * | 17 integers of header:                        |
+ * | VERSION                                       |
+ * | NUM_ROWS                                      |
+ * | NUM_COLUMNS                                   |
+ * | EXCEPTIONS SECTION START OFFSET               |
+ * | EXCEPTIONS SECTION LENGTH                     |
+ * | DICTIONARY_MAP SECTION START OFFSET           |
+ * | DICTIONARY_MAP SECTION LENGTH                 |
+ * | DATA_SCHEMA SECTION START OFFSET              |
+ * | DATA_SCHEMA SECTION LENGTH                    |
+ * | FIXED_SIZE_DATA SECTION START OFFSET          |
+ * | FIXED_SIZE_DATA SECTION LENGTH                |
+ * | VARIABLE_SIZE_DATA SECTION START OFFSET       |
+ * | VARIABLE_SIZE_DATA SECTION LENGTH             |
+ * | FIXED_SIZE_NULL_VECTOR SECTION START OFFSET   |
+ * | FIXED_SIZE_NULL_VECTOR SECTION LENGTH         |
+ * | VARIABLE_SIZE_NULL_VECTOR SECTION START OFFSET|
+ * | VARIABLE_SIZE_NULL_VECTOR SECTION LENGTH      |
+ * +-----------------------------------------------+
+ * | EXCEPTIONS SECTION                            |
+ * +-----------------------------------------------+
+ * | DICTIONARY_MAP SECTION                        |
+ * +-----------------------------------------------+
+ * | DATA_SCHEMA SECTION                           |
+ * +-----------------------------------------------+
+ * | FIXED_SIZE_DATA SECTION                       |
+ * +-----------------------------------------------+
+ * | VARIABLE_SIZE_DATA SECTION                    |
+ * +-----------------------------------------------+
+ * | FIXED_SIZE_NULL_VECTOR SECTION                |
+ * +-----------------------------------------------+
+ * | VARIABLE_SIZE_VECTOR SECTION SECTION          |
+ * +-----------------------------------------------+
+ * | METADATA LENGTH                               |
+ * | METADATA SECTION                              |
+ * +-----------------------------------------------+
  */
 @InterfaceStability.Evolving
 public class DataTableImplV4 extends RowDataBlock {
+  private static final int HEADER_SIZE = Integer.BYTES * 17;
+  protected byte[] _fixedSizeNullVectorBytes;
+  protected ByteBuffer _fixedSizeNullVectorData;
+  protected byte[] _variableSizeNullVectorBytes;
+  protected ByteBuffer _variableSizeNullVectorData;
+
+  public DataTableImplV4(int numRows, DataSchema dataSchema, Map<String, Map<Integer, String>> dictionaryMap,

Review Comment:
   Move the changes to `BaseDataBlock`



##########
pinot-core/src/main/java/org/apache/pinot/core/common/ObjectSerDeUtils.java:
##########
@@ -119,7 +119,8 @@ public enum ObjectType {
     LongLongPair(28),
     FloatLongPair(29),
     DoubleLongPair(30),
-    StringLongPair(31);
+    StringLongPair(31),
+    Missing(100);

Review Comment:
   Let's name it `Null` for clarity



##########
pinot-common/src/main/java/org/apache/pinot/common/utils/DataTable.java:
##########
@@ -76,6 +77,8 @@ byte[] toBytes()
 
   String[] getStringArray(int rowId, int colId);
 
+  MutableRoaringBitmap getNullRowIds(int colId);

Review Comment:
   Suggest returning `RoaringBitmap` which has better performance



##########
pinot-core/src/main/java/org/apache/pinot/core/common/datatable/DataTableImplV4.java:
##########
@@ -19,36 +19,306 @@
 
 package org.apache.pinot.core.common.datatable;
 
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.HashMap;
 import java.util.Map;
 import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.common.datablock.DataBlockUtils;
 import org.apache.pinot.core.common.datablock.RowDataBlock;
 import org.apache.pinot.spi.annotations.InterfaceStability;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
 
 
 /**
- * Datatable V4 Implementation is a wrapper around the Row-based data block.
+ * Datatable V4 implementation is a wrapper around the Row-based data block.
+ * The layout of serialized V4 datatable looks like:
+ * +-----------------------------------------------+
+ * | 17 integers of header:                        |
+ * | VERSION                                       |
+ * | NUM_ROWS                                      |
+ * | NUM_COLUMNS                                   |
+ * | EXCEPTIONS SECTION START OFFSET               |
+ * | EXCEPTIONS SECTION LENGTH                     |
+ * | DICTIONARY_MAP SECTION START OFFSET           |
+ * | DICTIONARY_MAP SECTION LENGTH                 |
+ * | DATA_SCHEMA SECTION START OFFSET              |
+ * | DATA_SCHEMA SECTION LENGTH                    |
+ * | FIXED_SIZE_DATA SECTION START OFFSET          |
+ * | FIXED_SIZE_DATA SECTION LENGTH                |
+ * | VARIABLE_SIZE_DATA SECTION START OFFSET       |
+ * | VARIABLE_SIZE_DATA SECTION LENGTH             |
+ * | FIXED_SIZE_NULL_VECTOR SECTION START OFFSET   |

Review Comment:
   We can put the bitmaps into existing `fixedSizeData` and `variableSizeData`. If no bitmap is provided, we don't write anything. If some bitmaps are provided, we store 2 ints per column in the fixed size data (variable size data buffer offset & length). On the reader side, we can derive whether null bitmap is available by checking the size of the fixed size data. The overhead would be minimal if null handling is not enabled, where the data table will be exactly the same as before.



##########
pinot-core/src/main/java/org/apache/pinot/core/common/NullBitmapUtils.java:
##########
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.common;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+
+
+public final class NullBitmapUtils {

Review Comment:
   If we switch to use `RoaringBitmap`, we already have a ser-de: `ObjectSerDeUtils.ROARING_BITMAP_SER_DE`



##########
pinot-core/src/main/java/org/apache/pinot/core/common/datablock/DataBlockFactory.java:
##########
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.common.datablock;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import org.apache.pinot.core.common.datatable.DataTableBuilder;
+import org.apache.pinot.core.common.datatable.DataTableImplV4;
+
+
+public class DataBlockFactory {

Review Comment:
   Do we need this factory? We should use `DataTableFactory`



##########
pinot-core/src/main/java/org/apache/pinot/core/common/datablock/DataBlockBuilder.java:
##########
@@ -76,7 +83,13 @@ private DataBlockBuilder(DataSchema dataSchema, BaseDataBlock.Type blockType) {
     }
   }
 
-  public static RowDataBlock buildFromRows(List<Object[]> rows, DataSchema dataSchema)
+  public void setNullRowIds(ImmutableRoaringBitmap nullBitmap)
+      throws IOException {
+    NullBitmapUtils.setNullRowIds(nullBitmap, _fixedSizeNullVectorOutputStream, _variableSizeNullVectorOutputStream);
+  }
+
+  public static RowDataBlock buildFromRows(List<Object[]> rows, ImmutableRoaringBitmap[] colNullBitmaps,

Review Comment:
   ```suggestion
     public static RowDataBlock buildFromRows(List<Object[]> rows, @Nullable ImmutableRoaringBitmap[] colNullBitmaps,
   ```



##########
pinot-core/src/main/java/org/apache/pinot/core/common/ObjectSerDeUtils.java:
##########
@@ -131,6 +132,10 @@ public int getValue() {
     }
 
     public static ObjectType getObjectType(Object value) {

Review Comment:
   ```suggestion
       public static ObjectType getObjectType(@Nullable Object value) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org