Posted to commits@pinot.apache.org by ja...@apache.org on 2022/04/13 23:45:05 UTC

[pinot] branch master updated: Allow disabling dict generation for High cardinality columns (#8398)

This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 476679da86 Allow disabling dict generation for High cardinality columns (#8398)
476679da86 is described below

commit 476679da861f20b7c7b1100f818a03eae60a9776
Author: Kartik Khare <kh...@gmail.com>
AuthorDate: Thu Apr 14 05:15:00 2022 +0530

    Allow disabling dict generation for High cardinality columns (#8398)
    
    For high cardinality metric columns (especially doubles/floats), a dictionary can be an overhead: it costs extra storage space and adds one additional hop on reads.
    During the segment generation phase, we can compare the estimated storage for the dictionary-encoded index against the raw (no-dictionary) index, and skip creating the dictionary when it does not pay off.
    
    The feature is off by default and can be enabled by setting optimizeDictionaryForMetrics to true in the table indexing config; the size-ratio cutoff is controlled by noDictionarySizeRatioThreshold (default 0.85).
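    
    For illustration only (not part of this commit), here is a minimal, self-contained Java sketch of the size comparison applied in SegmentColumnarIndexCreator below; the class and the numBitsPerValue helper are hypothetical stand-ins (the real code uses PinotDataBitSet.getNumBitsPerValue and per-column statistics), and the numbers in main() are made up:
    
        public final class DictionarySizeHeuristicSketch {
    
          // Stand-in for PinotDataBitSet.getNumBitsPerValue(maxValue): bits needed to encode ids in [0, cardinality - 1].
          private static int numBitsPerValue(int cardinality) {
            int maxId = cardinality - 1;
            int numBits = 1;
            while ((maxId >>> numBits) != 0) {
              numBits++;
            }
            return numBits;
          }
    
          // Create the dictionary only if the raw index is not sufficiently smaller than the dictionary-based index,
          // i.e. rawSize / (dictionarySize + bitPackedForwardIndexSize) > noDictionarySizeRatioThreshold.
          static boolean shouldCreateDictionary(int distinctValueCount, int totalNumberOfEntries, int valueSizeInBytes,
              double noDictionarySizeRatioThreshold) {
            long dictionarySize = (long) distinctValueCount * valueSizeInBytes;
            long forwardIndexSize =
                ((long) totalNumberOfEntries * numBitsPerValue(distinctValueCount) + Byte.SIZE - 1) / Byte.SIZE;
            double indexWithDictSize = dictionarySize + forwardIndexSize;
            double indexWithoutDictSize = (double) totalNumberOfEntries * valueSizeInBytes;
            return indexWithoutDictSize / indexWithDictSize > noDictionarySizeRatioThreshold;
          }
    
          public static void main(String[] args) {
            // High cardinality double column: 1M rows, 900k distinct 8-byte values -> false (keep raw values).
            System.out.println(shouldCreateDictionary(900_000, 1_000_000, 8, 0.85));
            // Low cardinality double column: 1M rows, 100 distinct 8-byte values -> true (create dictionary).
            System.out.println(shouldCreateDictionary(100, 1_000_000, 8, 0.85));
          }
        }
    
    For a single segment-generation run, the new SegmentGeneratorConfig setters can also be called directly, as DictionaryOptimiserTest does below: setOptimizeDictionaryForMetrics(true) and setNoDictionarySizeRatioThreshold(0.9).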
---
 .../creator/impl/SegmentColumnarIndexCreator.java  |  20 +++
 .../segment/creator/DictionaryOptimiserTest.java   | 178 +++++++++++++++++++++
 .../resources/data/mixed_cardinality_data.avro     | Bin 0 -> 553960 bytes
 .../spi/creator/SegmentGeneratorConfig.java        |  21 +++
 .../pinot/spi/config/table/IndexingConfig.java     |  23 +++
 5 files changed, 242 insertions(+)

diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
index b0b9d3891b..d5e2fe710e 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -264,6 +264,26 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
         .containsKey(column)) {
       return false;
     }
+
+    // Do not create dictionary if the index size with dictionary is going to be larger than the size without it,
+    // i.e. if rawIndexSize / (dictionarySize + forwardIndexSize) does not exceed noDictionarySizeRatioThreshold.
+    // This reduces dictionary cost for high cardinality columns. Off by default; enable optimizeDictionaryForMetrics.
+    if (config.isOptimizeDictionaryForMetrics() && spec.getFieldType() == FieldType.METRIC
+        && spec.isSingleValueField() && spec.getDataType().isFixedWidth()) {
+      long dictionarySize = info.getDistinctValueCount() * spec.getDataType().size();
+      long forwardIndexSize =
+          ((long) info.getTotalNumberOfEntries() * PinotDataBitSet.getNumBitsPerValue(info.getDistinctValueCount() - 1)
+              + Byte.SIZE - 1) / Byte.SIZE;
+
+      double indexWithDictSize = dictionarySize + forwardIndexSize;
+      double indexWithoutDictSize = info.getTotalNumberOfEntries() * spec.getDataType().size();
+
+      double indexSizeRatio = indexWithoutDictSize / indexWithDictSize;
+      if (indexSizeRatio <= config.getNoDictionarySizeRatioThreshold()) {
+        return false;
+      }
+    }
+
     return info.isCreateDictionary();
   }
 
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/DictionaryOptimiserTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/DictionaryOptimiserTest.java
new file mode 100644
index 0000000000..5e2e932080
--- /dev/null
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/DictionaryOptimiserTest.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.creator;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+import org.apache.avro.file.DataFileStream;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.plugin.inputformat.avro.AvroUtils;
+import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
+import org.apache.pinot.segment.local.segment.creator.impl.SegmentCreationDriverFactory;
+import org.apache.pinot.segment.spi.ImmutableSegment;
+import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
+import org.apache.pinot.segment.spi.creator.SegmentIndexCreationDriver;
+import org.apache.pinot.segment.spi.creator.SegmentVersion;
+import org.apache.pinot.spi.config.table.TableType;
+import org.apache.pinot.spi.data.DimensionFieldSpec;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.apache.pinot.spi.data.Schema;
+import org.apache.pinot.spi.data.readers.FileFormat;
+import org.apache.pinot.spi.utils.ReadMode;
+import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
+import org.apache.pinot.util.TestUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.testng.Assert;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+
+public class DictionaryOptimiserTest {
+  private static final Logger LOGGER = LoggerFactory.getLogger(DictionaryOptimiserTest.class);
+
+  private static final String AVRO_DATA = "data/mixed_cardinality_data.avro";
+  private static final File INDEX_DIR = new File(DictionaryOptimiserTest.class.toString());
+
+  private static File _segmentDirectory;
+
+  @AfterClass
+  public static void cleanup() {
+    FileUtils.deleteQuietly(INDEX_DIR);
+  }
+
+  @BeforeClass
+  public static void before()
+      throws Exception {
+    final String filePath = TestUtils.getFileFromResourceUrl(
+        Objects.requireNonNull(DictionaryOptimiserTest.class.getClassLoader().getResource(AVRO_DATA)));
+    if (INDEX_DIR.exists()) {
+      FileUtils.deleteQuietly(INDEX_DIR);
+    }
+
+    final SegmentGeneratorConfig config =
+        getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "time_column", TimeUnit.DAYS,
+            "test");
+
+    // The segment generation code in SegmentColumnarIndexCreator will throw an
+    // exception if the start and end time in the time column are not in the
+    // acceptable range. For this test, we first need to fix the input Avro data
+    // to have the time column values in the allowed range. Until then, the check
+    // is explicitly disabled.
+    config.setSkipTimeValueCheck(true);
+    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
+    driver.init(config);
+    driver.build();
+    _segmentDirectory = new File(INDEX_DIR, driver.getSegmentName());
+
+    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
+    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
+    final String[] columns = new String[avroSchema.getFields().size()];
+    int i = 0;
+    for (final org.apache.avro.Schema.Field f : avroSchema.getFields()) {
+      columns[i] = f.name();
+      i++;
+    }
+  }
+
+  @Test
+  public void testDictionaryForMixedCardinalities()
+      throws Exception {
+    ImmutableSegment heapSegment = ImmutableSegmentLoader.load(_segmentDirectory, ReadMode.heap);
+
+    Schema schema = heapSegment.getSegmentMetadata().getSchema();
+    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
+      // Skip virtual columns
+      if (fieldSpec.isVirtualColumn()) {
+        continue;
+      }
+
+      String columnName = fieldSpec.getName();
+      if (columnName.contains("low_cardinality")) {
+        Assert.assertTrue(heapSegment.getForwardIndex(columnName).isDictionaryEncoded(),
+            "No dictionary found for low cardinality columns");
+      }
+
+      if (columnName.contains("high_cardinality")) {
+        Assert.assertFalse(heapSegment.getForwardIndex(columnName).isDictionaryEncoded(),
+            "No Raw index for high cardinality columns");
+      }
+    }
+  }
+
+  public static SegmentGeneratorConfig getSegmentGenSpecWithSchemAndProjectedColumns(File inputAvro, File outputDir,
+      String timeColumn, TimeUnit timeUnit, String tableName)
+      throws IOException {
+
+    final SegmentGeneratorConfig segmentGenSpec =
+        new SegmentGeneratorConfig(new TableConfigBuilder(TableType.OFFLINE).setTableName(tableName).build(),
+            extractSchemaFromAvroWithoutTime(inputAvro));
+    segmentGenSpec.setInputFilePath(inputAvro.getAbsolutePath());
+    segmentGenSpec.setTimeColumnName(timeColumn);
+    segmentGenSpec.setSegmentTimeUnit(timeUnit);
+    segmentGenSpec.setFormat(FileFormat.AVRO);
+    segmentGenSpec.setSegmentVersion(SegmentVersion.v1);
+    segmentGenSpec.setTableName(tableName);
+    segmentGenSpec.setOutDir(outputDir.getAbsolutePath());
+    segmentGenSpec.setOptimizeDictionaryForMetrics(true);
+    segmentGenSpec.setNoDictionarySizeRatioThreshold(0.9);
+    return segmentGenSpec;
+  }
+
+  public static Schema extractSchemaFromAvroWithoutTime(File avroFile)
+      throws IOException {
+    DataFileStream<GenericRecord> dataStream =
+        new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
+    Schema schema = new Schema();
+
+    for (final org.apache.avro.Schema.Field field : dataStream.getSchema().getFields()) {
+      try {
+        SegmentTestUtils.getColumnType(field);
+      } catch (Exception e) {
+        LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.",
+            field.name(), field.schema().getType());
+        continue;
+      }
+      final String columnName = field.name();
+      final String pinotType = field.getProp("pinotType");
+
+      final FieldSpec fieldSpec;
+      if ((pinotType != null && "METRIC".equals(pinotType)) || columnName.contains("cardinality")) {
+        fieldSpec = new MetricFieldSpec();
+      } else {
+        fieldSpec = new DimensionFieldSpec();
+      }
+
+      fieldSpec.setName(columnName);
+      fieldSpec.setDataType(SegmentTestUtils.getColumnType(dataStream.getSchema().getField(columnName)));
+      fieldSpec.setSingleValueField(AvroUtils.isSingleValueField(dataStream.getSchema().getField(columnName)));
+      schema.addField(fieldSpec);
+    }
+
+    dataStream.close();
+    return schema;
+  }
+}
diff --git a/pinot-segment-local/src/test/resources/data/mixed_cardinality_data.avro b/pinot-segment-local/src/test/resources/data/mixed_cardinality_data.avro
new file mode 100644
index 0000000000..2869829d7d
Binary files /dev/null and b/pinot-segment-local/src/test/resources/data/mixed_cardinality_data.avro differ
diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
index 1f1359f035..d773189b2e 100644
--- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
+++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java
@@ -68,6 +68,7 @@ public class SegmentGeneratorConfig implements Serializable {
   }
 
   private static final Logger LOGGER = LoggerFactory.getLogger(SegmentGeneratorConfig.class);
+  public static final double DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD = 0.85d;
 
   private TableConfig _tableConfig;
   private final Map<String, String> _customProperties = new HashMap<>();
@@ -111,6 +112,8 @@ public class SegmentGeneratorConfig implements Serializable {
   private boolean _skipTimeValueCheck = false;
   private boolean _nullHandlingEnabled = false;
   private boolean _failOnEmptySegment = false;
+  private boolean _optimizeDictionaryForMetrics = false;
+  private double _noDictionarySizeRatioThreshold = DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
 
   // constructed from FieldConfig
   private Map<String, Map<String, String>> _columnProperties = new HashMap<>();
@@ -206,6 +209,8 @@ public class SegmentGeneratorConfig implements Serializable {
       _fstTypeForFSTIndex = tableConfig.getIndexingConfig().getFSTIndexType();
 
       _nullHandlingEnabled = indexingConfig.isNullHandlingEnabled();
+      _optimizeDictionaryForMetrics = indexingConfig.isOptimizeDictionaryForMetrics();
+      _noDictionarySizeRatioThreshold = indexingConfig.getNoDictionarySizeRatioThreshold();
     }
   }
 
@@ -757,6 +762,22 @@ public class SegmentGeneratorConfig implements Serializable {
     _nullHandlingEnabled = nullHandlingEnabled;
   }
 
+  public boolean isOptimizeDictionaryForMetrics() {
+    return _optimizeDictionaryForMetrics;
+  }
+
+  public void setOptimizeDictionaryForMetrics(boolean optimizeDictionaryForMetrics) {
+    _optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
+  }
+
+  public double getNoDictionarySizeRatioThreshold() {
+    return _noDictionarySizeRatioThreshold;
+  }
+
+  public void setNoDictionarySizeRatioThreshold(double noDictionarySizeRatioThreshold) {
+    _noDictionarySizeRatioThreshold = noDictionarySizeRatioThreshold;
+  }
+
   public boolean isFailOnEmptySegment() {
     return _failOnEmptySegment;
   }
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
index ad774e8e61..0211c4c48f 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
@@ -54,6 +54,13 @@ public class IndexingConfig extends BaseJsonConfig {
   private boolean _aggregateMetrics;
   private boolean _nullHandlingEnabled;
 
+  /**
+   * If `optimizeDictionaryForMetrics` is enabled, a dictionary is not created for a single-value, fixed-width metric
+   * column whose rawIndexSize / (dictionarySize + forwardIndexSize) does not exceed `noDictionarySizeRatioThreshold`.
+   */
+  private boolean _optimizeDictionaryForMetrics;
+  private double _noDictionarySizeRatioThreshold;
+
   // TODO: Add a new configuration related to the segment generation
   private boolean _autoGeneratedInvertedIndex;
   private boolean _createInvertedIndexDuringSegmentGeneration;
@@ -276,6 +283,22 @@ public class IndexingConfig extends BaseJsonConfig {
     _nullHandlingEnabled = nullHandlingEnabled;
   }
 
+  public boolean isOptimizeDictionaryForMetrics() {
+    return _optimizeDictionaryForMetrics;
+  }
+
+  public void setOptimizeDictionaryForMetrics(boolean optimizeDictionaryForMetrics) {
+    _optimizeDictionaryForMetrics = optimizeDictionaryForMetrics;
+  }
+
+  public double getNoDictionarySizeRatioThreshold() {
+    return _noDictionarySizeRatioThreshold;
+  }
+
+  public void setNoDictionarySizeRatioThreshold(double noDictionarySizeRatioThreshold) {
+    _noDictionarySizeRatioThreshold = noDictionarySizeRatioThreshold;
+  }
+
   public String getSegmentNameGeneratorType() {
     return _segmentNameGeneratorType;
   }
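
For reference, a hedged usage sketch (not part of the commit) of how the new IndexingConfig fields documented above can be set on a table config; the values are then read from the table's IndexingConfig when a SegmentGeneratorConfig is built from the table config, as the SegmentGeneratorConfig hunk above shows. The table name is made up:

    import org.apache.pinot.spi.config.table.IndexingConfig;
    import org.apache.pinot.spi.config.table.TableConfig;
    import org.apache.pinot.spi.config.table.TableType;
    import org.apache.pinot.spi.utils.builder.TableConfigBuilder;

    public final class OptimizeDictionaryConfigExample {
      public static void main(String[] args) {
        TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName("myTable").build();

        // Enable the optimization and set the no-dictionary size-ratio cutoff (0.85 is the default).
        IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
        indexingConfig.setOptimizeDictionaryForMetrics(true);
        indexingConfig.setNoDictionarySizeRatioThreshold(0.85);

        // A SegmentGeneratorConfig constructed from this tableConfig (plus a schema) will pick up both settings.
      }
    }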


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org