You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ja...@apache.org on 2020/07/17 01:45:25 UTC
[incubator-pinot] branch master updated: Store column min/max value
into segment metadata (#5709)
This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 158f16c Store column min/max value into segment metadata (#5709)
158f16c is described below
commit 158f16c8a1c3c0203c553c6e8647ba016f9ef5fd
Author: Xiaotian (Jackie) Jiang <17...@users.noreply.github.com>
AuthorDate: Thu Jul 16 18:45:10 2020 -0700
Store column min/max value into segment metadata (#5709)
Column min/max value is very useful information and can be used for query optimization.
When creating the segment, we already collected the column min/max value, but not save them to the metadata.
Changes:
- Store the already collected min/max value to the segment metadata
- Do not store string value with special characters that cannot be handled by the `PropertiesConfiguration`
- Enhance the `SegmentColumnarIndexCreator.removeColumnMetadataInfo()` to handle the removal of new added properties (`PARTITION_FUNCTION` etc.)
- Support min/max value for BYTES columns
- Change default `ColumnMinMaxValueGeneratorMode` to `ALL` so that existing segments can have min/max value generated for all dictionary-encoded columns during segment load
---
.../apache/pinot/common/utils/CommonConstants.java | 2 +-
.../query/pruner/ColumnValueSegmentPruner.java | 33 +++---
.../creator/impl/SegmentColumnarIndexCreator.java | 105 ++++++++++--------
.../impl/SegmentIndexCreationDriverImpl.java | 10 +-
.../core/segment/creator/impl/V1Constants.java | 2 +-
.../ColumnMinMaxValueGenerator.java | 50 ++++++---
.../defaultcolumn/BaseDefaultColumnHandler.java | 21 ++--
.../segment/index/metadata/ColumnMetadata.java | 7 +-
.../impl/SegmentColumnarIndexCreatorTest.java | 121 +++++++++++++++++++++
.../index/loader/SegmentPreProcessorTest.java | 55 +++++++++-
10 files changed, 307 insertions(+), 99 deletions(-)
diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/CommonConstants.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/CommonConstants.java
index 5326363..c6b0341 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/utils/CommonConstants.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/CommonConstants.java
@@ -247,7 +247,7 @@ public class CommonConstants {
"pinot.server.shutdown.resourceCheckIntervalMs";
public static final long DEFAULT_SHUTDOWN_RESOURCE_CHECK_INTERVAL_MS = 10_000L;
- public static final String DEFAULT_COLUMN_MIN_MAX_VALUE_GENERATOR_MODE = "TIME";
+ public static final String DEFAULT_COLUMN_MIN_MAX_VALUE_GENERATOR_MODE = "ALL";
public static final String PINOT_SERVER_METRICS_PREFIX = "pinot.server.metrics.prefix";
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/pruner/ColumnValueSegmentPruner.java b/pinot-core/src/main/java/org/apache/pinot/core/query/pruner/ColumnValueSegmentPruner.java
index e214b63..2704dc6 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/query/pruner/ColumnValueSegmentPruner.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/query/pruner/ColumnValueSegmentPruner.java
@@ -21,7 +21,6 @@ package org.apache.pinot.core.query.pruner;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
-
import org.apache.pinot.core.common.DataSource;
import org.apache.pinot.core.common.DataSourceMetadata;
import org.apache.pinot.core.data.partition.PartitionFunction;
@@ -132,9 +131,13 @@ public class ColumnValueSegmentPruner implements SegmentPruner {
// Check min/max value
Comparable minValue = dataSourceMetadata.getMinValue();
if (minValue != null) {
- Comparable maxValue = dataSourceMetadata.getMaxValue();
- assert maxValue != null;
- if (value.compareTo(minValue) < 0 || value.compareTo(maxValue) > 0) {
+ if (value.compareTo(minValue) < 0) {
+ return true;
+ }
+ }
+ Comparable maxValue = dataSourceMetadata.getMaxValue();
+ if (maxValue != null) {
+ if (value.compareTo(maxValue) > 0) {
return true;
}
}
@@ -206,27 +209,27 @@ public class ColumnValueSegmentPruner implements SegmentPruner {
// Check min/max value
Comparable minValue = dataSourceMetadata.getMinValue();
if (minValue != null) {
- Comparable maxValue = dataSourceMetadata.getMaxValue();
- assert maxValue != null;
-
- if (lowerBoundValue != null) {
- if (lowerInclusive) {
- if (lowerBoundValue.compareTo(maxValue) > 0) {
+ if (upperBoundValue != null) {
+ if (upperInclusive) {
+ if (upperBoundValue.compareTo(minValue) < 0) {
return true;
}
} else {
- if (lowerBoundValue.compareTo(maxValue) >= 0) {
+ if (upperBoundValue.compareTo(minValue) <= 0) {
return true;
}
}
}
- if (upperBoundValue != null) {
- if (upperInclusive) {
- if (upperBoundValue.compareTo(minValue) < 0) {
+ }
+ Comparable maxValue = dataSourceMetadata.getMaxValue();
+ if (maxValue != null) {
+ if (lowerBoundValue != null) {
+ if (lowerInclusive) {
+ if (lowerBoundValue.compareTo(maxValue) > 0) {
return true;
}
} else {
- if (upperBoundValue.compareTo(minValue) <= 0) {
+ if (lowerBoundValue.compareTo(maxValue) >= 0) {
return true;
}
}
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
index 06146f3..b9b1947 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreator.java
@@ -18,6 +18,7 @@
*/
package org.apache.pinot.core.segment.creator.impl;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import java.io.File;
@@ -58,7 +59,6 @@ import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.data.FieldSpec.FieldType;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.data.readers.GenericRow;
-import org.apache.pinot.spi.utils.BytesUtils;
import org.apache.pinot.spi.utils.TimeUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;
@@ -500,7 +500,8 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
int cardinality = columnIndexCreationInfo.getDistinctValueCount();
properties.setProperty(getKeyFor(column, CARDINALITY), String.valueOf(cardinality));
properties.setProperty(getKeyFor(column, TOTAL_DOCS), String.valueOf(totalDocs));
- properties.setProperty(getKeyFor(column, DATA_TYPE), String.valueOf(fieldSpec.getDataType()));
+ DataType dataType = fieldSpec.getDataType();
+ properties.setProperty(getKeyFor(column, DATA_TYPE), String.valueOf(dataType));
properties.setProperty(getKeyFor(column, BITS_PER_ELEMENT),
String.valueOf(PinotDataBitSet.getNumBitsPerValue(cardinality - 1)));
properties.setProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE), String.valueOf(dictionaryElementSize));
@@ -509,71 +510,83 @@ public class SegmentColumnarIndexCreator implements SegmentCreator {
properties.setProperty(getKeyFor(column, HAS_NULL_VALUE), String.valueOf(columnIndexCreationInfo.hasNulls()));
properties.setProperty(getKeyFor(column, HAS_DICTIONARY), String.valueOf(hasDictionary));
properties.setProperty(getKeyFor(column, TEXT_INDEX_TYPE), textIndexType.name());
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, HAS_INVERTED_INDEX),
- String.valueOf(hasInvertedIndex));
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, IS_SINGLE_VALUED),
- String.valueOf(fieldSpec.isSingleValueField()));
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, MAX_MULTI_VALUE_ELEMTS),
+ properties.setProperty(getKeyFor(column, HAS_INVERTED_INDEX), String.valueOf(hasInvertedIndex));
+ properties.setProperty(getKeyFor(column, IS_SINGLE_VALUED), String.valueOf(fieldSpec.isSingleValueField()));
+ properties.setProperty(getKeyFor(column, MAX_MULTI_VALUE_ELEMTS),
String.valueOf(columnIndexCreationInfo.getMaxNumberOfMultiValueElements()));
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
+ properties.setProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES),
String.valueOf(columnIndexCreationInfo.getTotalNumberOfEntries()));
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, IS_AUTO_GENERATED),
- String.valueOf(columnIndexCreationInfo.isAutoGenerated()));
+ properties
+ .setProperty(getKeyFor(column, IS_AUTO_GENERATED), String.valueOf(columnIndexCreationInfo.isAutoGenerated()));
PartitionFunction partitionFunction = columnIndexCreationInfo.getPartitionFunction();
if (partitionFunction != null) {
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, PARTITION_FUNCTION),
- partitionFunction.toString());
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, NUM_PARTITIONS),
- columnIndexCreationInfo.getNumPartitions());
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, PARTITION_VALUES),
- columnIndexCreationInfo.getPartitions());
+ properties.setProperty(getKeyFor(column, PARTITION_FUNCTION), partitionFunction.toString());
+ properties.setProperty(getKeyFor(column, NUM_PARTITIONS), columnIndexCreationInfo.getNumPartitions());
+ properties.setProperty(getKeyFor(column, PARTITION_VALUES), columnIndexCreationInfo.getPartitions());
}
// datetime field
if (fieldSpec.getFieldType().equals(FieldType.DATE_TIME)) {
DateTimeFieldSpec dateTimeFieldSpec = (DateTimeFieldSpec) fieldSpec;
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DATETIME_FORMAT),
- dateTimeFieldSpec.getFormat());
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DATETIME_GRANULARITY),
- dateTimeFieldSpec.getGranularity());
+ properties.setProperty(getKeyFor(column, DATETIME_FORMAT), dateTimeFieldSpec.getFormat());
+ properties.setProperty(getKeyFor(column, DATETIME_GRANULARITY), dateTimeFieldSpec.getGranularity());
}
- Object defaultNullValue = columnIndexCreationInfo.getDefaultNullValue();
- if (defaultNullValue instanceof byte[]) {
- String defaultNullValueString = BytesUtils.toHexString((byte[]) defaultNullValue);
- properties
- .setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DEFAULT_NULL_VALUE), defaultNullValueString);
+ String minValue = columnIndexCreationInfo.getMin().toString();
+ String maxValue = columnIndexCreationInfo.getMax().toString();
+ String defaultNullValue = columnIndexCreationInfo.getDefaultNullValue().toString();
+ if (dataType == DataType.STRING) {
+ // Check special characters for STRING column
+ if (isValidPropertyValue(minValue)) {
+ properties.setProperty(getKeyFor(column, MIN_VALUE), minValue);
+ }
+ if (isValidPropertyValue(maxValue)) {
+ properties.setProperty(getKeyFor(column, MAX_VALUE), maxValue);
+ }
+ if (isValidPropertyValue(defaultNullValue)) {
+ properties.setProperty(getKeyFor(column, DEFAULT_NULL_VALUE), defaultNullValue);
+ }
} else {
- properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DEFAULT_NULL_VALUE),
- String.valueOf(defaultNullValue));
+ properties.setProperty(getKeyFor(column, MIN_VALUE), minValue);
+ properties.setProperty(getKeyFor(column, MAX_VALUE), maxValue);
+ properties.setProperty(getKeyFor(column, DEFAULT_NULL_VALUE), defaultNullValue);
}
}
public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties, String column, String minValue,
String maxValue) {
- properties.setProperty(getKeyFor(column, MIN_VALUE), minValue);
- properties.setProperty(getKeyFor(column, MAX_VALUE), maxValue);
+ // Check special characters for STRING column
+ if (isValidPropertyValue(minValue)) {
+ properties.setProperty(getKeyFor(column, MIN_VALUE), minValue);
+ }
+ if (isValidPropertyValue(maxValue)) {
+ properties.setProperty(getKeyFor(column, MAX_VALUE), maxValue);
+ }
+ }
+
+ /**
+ * Helper method to check whether the given value is a valid property value.
+ * <p>Value is invalid iff:
+ * <ul>
+ * <li>It contains leading/trailing whitespace</li>
+ * <li>It contains list separator (',')</li>
+ * </ul>
+ */
+ @VisibleForTesting
+ static boolean isValidPropertyValue(String value) {
+ int length = value.length();
+ if (length == 0) {
+ return true;
+ }
+ if (Character.isWhitespace(value.charAt(0)) || Character.isWhitespace(value.charAt(length - 1))) {
+ return false;
+ }
+ return value.indexOf(',') == -1;
}
public static void removeColumnMetadataInfo(PropertiesConfiguration properties, String column) {
- properties.clearProperty(getKeyFor(column, CARDINALITY));
- properties.clearProperty(getKeyFor(column, TOTAL_DOCS));
- properties.clearProperty(getKeyFor(column, DATA_TYPE));
- properties.clearProperty(getKeyFor(column, BITS_PER_ELEMENT));
- properties.clearProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE));
- properties.clearProperty(getKeyFor(column, COLUMN_TYPE));
- properties.clearProperty(getKeyFor(column, IS_SORTED));
- properties.clearProperty(getKeyFor(column, HAS_NULL_VALUE));
- properties.clearProperty(getKeyFor(column, HAS_DICTIONARY));
- properties.clearProperty(getKeyFor(column, HAS_INVERTED_INDEX));
- properties.clearProperty(getKeyFor(column, IS_SINGLE_VALUED));
- properties.clearProperty(getKeyFor(column, MAX_MULTI_VALUE_ELEMTS));
- properties.clearProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES));
- properties.clearProperty(getKeyFor(column, IS_AUTO_GENERATED));
- properties.clearProperty(getKeyFor(column, DEFAULT_NULL_VALUE));
- properties.clearProperty(getKeyFor(column, MIN_VALUE));
- properties.clearProperty(getKeyFor(column, MAX_VALUE));
+ properties.subset(COLUMN_PROPS_KEY_PREFIX + column).clear();
}
/**
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentIndexCreationDriverImpl.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentIndexCreationDriverImpl.java
index e0137b6..3658896 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentIndexCreationDriverImpl.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentIndexCreationDriverImpl.java
@@ -60,6 +60,7 @@ import org.apache.pinot.spi.data.readers.FileFormat;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.apache.pinot.spi.data.readers.RecordReader;
import org.apache.pinot.spi.data.readers.RecordReaderFactory;
+import org.apache.pinot.spi.utils.ByteArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -344,6 +345,7 @@ public class SegmentIndexCreationDriverImpl implements SegmentIndexCreationDrive
*/
void buildIndexCreationInfo()
throws Exception {
+ Set<String> varLengthDictionaryColumns = new HashSet<>(config.getVarLengthDictionaryColumns());
for (FieldSpec fieldSpec : dataSchema.getAllFieldSpecs()) {
// Ignore virtual columns
if (fieldSpec.isVirtualColumn()) {
@@ -352,10 +354,12 @@ public class SegmentIndexCreationDriverImpl implements SegmentIndexCreationDrive
String columnName = fieldSpec.getName();
ColumnStatistics columnProfile = segmentStats.getColumnProfileFor(columnName);
- Set<String> varLengthDictionaryColumns = new HashSet<>(config.getVarLengthDictionaryColumns());
+ Object defaultNullValue = fieldSpec.getDefaultNullValue();
+ if (fieldSpec.getDataType() == FieldSpec.DataType.BYTES) {
+ defaultNullValue = new ByteArray((byte[]) defaultNullValue);
+ }
indexCreationInfoMap.put(columnName, new ColumnIndexCreationInfo(columnProfile, true/*createDictionary*/,
- varLengthDictionaryColumns.contains(columnName), false/*isAutoGenerated*/,
- dataSchema.getFieldSpecFor(columnName).getDefaultNullValue()));
+ varLengthDictionaryColumns.contains(columnName), false/*isAutoGenerated*/, defaultNullValue));
}
segmentIndexCreationInfo.setTotalDocs(totalDocs);
}
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/V1Constants.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/V1Constants.java
index 524eff1..0498f8c 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/V1Constants.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/V1Constants.java
@@ -89,7 +89,7 @@ public class V1Constants {
public static final String DATETIME_GRANULARITY = "datetimeGranularity";
public static final String TEXT_INDEX_TYPE = "textIndexType";
- private static final String COLUMN_PROPS_KEY_PREFIX = "column.";
+ public static final String COLUMN_PROPS_KEY_PREFIX = "column.";
public static String getKeyFor(String column, String key) {
return COLUMN_PROPS_KEY_PREFIX + column + "." + key;
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java
index e946b52..91c1f7f 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java
@@ -18,14 +18,15 @@
*/
package org.apache.pinot.core.segment.index.loader.columnminmaxvalue;
+import com.google.common.base.Preconditions;
import java.io.FileOutputStream;
import java.util.HashSet;
import java.util.Set;
-
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator;
import org.apache.pinot.core.segment.index.metadata.ColumnMetadata;
import org.apache.pinot.core.segment.index.metadata.SegmentMetadataImpl;
+import org.apache.pinot.core.segment.index.readers.BytesDictionary;
import org.apache.pinot.core.segment.index.readers.DoubleDictionary;
import org.apache.pinot.core.segment.index.readers.FloatDictionary;
import org.apache.pinot.core.segment.index.readers.IntDictionary;
@@ -37,8 +38,6 @@ import org.apache.pinot.core.segment.store.SegmentDirectory;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
-import com.clearspring.analytics.util.Preconditions;
-
public class ColumnMinMaxValueGenerator {
private final SegmentMetadataImpl _segmentMetadata;
@@ -56,7 +55,8 @@ public class ColumnMinMaxValueGenerator {
_columnMinMaxValueGeneratorMode = columnMinMaxValueGeneratorMode;
}
- public void addColumnMinMaxValue() throws Exception {
+ public void addColumnMinMaxValue()
+ throws Exception {
Preconditions.checkState(_columnMinMaxValueGeneratorMode != ColumnMinMaxValueGeneratorMode.NONE);
Schema schema = _segmentMetadata.getSchema();
@@ -78,10 +78,12 @@ public class ColumnMinMaxValueGenerator {
saveMetadata();
}
- private void addColumnMinMaxValueForColumn(String columnName) throws Exception {
+ private void addColumnMinMaxValueForColumn(String columnName)
+ throws Exception {
// Skip column without dictionary or with min/max value already set
ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
- if ((!columnMetadata.hasDictionary()) || (columnMetadata.getMinValue() != null)) {
+ if (!columnMetadata.hasDictionary() || columnMetadata.getMinValue() != null
+ || columnMetadata.getMaxValue() != null) {
return;
}
@@ -91,33 +93,46 @@ public class ColumnMinMaxValueGenerator {
switch (dataType) {
case INT:
try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
- SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
- intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1));
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, intDictionary.getStringValue(0),
+ intDictionary.getStringValue(length - 1));
}
break;
case LONG:
try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
- SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
- longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1));
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, longDictionary.getStringValue(0),
+ longDictionary.getStringValue(length - 1));
}
break;
case FLOAT:
try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
- SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
- floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1));
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, floatDictionary.getStringValue(0),
+ floatDictionary.getStringValue(length - 1));
}
break;
case DOUBLE:
try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
- SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
- doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1));
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, doubleDictionary.getStringValue(0),
+ doubleDictionary.getStringValue(length - 1));
}
break;
case STRING:
try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
columnMetadata.getColumnMaxLength(), (byte) columnMetadata.getPaddingCharacter())) {
- SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, stringDictionary.get(0),
- stringDictionary.get(length - 1));
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, stringDictionary.getStringValue(0),
+ stringDictionary.getStringValue(length - 1));
+ }
+ break;
+ case BYTES:
+ try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
+ columnMetadata.getColumnMaxLength())) {
+ SegmentColumnarIndexCreator
+ .addColumnMinMaxValueInfo(_segmentProperties, columnName, bytesDictionary.getStringValue(0),
+ bytesDictionary.getStringValue(length - 1));
}
break;
default:
@@ -127,7 +142,8 @@ public class ColumnMinMaxValueGenerator {
_minMaxValueAdded = true;
}
- private void saveMetadata() throws Exception {
+ private void saveMetadata()
+ throws Exception {
if (_minMaxValueAdded) {
// Commons Configuration 1.10 does not support file path containing '%'.
// Explicitly providing the output stream for the file bypasses the problem.
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java
index 32c82b0..01f8a6e 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java
@@ -385,8 +385,8 @@ public abstract class BaseDefaultColumnHandler implements DefaultColumnHandler {
new DefaultColumnStatistics(defaultValue /* min */, defaultValue /* max */, sortedArray, false, totalDocs, 0);
ColumnIndexCreationInfo columnIndexCreationInfo =
- new ColumnIndexCreationInfo(columnStatistics, false/*createDictionary*/, false,
- true/*isAutoGenerated*/, defaultValue/*defaultNullValue*/);
+ new ColumnIndexCreationInfo(columnStatistics, false/*createDictionary*/, false, true/*isAutoGenerated*/,
+ defaultValue/*defaultNullValue*/);
// Add the column metadata information to the metadata properties.
SegmentColumnarIndexCreator
@@ -401,15 +401,15 @@ public abstract class BaseDefaultColumnHandler implements DefaultColumnHandler {
*/
protected void createColumnV1Indices(String column)
throws Exception {
- final FieldSpec fieldSpec = _schema.getFieldSpecFor(column);
+ FieldSpec fieldSpec = _schema.getFieldSpecFor(column);
Preconditions.checkNotNull(fieldSpec);
// Generate column index creation information.
- final int totalDocs = _segmentMetadata.getTotalDocs();
- final DataType dataType = fieldSpec.getDataType();
- final Object defaultValue = fieldSpec.getDefaultNullValue();
- final boolean isSingleValue = fieldSpec.isSingleValueField();
- final int maxNumberOfMultiValueElements = isSingleValue ? 0 : 1;
+ int totalDocs = _segmentMetadata.getTotalDocs();
+ DataType dataType = fieldSpec.getDataType();
+ Object defaultValue = fieldSpec.getDefaultNullValue();
+ boolean isSingleValue = fieldSpec.isSingleValueField();
+ int maxNumberOfMultiValueElements = isSingleValue ? 0 : 1;
int dictionaryElementSize = 0;
Object sortedArray;
@@ -440,7 +440,10 @@ public abstract class BaseDefaultColumnHandler implements DefaultColumnHandler {
case BYTES:
Preconditions.checkState(defaultValue instanceof byte[]);
dictionaryElementSize = ((byte[]) defaultValue).length;
- sortedArray = new ByteArray[]{new ByteArray((byte[]) defaultValue)};
+ // Convert byte[] to ByteArray for internal usage
+ ByteArray bytesDefaultValue = new ByteArray((byte[]) defaultValue);
+ defaultValue = bytesDefaultValue;
+ sortedArray = new ByteArray[]{bytesDefaultValue};
break;
default:
throw new UnsupportedOperationException("Unsupported data type: " + dataType + " for column: " + column);
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/metadata/ColumnMetadata.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/metadata/ColumnMetadata.java
index 9169123..43ea2a2 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/metadata/ColumnMetadata.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/metadata/ColumnMetadata.java
@@ -37,6 +37,7 @@ import org.apache.pinot.spi.data.FieldSpec.FieldType;
import org.apache.pinot.spi.data.MetricFieldSpec;
import org.apache.pinot.spi.data.TimeFieldSpec;
import org.apache.pinot.spi.data.TimeGranularitySpec;
+import org.apache.pinot.spi.utils.BytesUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -124,7 +125,7 @@ public class ColumnMetadata {
// Set min/max value if available.
String minString = config.getString(getKeyFor(column, MIN_VALUE), null);
String maxString = config.getString(getKeyFor(column, MAX_VALUE), null);
- if ((minString != null) && (maxString != null)) {
+ if (minString != null && maxString != null) {
switch (dataType) {
case INT:
builder.setMinValue(Integer.valueOf(minString));
@@ -146,6 +147,10 @@ public class ColumnMetadata {
builder.setMinValue(minString);
builder.setMaxValue(maxString);
break;
+ case BYTES:
+ builder.setMinValue(BytesUtils.toByteArray(minString));
+ builder.setMaxValue(BytesUtils.toByteArray(maxString));
+ break;
default:
throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + column);
}
diff --git a/pinot-core/src/test/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreatorTest.java b/pinot-core/src/test/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreatorTest.java
new file mode 100644
index 0000000..eb6da38
--- /dev/null
+++ b/pinot-core/src/test/java/org/apache/pinot/core/segment/creator/impl/SegmentColumnarIndexCreatorTest.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.commons.configuration.PropertiesConfiguration;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+
+public class SegmentColumnarIndexCreatorTest {
+ private static final File TEMP_DIR = new File(FileUtils.getTempDirectory(), "SegmentColumnarIndexCreatorTest");
+ private static final File CONFIG_FILE = new File(TEMP_DIR, "config");
+ private static final String PROPERTY_KEY = "testKey";
+ private static final String COLUMN_NAME = "testColumn";
+ private static final String COLUMN_PROPERTY_KEY_PREFIX =
+ V1Constants.MetadataKeys.Column.COLUMN_PROPS_KEY_PREFIX + COLUMN_NAME + ".";
+ private static final int NUM_ROUNDS = 1000;
+
+ @BeforeClass
+ public void setUp()
+ throws IOException {
+ FileUtils.deleteDirectory(TEMP_DIR);
+ }
+
+ @Test
+ public void testPropertyValueWithSpecialCharacters()
+ throws Exception {
+ // Leading/trailing whitespace
+ assertFalse(SegmentColumnarIndexCreator.isValidPropertyValue(" a"));
+ assertFalse(SegmentColumnarIndexCreator.isValidPropertyValue("a\t"));
+ assertFalse(SegmentColumnarIndexCreator.isValidPropertyValue("\na"));
+
+ // Whitespace in the middle
+ assertTrue(SegmentColumnarIndexCreator.isValidPropertyValue("a\t b"));
+ testPropertyValueWithSpecialCharacters("a\t b");
+ assertTrue(SegmentColumnarIndexCreator.isValidPropertyValue("a \nb"));
+ testPropertyValueWithSpecialCharacters("a \nb");
+
+ // List separator
+ assertFalse(SegmentColumnarIndexCreator.isValidPropertyValue("a,b,c"));
+ assertFalse(SegmentColumnarIndexCreator.isValidPropertyValue(",a b"));
+
+ // Empty string
+ assertTrue(SegmentColumnarIndexCreator.isValidPropertyValue(""));
+ testPropertyValueWithSpecialCharacters("");
+
+ for (int i = 0; i < NUM_ROUNDS; i++) {
+ testPropertyValueWithSpecialCharacters(RandomStringUtils.randomAscii(5));
+ testPropertyValueWithSpecialCharacters(RandomStringUtils.random(5));
+ }
+ }
+
+ private void testPropertyValueWithSpecialCharacters(String value)
+ throws Exception {
+ if (SegmentColumnarIndexCreator.isValidPropertyValue(value)) {
+ PropertiesConfiguration configuration = new PropertiesConfiguration(CONFIG_FILE);
+ configuration.setProperty(PROPERTY_KEY, value);
+ assertEquals(configuration.getString(PROPERTY_KEY), value);
+ configuration.save();
+
+ configuration = new PropertiesConfiguration(CONFIG_FILE);
+ assertEquals(configuration.getString(PROPERTY_KEY), value);
+ }
+ }
+
+ @Test
+ public void testRemoveColumnMetadataInfo()
+ throws Exception {
+ PropertiesConfiguration configuration = new PropertiesConfiguration(CONFIG_FILE);
+ configuration.setProperty(COLUMN_PROPERTY_KEY_PREFIX + "a", "foo");
+ configuration.setProperty(COLUMN_PROPERTY_KEY_PREFIX + "b", "bar");
+ configuration.setProperty(COLUMN_PROPERTY_KEY_PREFIX + "c", "foobar");
+ configuration.save();
+
+ configuration = new PropertiesConfiguration(CONFIG_FILE);
+ assertTrue(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "a"));
+ assertTrue(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "b"));
+ assertTrue(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "c"));
+ SegmentColumnarIndexCreator.removeColumnMetadataInfo(configuration, COLUMN_NAME);
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "a"));
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "b"));
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "c"));
+ configuration.save();
+
+ configuration = new PropertiesConfiguration(CONFIG_FILE);
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "a"));
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "b"));
+ assertFalse(configuration.containsKey(COLUMN_PROPERTY_KEY_PREFIX + "c"));
+ }
+
+ @AfterClass
+ public void tearDown()
+ throws IOException {
+ FileUtils.deleteDirectory(TEMP_DIR);
+ }
+}
diff --git a/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/SegmentPreProcessorTest.java b/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/SegmentPreProcessorTest.java
index f2e1271..5d57d37 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/SegmentPreProcessorTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/segment/index/loader/SegmentPreProcessorTest.java
@@ -25,7 +25,9 @@ import java.nio.file.attribute.FileTime;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
+import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.common.segment.ReadMode;
import org.apache.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
@@ -33,6 +35,7 @@ import org.apache.pinot.core.indexsegment.generator.SegmentVersion;
import org.apache.pinot.core.segment.creator.SegmentIndexCreationDriver;
import org.apache.pinot.core.segment.creator.TextIndexType;
import org.apache.pinot.core.segment.creator.impl.SegmentCreationDriverFactory;
+import org.apache.pinot.core.segment.creator.impl.V1Constants;
import org.apache.pinot.core.segment.index.converter.SegmentV1V2ToV3FormatConverter;
import org.apache.pinot.core.segment.index.loader.columnminmaxvalue.ColumnMinMaxValueGeneratorMode;
import org.apache.pinot.core.segment.index.metadata.ColumnMetadata;
@@ -45,6 +48,8 @@ import org.apache.pinot.spi.config.table.TableConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
+import org.apache.pinot.spi.utils.ByteArray;
+import org.apache.pinot.spi.utils.BytesUtils;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
@@ -444,13 +449,21 @@ public class SegmentPreProcessorTest {
SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(_indexDir);
ColumnMetadata hllMetricMetadata = segmentMetadata.getColumnMetadataFor(NEW_HLL_BYTE_METRIC_COLUMN_NAME);
Assert.assertEquals(hllMetricMetadata.getDataType(), FieldSpec.DataType.BYTES);
- Assert.assertEquals(hllMetricMetadata.getDefaultNullValueString(),
- "00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000");
+ String expectedDefaultValueString =
+ "00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000";
+ ByteArray expectedDefaultValue = BytesUtils.toByteArray(expectedDefaultValueString);
+ Assert.assertEquals(hllMetricMetadata.getMinValue(), expectedDefaultValue);
+ Assert.assertEquals(hllMetricMetadata.getMaxValue(), expectedDefaultValue);
+ Assert.assertEquals(hllMetricMetadata.getDefaultNullValueString(), expectedDefaultValueString);
ColumnMetadata tDigestMetricMetadata = segmentMetadata.getColumnMetadataFor(NEW_TDIGEST_BYTE_METRIC_COLUMN_NAME);
Assert.assertEquals(tDigestMetricMetadata.getDataType(), FieldSpec.DataType.BYTES);
- Assert.assertEquals(tDigestMetricMetadata.getDefaultNullValueString(),
- "0000000141ba085ee15d2f3241ba085ee15d2f324059000000000000000000013ff000000000000041ba085ee15d2f32");
+ expectedDefaultValueString =
+ "0000000141ba085ee15d2f3241ba085ee15d2f324059000000000000000000013ff000000000000041ba085ee15d2f32";
+ expectedDefaultValue = BytesUtils.toByteArray(expectedDefaultValueString);
+ Assert.assertEquals(tDigestMetricMetadata.getMinValue(), expectedDefaultValue);
+ Assert.assertEquals(tDigestMetricMetadata.getMaxValue(), expectedDefaultValue);
+ Assert.assertEquals(tDigestMetricMetadata.getDefaultNullValueString(), expectedDefaultValueString);
}
private void checkUpdateDefaultColumns(IndexLoadingConfig indexLoadingConfig)
@@ -478,29 +491,41 @@ public class SegmentPreProcessorTest {
Assert.assertEquals(columnMetadata.getMaxNumberOfMultiValues(), 0);
Assert.assertEquals(columnMetadata.getTotalNumberOfEntries(), 100000);
Assert.assertTrue(columnMetadata.isAutoGenerated());
+ Assert.assertEquals(columnMetadata.getMinValue(), 1);
+ Assert.assertEquals(columnMetadata.getMaxValue(), 1);
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "1");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_LONG_METRIC_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.LONG);
+ Assert.assertEquals(columnMetadata.getMinValue(), 0L);
+ Assert.assertEquals(columnMetadata.getMaxValue(), 0L);
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "0");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_FLOAT_METRIC_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.FLOAT);
+ Assert.assertEquals(columnMetadata.getMinValue(), 0f);
+ Assert.assertEquals(columnMetadata.getMaxValue(), 0f);
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "0.0");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_DOUBLE_METRIC_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.DOUBLE);
+ Assert.assertEquals(columnMetadata.getMinValue(), 0.0);
+ Assert.assertEquals(columnMetadata.getMaxValue(), 0.0);
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "0.0");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_BOOLEAN_SV_DIMENSION_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.STRING);
Assert.assertEquals(columnMetadata.getColumnMaxLength(), 5);
Assert.assertEquals(columnMetadata.getFieldType(), FieldSpec.FieldType.DIMENSION);
+ Assert.assertEquals(columnMetadata.getMinValue(), "false");
+ Assert.assertEquals(columnMetadata.getMaxValue(), "false");
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "false");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_INT_SV_DIMENSION_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.INT);
- Assert.assertEquals(columnMetadata.getDefaultNullValueString(), String.valueOf(Integer.MIN_VALUE));
+ Assert.assertEquals(columnMetadata.getMinValue(), Integer.MIN_VALUE);
+ Assert.assertEquals(columnMetadata.getMaxValue(), Integer.MIN_VALUE);
+ Assert.assertEquals(columnMetadata.getDefaultNullValueString(), Integer.toString(Integer.MIN_VALUE));
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_STRING_MV_DIMENSION_COLUMN_NAME);
Assert.assertEquals(columnMetadata.getDataType(), FieldSpec.DataType.STRING);
@@ -509,6 +534,8 @@ public class SegmentPreProcessorTest {
Assert.assertFalse(columnMetadata.isSingleValue());
Assert.assertEquals(columnMetadata.getMaxNumberOfMultiValues(), 1);
Assert.assertEquals(columnMetadata.getTotalNumberOfEntries(), 100000);
+ Assert.assertEquals(columnMetadata.getMinValue(), "null");
+ Assert.assertEquals(columnMetadata.getMaxValue(), "null");
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "null");
// Check dictionary and forward index exist.
@@ -540,17 +567,33 @@ public class SegmentPreProcessorTest {
// Check column metadata.
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_INT_METRIC_COLUMN_NAME);
+ Assert.assertEquals(columnMetadata.getMinValue(), 2);
+ Assert.assertEquals(columnMetadata.getMaxValue(), 2);
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "2");
columnMetadata = segmentMetadata.getColumnMetadataFor(NEW_STRING_MV_DIMENSION_COLUMN_NAME);
+ Assert.assertEquals(columnMetadata.getMinValue(), "abcd");
+ Assert.assertEquals(columnMetadata.getMaxValue(), "abcd");
Assert.assertEquals(columnMetadata.getDefaultNullValueString(), "abcd");
}
@Test
- public void testAddColumnMinMaxValue()
+ public void testColumnMinMaxValue()
throws Exception {
constructV1Segment();
+ // Remove min/max value from the metadata
+ PropertiesConfiguration configuration = SegmentMetadataImpl.getPropertiesConfiguration(_indexDir);
+ Iterator<String> keys = configuration.getKeys();
+ while (keys.hasNext()) {
+ String key = keys.next();
+ if (key.endsWith(V1Constants.MetadataKeys.Column.MIN_VALUE) || key
+ .endsWith(V1Constants.MetadataKeys.Column.MAX_VALUE)) {
+ configuration.clearProperty(key);
+ }
+ }
+ configuration.save();
+
IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig();
indexLoadingConfig.setColumnMinMaxValueGeneratorMode(ColumnMinMaxValueGeneratorMode.NONE);
try (SegmentPreProcessor processor = new SegmentPreProcessor(_indexDir, indexLoadingConfig, null)) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org