You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2022/07/08 18:24:37 UTC
[iceberg] branch master updated: Core: Update MetricsConfig to use a default for first 32 columns (#5215)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new a25381773 Core: Update MetricsConfig to use a default for first 32 columns (#5215)
a25381773 is described below
commit a253817734dd2ffa576714a33f3a148c2a83927e
Author: Ryan Blue <bl...@apache.org>
AuthorDate: Fri Jul 8 11:24:33 2022 -0700
Core: Update MetricsConfig to use a default for first 32 columns (#5215)
---
.../java/org/apache/iceberg/MetricsConfig.java | 103 +++++++++++++--------
.../java/org/apache/iceberg/TableProperties.java | 4 +
.../java/org/apache/iceberg/TestMetricsModes.java | 25 +++++
.../org/apache/iceberg/io/TestWriterMetrics.java | 24 +++--
4 files changed, 111 insertions(+), 45 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/MetricsConfig.java b/core/src/main/java/org/apache/iceberg/MetricsConfig.java
index 2ae01dad2..3e7a55d01 100644
--- a/core/src/main/java/org/apache/iceberg/MetricsConfig.java
+++ b/core/src/main/java/org/apache/iceberg/MetricsConfig.java
@@ -29,6 +29,8 @@ import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SerializableMap;
import org.apache.iceberg.util.SortOrderUtil;
import org.slf4j.Logger;
@@ -36,6 +38,8 @@ import org.slf4j.LoggerFactory;
import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE;
import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT;
+import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS;
+import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT;
import static org.apache.iceberg.TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX;
@Immutable
@@ -45,9 +49,8 @@ public final class MetricsConfig implements Serializable {
private static final Joiner DOT = Joiner.on('.');
// Disable metrics by default for wide tables to prevent excessive metadata
- private static final int MAX_COLUMNS = 32;
- private static final MetricsConfig DEFAULT = new MetricsConfig(ImmutableMap.of(),
- MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT));
+ private static final MetricsMode DEFAULT_MODE = MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT);
+ private static final MetricsConfig DEFAULT = new MetricsConfig(ImmutableMap.of(), DEFAULT_MODE);
private final Map<String, MetricsMode> columnModes;
private final MetricsMode defaultMode;
@@ -96,7 +99,7 @@ public final class MetricsConfig implements Serializable {
**/
@Deprecated
public static MetricsConfig fromProperties(Map<String, String> props) {
- return from(props, null, DEFAULT_WRITE_METRICS_MODE_DEFAULT);
+ return from(props, null, null);
}
/**
@@ -104,13 +107,7 @@ public final class MetricsConfig implements Serializable {
* @param table iceberg table
*/
public static MetricsConfig forTable(Table table) {
- String defaultMode;
- if (table.schema().columns().size() <= MAX_COLUMNS) {
- defaultMode = DEFAULT_WRITE_METRICS_MODE_DEFAULT;
- } else {
- defaultMode = MetricsModes.None.get().toString();
- }
- return from(table.properties(), table.sortOrder(), defaultMode);
+ return from(table.properties(), table.schema(), table.sortOrder());
}
/**
@@ -136,50 +133,55 @@ public final class MetricsConfig implements Serializable {
}
/**
- * Generate a MetricsConfig for all columns based on overrides, sortOrder, and defaultMode.
+ * Generate a MetricsConfig for all columns based on overrides, schema, and sort order.
+ *
* @param props will be read for metrics overrides (write.metadata.metrics.column.*) and default
* (write.metadata.metrics.default)
+ * @param schema table schema
* @param order sort order columns, will be promoted to truncate(16)
- * @param defaultMode default, if not set by user property
* @return metrics configuration
*/
- private static MetricsConfig from(Map<String, String> props, SortOrder order, String defaultMode) {
+ private static MetricsConfig from(Map<String, String> props, Schema schema, SortOrder order) {
+ int maxInferredDefaultColumns = maxInferredColumnDefaults(props);
Map<String, MetricsMode> columnModes = Maps.newHashMap();
// Handle user override of default mode
- MetricsMode finalDefaultMode;
- String defaultModeAsString = props.getOrDefault(DEFAULT_WRITE_METRICS_MODE, defaultMode);
- try {
- finalDefaultMode = MetricsModes.fromString(defaultModeAsString);
- } catch (IllegalArgumentException err) {
- // User override was invalid, log the error and use the default
- LOG.warn("Ignoring invalid default metrics mode: {}", defaultModeAsString, err);
- finalDefaultMode = MetricsModes.fromString(defaultMode);
+ MetricsMode defaultMode;
+ String configuredDefault = props.get(DEFAULT_WRITE_METRICS_MODE);
+ if (configuredDefault != null) {
+ // a user-configured default mode is applied for all columns
+ defaultMode = parseMode(configuredDefault, DEFAULT_MODE, "default");
+
+ } else if (schema == null || schema.columns().size() <= maxInferredDefaultColumns) {
+ // there are less than the inferred limit, so the default is used everywhere
+ defaultMode = DEFAULT_MODE;
+
+ } else {
+ // an inferred default mode is applied to the first few columns, up to the limit
+ Schema subSchema = new Schema(schema.columns().subList(0, maxInferredDefaultColumns));
+ for (Integer id : TypeUtil.getProjectedIds(subSchema)) {
+ columnModes.put(subSchema.findColumnName(id), DEFAULT_MODE);
+ }
+
+ // all other columns don't use metrics
+ defaultMode = MetricsModes.None.get();
}
// First set sorted column with sorted column default (can be overridden by user)
- MetricsMode sortedColDefaultMode = sortedColumnDefaultMode(finalDefaultMode);
+ MetricsMode sortedColDefaultMode = sortedColumnDefaultMode(defaultMode);
Set<String> sortedCols = SortOrderUtil.orderPreservingSortedColumns(order);
sortedCols.forEach(sc -> columnModes.put(sc, sortedColDefaultMode));
// Handle user overrides of defaults
- MetricsMode finalDefaultModeVal = finalDefaultMode;
- props.keySet().stream()
- .filter(key -> key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX))
- .forEach(key -> {
- String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, "");
- MetricsMode mode;
- try {
- mode = MetricsModes.fromString(props.get(key));
- } catch (IllegalArgumentException err) {
- // Mode was invalid, log the error and use the default
- LOG.warn("Ignoring invalid metrics mode for column {}: {}", columnAlias, props.get(key), err);
- mode = finalDefaultModeVal;
- }
- columnModes.put(columnAlias, mode);
- });
+ for (String key : props.keySet()) {
+ if (key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX)) {
+ String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, "");
+ MetricsMode mode = parseMode(props.get(key), defaultMode, "column " + columnAlias);
+ columnModes.put(columnAlias, mode);
+ }
+ }
- return new MetricsConfig(columnModes, finalDefaultMode);
+ return new MetricsConfig(columnModes, defaultMode);
}
/**
@@ -195,6 +197,29 @@ public final class MetricsConfig implements Serializable {
}
}
+ private static int maxInferredColumnDefaults(Map<String, String> properties) {
+ int maxInferredDefaultColumns = PropertyUtil.propertyAsInt(properties,
+ METRICS_MAX_INFERRED_COLUMN_DEFAULTS, METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT);
+ if (maxInferredDefaultColumns < 0) {
+ LOG.warn("Invalid value for {} (negative): {}, falling back to {}",
+ METRICS_MAX_INFERRED_COLUMN_DEFAULTS, maxInferredDefaultColumns,
+ METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT);
+ return METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT;
+ } else {
+ return maxInferredDefaultColumns;
+ }
+ }
+
+ private static MetricsMode parseMode(String modeString, MetricsMode fallback, String context) {
+ try {
+ return MetricsModes.fromString(modeString);
+ } catch (IllegalArgumentException err) {
+ // User override was invalid, log the error and use the default
+ LOG.warn("Ignoring invalid metrics mode ({}): {}", context, modeString, err);
+ return fallback;
+ }
+ }
+
public void validateReferencedColumns(Schema schema) {
for (String column : columnModes.keySet()) {
ValidationException.check(
diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java
index e7b309f81..3cc520faa 100644
--- a/core/src/main/java/org/apache/iceberg/TableProperties.java
+++ b/core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -267,6 +267,10 @@ public class TableProperties {
public static final String METADATA_DELETE_AFTER_COMMIT_ENABLED = "write.metadata.delete-after-commit.enabled";
public static final boolean METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT = false;
+ public static final String METRICS_MAX_INFERRED_COLUMN_DEFAULTS =
+ "write.metadata.metrics.max-inferred-column-defaults";
+ public static final int METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT = 32;
+
public static final String METRICS_MODE_COLUMN_CONF_PREFIX = "write.metadata.metrics.column.";
public static final String DEFAULT_WRITE_METRICS_MODE = "write.metadata.metrics.default";
public static final String DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)";
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java
index 8660f81aa..939811067 100644
--- a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java
+++ b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java
@@ -20,6 +20,7 @@
package org.apache.iceberg;
import java.io.File;
+import java.io.IOException;
import java.util.Map;
import org.apache.iceberg.MetricsModes.Counts;
import org.apache.iceberg.MetricsModes.Full;
@@ -160,4 +161,28 @@ public class TestMetricsModes {
Assert.assertEquals("Original default applies as user entered invalid mode for sorted column",
Counts.get(), config.columnMode("col2"));
}
+
+ @Test
+ public void testMetricsConfigInferredDefaultModeLimit() throws IOException {
+ Schema schema = new Schema(
+ required(1, "col1", Types.IntegerType.get()),
+ required(2, "col2", Types.IntegerType.get()),
+ required(3, "col3", Types.IntegerType.get())
+ );
+
+ File tableDir = temp.newFolder();
+ Assert.assertTrue(tableDir.delete());
+
+ Table table = TestTables.create(
+ tableDir, "test", schema, PartitionSpec.unpartitioned(), SortOrder.unsorted(), formatVersion);
+
+ // only infer a default for the first two columns
+ table.updateProperties().set(TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, "2").commit();
+
+ MetricsConfig config = MetricsConfig.forTable(table);
+
+ Assert.assertEquals("Should use default mode for col1", Truncate.withLength(16), config.columnMode("col1"));
+ Assert.assertEquals("Should use default mode for col2", Truncate.withLength(16), config.columnMode("col2"));
+ Assert.assertEquals("Should use None for col3", None.get(), config.columnMode("col3"));
+ }
}
diff --git a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java
index 86a9eaa0b..c1575b467 100644
--- a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java
+++ b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java
@@ -237,12 +237,24 @@ public abstract class TestWriterMetrics<T> {
dataWriter.close();
DataFile dataFile = dataWriter.toDataFile();
- // No field should have metrics
- Assert.assertTrue("Should not have any lower bound metrics", dataFile.lowerBounds().isEmpty());
- Assert.assertTrue("Should not have any upper bound metrics", dataFile.upperBounds().isEmpty());
- Assert.assertTrue("Should not have any nan value metrics", dataFile.nanValueCounts().isEmpty());
- Assert.assertTrue("Should not have any null value metrics", dataFile.nullValueCounts().isEmpty());
- Assert.assertTrue("Should not have any value metrics", dataFile.valueCounts().isEmpty());
+ // start at 1 because IDs were reassigned in the table
+ int id = 1;
+ for (; id <= 32; id += 1) {
+ Assert.assertNotNull("Should have lower bound metrics", dataFile.lowerBounds().get(id));
+ Assert.assertNotNull("Should have upper bound metrics", dataFile.upperBounds().get(id));
+ Assert.assertNull("Should not have nan value metrics (not floating point)", dataFile.nanValueCounts().get(id));
+ Assert.assertNotNull("Should have null value metrics", dataFile.nullValueCounts().get(id));
+ Assert.assertNotNull("Should have value metrics", dataFile.valueCounts().get(id));
+ }
+
+ // Remaining fields should not have metrics
+ for (; id <= numColumns; id += 1) {
+ Assert.assertNull("Should not have any lower bound metrics", dataFile.lowerBounds().get(id));
+ Assert.assertNull("Should not have any upper bound metrics", dataFile.upperBounds().get(id));
+ Assert.assertNull("Should not have any nan value metrics", dataFile.nanValueCounts().get(id));
+ Assert.assertNull("Should not have any null value metrics", dataFile.nullValueCounts().get(id));
+ Assert.assertNull("Should not have any value metrics", dataFile.valueCounts().get(id));
+ }
}
@Test