You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2023/05/21 22:22:38 UTC
[iceberg] branch master updated: Core, Parquet: Remove Parquet dictionary encoding table property (#7665)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new fa9d0dad7d Core, Parquet: Remove Parquet dictionary encoding table property (#7665)
fa9d0dad7d is described below
commit fa9d0dad7d0ae1c32cbf640019fba0182d400490
Author: Amogh Jahagirdar <ja...@amazon.com>
AuthorDate: Sun May 21 15:22:32 2023 -0700
Core, Parquet: Remove Parquet dictionary encoding table property (#7665)
Co-authored-by: Fokko Driesprong <fo...@apache.org>
---
.../java/org/apache/iceberg/TableProperties.java | 3 --
docs/configuration.md | 3 +-
.../java/org/apache/iceberg/parquet/Parquet.java | 39 +++++++++++-----------
.../iceberg/parquet/TestBloomRowGroupFilter.java | 4 +--
4 files changed, 23 insertions(+), 26 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java
index 770865a569..b14354def6 100644
--- a/core/src/main/java/org/apache/iceberg/TableProperties.java
+++ b/core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -135,9 +135,6 @@ public class TableProperties {
public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit";
public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000;
- public static final String PARQUET_DICT_ENABLED = "write.parquet.enable.dictionary";
- public static final boolean PARQUET_DICT_ENABLED_DEFAULT = true;
-
public static final String PARQUET_DICT_SIZE_BYTES = "write.parquet.dict-size-bytes";
public static final String DELETE_PARQUET_DICT_SIZE_BYTES =
"write.delete.parquet.dict-size-bytes";
diff --git a/docs/configuration.md b/docs/configuration.md
index 99a25fec12..7fa2d94adf 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -54,8 +54,7 @@ Iceberg tables support table properties to configure table behavior, like the de
| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size |
| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size |
| write.parquet.page-row-limit | 20000 | Parquet page row limit |
- | write.parquet.dictionary.enabled | true | Enable dictionary encoding |
- | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size |
+| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size |
| write.parquet.compression-codec | gzip | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed |
| write.parquet.compression-level | null | Parquet compression level |
| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: col1 |
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
index 7caeb7c7cd..cdfb9d59b0 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -33,8 +33,6 @@ import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_PAGE_ROW_LIMIT;
@@ -100,6 +98,7 @@ import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.ReadSupport;
@@ -246,7 +245,6 @@ public class Parquet {
int rowGroupSize = context.rowGroupSize();
int pageSize = context.pageSize();
int pageRowLimit = context.pageRowLimit();
- boolean dictionaryEnabled = context.dictionaryEnabled();
int dictionaryPageSize = context.dictionaryPageSize();
String compressionLevel = context.compressionLevel();
CompressionCodecName codec = context.codec();
@@ -254,6 +252,7 @@ public class Parquet {
int rowGroupCheckMaxRecordCount = context.rowGroupCheckMaxRecordCount();
int bloomFilterMaxBytes = context.bloomFilterMaxBytes();
Map<String, String> columnBloomFilterEnabled = context.columnBloomFilterEnabled();
+ boolean dictionaryEnabled = context.dictionaryEnabled();
if (compressionLevel != null) {
switch (codec) {
@@ -344,7 +343,6 @@ public class Parquet {
private final int rowGroupSize;
private final int pageSize;
private final int pageRowLimit;
- private final boolean dictionaryEnabled;
private final int dictionaryPageSize;
private final CompressionCodecName codec;
private final String compressionLevel;
@@ -352,23 +350,23 @@ public class Parquet {
private final int rowGroupCheckMaxRecordCount;
private final int bloomFilterMaxBytes;
private final Map<String, String> columnBloomFilterEnabled;
+ private final boolean dictionaryEnabled;
private Context(
int rowGroupSize,
int pageSize,
int pageRowLimit,
- boolean dictionaryEnabled,
int dictionaryPageSize,
CompressionCodecName codec,
String compressionLevel,
int rowGroupCheckMinRecordCount,
int rowGroupCheckMaxRecordCount,
int bloomFilterMaxBytes,
- Map<String, String> columnBloomFilterEnabled) {
+ Map<String, String> columnBloomFilterEnabled,
+ boolean dictionaryEnabled) {
this.rowGroupSize = rowGroupSize;
this.pageSize = pageSize;
this.pageRowLimit = pageRowLimit;
- this.dictionaryEnabled = dictionaryEnabled;
this.dictionaryPageSize = dictionaryPageSize;
this.codec = codec;
this.compressionLevel = compressionLevel;
@@ -376,6 +374,7 @@ public class Parquet {
this.rowGroupCheckMaxRecordCount = rowGroupCheckMaxRecordCount;
this.bloomFilterMaxBytes = bloomFilterMaxBytes;
this.columnBloomFilterEnabled = columnBloomFilterEnabled;
+ this.dictionaryEnabled = dictionaryEnabled;
}
static Context dataContext(Map<String, String> config) {
@@ -394,10 +393,6 @@ public class Parquet {
config, PARQUET_PAGE_ROW_LIMIT, PARQUET_PAGE_ROW_LIMIT_DEFAULT);
Preconditions.checkArgument(pageRowLimit > 0, "Page row count limit must be > 0");
- boolean dictionaryEnabled =
- PropertyUtil.propertyAsBoolean(
- config, PARQUET_DICT_ENABLED, PARQUET_DICT_ENABLED_DEFAULT);
-
int dictionaryPageSize =
PropertyUtil.propertyAsInt(
config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT);
@@ -437,18 +432,21 @@ public class Parquet {
Map<String, String> columnBloomFilterEnabled =
PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);
+ boolean dictionaryEnabled =
+ PropertyUtil.propertyAsBoolean(config, ParquetOutputFormat.ENABLE_DICTIONARY, true);
+
return new Context(
rowGroupSize,
pageSize,
pageRowLimit,
- dictionaryEnabled,
dictionaryPageSize,
codec,
compressionLevel,
rowGroupCheckMinRecordCount,
rowGroupCheckMaxRecordCount,
bloomFilterMaxBytes,
- columnBloomFilterEnabled);
+ columnBloomFilterEnabled,
+ dictionaryEnabled);
}
static Context deleteContext(Map<String, String> config) {
@@ -509,18 +507,21 @@ public class Parquet {
Map<String, String> columnBloomFilterEnabled =
PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);
+ boolean dictionaryEnabled =
+ PropertyUtil.propertyAsBoolean(config, ParquetOutputFormat.ENABLE_DICTIONARY, true);
+
return new Context(
rowGroupSize,
pageSize,
pageRowLimit,
- dataContext.dictionaryEnabled(),
dictionaryPageSize,
codec,
compressionLevel,
rowGroupCheckMinRecordCount,
rowGroupCheckMaxRecordCount,
bloomFilterMaxBytes,
- columnBloomFilterEnabled);
+ columnBloomFilterEnabled,
+ dictionaryEnabled);
}
private static CompressionCodecName toCodec(String codecAsString) {
@@ -543,10 +544,6 @@ public class Parquet {
return pageRowLimit;
}
- boolean dictionaryEnabled() {
- return dictionaryEnabled;
- }
-
int dictionaryPageSize() {
return dictionaryPageSize;
}
@@ -574,6 +571,10 @@ public class Parquet {
Map<String, String> columnBloomFilterEnabled() {
return columnBloomFilterEnabled;
}
+
+ boolean dictionaryEnabled() {
+ return dictionaryEnabled;
+ }
}
}
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
index acb318a232..756639ab12 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
@@ -19,7 +19,6 @@
package org.apache.iceberg.parquet;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
import static org.apache.iceberg.avro.AvroSchemaUtil.convert;
import static org.apache.iceberg.expressions.Expressions.and;
import static org.apache.iceberg.expressions.Expressions.equal;
@@ -71,6 +70,7 @@ import org.apache.iceberg.types.Types.UUIDType;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.hadoop.BloomFilterReader;
import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.schema.MessageType;
@@ -198,7 +198,7 @@ public class TestBloomRowGroupFilter {
try (FileAppender<Record> appender =
Parquet.write(outFile)
.schema(FILE_SCHEMA)
- .set(PARQUET_DICT_ENABLED, "false")
+ .set(ParquetOutputFormat.ENABLE_DICTIONARY, "false")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_id", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_double", "true")