Posted to commits@iceberg.apache.org by bl...@apache.org on 2023/05/21 22:22:38 UTC

[iceberg] branch master updated: Core, Parquet: Remove Parquet dictionary encoding table property (#7665)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new fa9d0dad7d Core, Parquet: Remove Parquet dictionary encoding table property (#7665)
fa9d0dad7d is described below

commit fa9d0dad7d0ae1c32cbf640019fba0182d400490
Author: Amogh Jahagirdar <ja...@amazon.com>
AuthorDate: Sun May 21 15:22:32 2023 -0700

    Core, Parquet: Remove Parquet dictionary encoding table property (#7665)
    
    Co-authored-by: Fokko Driesprong <fo...@apache.org>
---
 .../java/org/apache/iceberg/TableProperties.java   |  3 --
 docs/configuration.md                              |  3 +-
 .../java/org/apache/iceberg/parquet/Parquet.java   | 39 +++++++++++-----------
 .../iceberg/parquet/TestBloomRowGroupFilter.java   |  4 +--
 4 files changed, 23 insertions(+), 26 deletions(-)

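After this commit, dictionary encoding is controlled through Parquet's own property key, ParquetOutputFormat.ENABLE_DICTIONARY ("parquet.enable.dictionary"), which defaults to true; the Iceberg-specific write.parquet.enable.dictionary property is removed. A minimal sketch of disabling it on a single writer, mirroring the test change further below (the class name, outFile, and schema are placeholders, and GenericParquetWriter is one possible writer function, not part of this commit):

    import java.io.IOException;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.data.Record;
    import org.apache.iceberg.data.parquet.GenericParquetWriter;
    import org.apache.iceberg.io.FileAppender;
    import org.apache.iceberg.io.OutputFile;
    import org.apache.iceberg.parquet.Parquet;
    import org.apache.parquet.hadoop.ParquetOutputFormat;

    class DisableDictionaryExample {
      static FileAppender<Record> newAppender(OutputFile outFile, Schema schema) throws IOException {
        return Parquet.write(outFile)
            .schema(schema)
            .createWriterFunc(GenericParquetWriter::buildWriter)
            // Parquet's "parquet.enable.dictionary" key replaces the removed
            // write.parquet.enable.dictionary table property
            .set(ParquetOutputFormat.ENABLE_DICTIONARY, "false")
            .build();
      }
    }
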
diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java
index 770865a569..b14354def6 100644
--- a/core/src/main/java/org/apache/iceberg/TableProperties.java
+++ b/core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -135,9 +135,6 @@ public class TableProperties {
   public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit";
   public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000;
 
-  public static final String PARQUET_DICT_ENABLED = "write.parquet.enable.dictionary";
-  public static final boolean PARQUET_DICT_ENABLED_DEFAULT = true;
-
   public static final String PARQUET_DICT_SIZE_BYTES = "write.parquet.dict-size-bytes";
   public static final String DELETE_PARQUET_DICT_SIZE_BYTES =
       "write.delete.parquet.dict-size-bytes";
diff --git a/docs/configuration.md b/docs/configuration.md
index 99a25fec12..7fa2d94adf 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -54,8 +54,7 @@ Iceberg tables support table properties to configure table behavior, like the de
 | write.parquet.row-group-size-bytes                   | 134217728 (128 MB)          | Parquet row group size                                                                                                                                                                            |
 | write.parquet.page-size-bytes                        | 1048576 (1 MB)              | Parquet page size                                                                                                                                                                                 |
 | write.parquet.page-row-limit                         | 20000                       | Parquet page row limit                                                                                                                                                                            |
- | write.parquet.dictionary.enabled                     | true                        | Enable dictionary encoding                                                                                                                                                                        |
- | write.parquet.dict-size-bytes                        | 2097152 (2 MB)              | Parquet dictionary page size                                                                                                                                                                      |
+| write.parquet.dict-size-bytes                        | 2097152 (2 MB)              | Parquet dictionary page size                                                                                                                                                                      |
 | write.parquet.compression-codec                      | gzip                        | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed                                                                                                                          |
 | write.parquet.compression-level                      | null                        | Parquet compression level                                                                                                                                                                         |
 | write.parquet.bloom-filter-enabled.column.col1       | (not set)                   | Hint to parquet to write a bloom filter for the column: col1                                                                                                                                      |
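
The configuration docs now list only the dictionary page size. To toggle dictionary encoding at the table level, the Parquet key itself can be set as a table property; a sketch under the assumption that writer config is derived from table properties, which is how the dataContext/deleteContext changes below read it (`table` is an already-loaded handle):

    import org.apache.iceberg.Table;
    import org.apache.parquet.hadoop.ParquetOutputFormat;

    class SetDictionaryProperty {
      static void disableDictionary(Table table) {
        // Assumption: writer config includes table properties, so setting
        // "parquet.enable.dictionary" here reaches the Parquet writer.
        table.updateProperties()
            .set(ParquetOutputFormat.ENABLE_DICTIONARY, "false")
            .commit();
      }
    }
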
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
index 7caeb7c7cd..cdfb9d59b0 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -33,8 +33,6 @@ import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;
 import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT;
 import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
 import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED_DEFAULT;
 import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES;
 import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT;
 import static org.apache.iceberg.TableProperties.PARQUET_PAGE_ROW_LIMIT;
@@ -100,6 +98,7 @@ import org.apache.parquet.column.ParquetProperties;
 import org.apache.parquet.column.ParquetProperties.WriterVersion;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.ParquetFileWriter;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
 import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.parquet.hadoop.ParquetWriter;
 import org.apache.parquet.hadoop.api.ReadSupport;
@@ -246,7 +245,6 @@ public class Parquet {
       int rowGroupSize = context.rowGroupSize();
       int pageSize = context.pageSize();
       int pageRowLimit = context.pageRowLimit();
-      boolean dictionaryEnabled = context.dictionaryEnabled();
       int dictionaryPageSize = context.dictionaryPageSize();
       String compressionLevel = context.compressionLevel();
       CompressionCodecName codec = context.codec();
@@ -254,6 +252,7 @@ public class Parquet {
       int rowGroupCheckMaxRecordCount = context.rowGroupCheckMaxRecordCount();
       int bloomFilterMaxBytes = context.bloomFilterMaxBytes();
       Map<String, String> columnBloomFilterEnabled = context.columnBloomFilterEnabled();
+      boolean dictionaryEnabled = context.dictionaryEnabled();
 
       if (compressionLevel != null) {
         switch (codec) {
@@ -344,7 +343,6 @@ public class Parquet {
       private final int rowGroupSize;
       private final int pageSize;
       private final int pageRowLimit;
-      private final boolean dictionaryEnabled;
       private final int dictionaryPageSize;
       private final CompressionCodecName codec;
       private final String compressionLevel;
@@ -352,23 +350,23 @@ public class Parquet {
       private final int rowGroupCheckMaxRecordCount;
       private final int bloomFilterMaxBytes;
       private final Map<String, String> columnBloomFilterEnabled;
+      private final boolean dictionaryEnabled;
 
       private Context(
           int rowGroupSize,
           int pageSize,
           int pageRowLimit,
-          boolean dictionaryEnabled,
           int dictionaryPageSize,
           CompressionCodecName codec,
           String compressionLevel,
           int rowGroupCheckMinRecordCount,
           int rowGroupCheckMaxRecordCount,
           int bloomFilterMaxBytes,
-          Map<String, String> columnBloomFilterEnabled) {
+          Map<String, String> columnBloomFilterEnabled,
+          boolean dictionaryEnabled) {
         this.rowGroupSize = rowGroupSize;
         this.pageSize = pageSize;
         this.pageRowLimit = pageRowLimit;
-        this.dictionaryEnabled = dictionaryEnabled;
         this.dictionaryPageSize = dictionaryPageSize;
         this.codec = codec;
         this.compressionLevel = compressionLevel;
@@ -376,6 +374,7 @@ public class Parquet {
         this.rowGroupCheckMaxRecordCount = rowGroupCheckMaxRecordCount;
         this.bloomFilterMaxBytes = bloomFilterMaxBytes;
         this.columnBloomFilterEnabled = columnBloomFilterEnabled;
+        this.dictionaryEnabled = dictionaryEnabled;
       }
 
       static Context dataContext(Map<String, String> config) {
@@ -394,10 +393,6 @@ public class Parquet {
                 config, PARQUET_PAGE_ROW_LIMIT, PARQUET_PAGE_ROW_LIMIT_DEFAULT);
         Preconditions.checkArgument(pageRowLimit > 0, "Page row count limit must be > 0");
 
-        boolean dictionaryEnabled =
-            PropertyUtil.propertyAsBoolean(
-                config, PARQUET_DICT_ENABLED, PARQUET_DICT_ENABLED_DEFAULT);
-
         int dictionaryPageSize =
             PropertyUtil.propertyAsInt(
                 config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT);
@@ -437,18 +432,21 @@ public class Parquet {
         Map<String, String> columnBloomFilterEnabled =
             PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);
 
+        boolean dictionaryEnabled =
+            PropertyUtil.propertyAsBoolean(config, ParquetOutputFormat.ENABLE_DICTIONARY, true);
+
         return new Context(
             rowGroupSize,
             pageSize,
             pageRowLimit,
-            dictionaryEnabled,
             dictionaryPageSize,
             codec,
             compressionLevel,
             rowGroupCheckMinRecordCount,
             rowGroupCheckMaxRecordCount,
             bloomFilterMaxBytes,
-            columnBloomFilterEnabled);
+            columnBloomFilterEnabled,
+            dictionaryEnabled);
       }
 
       static Context deleteContext(Map<String, String> config) {
@@ -509,18 +507,21 @@ public class Parquet {
         Map<String, String> columnBloomFilterEnabled =
             PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);
 
+        boolean dictionaryEnabled =
+            PropertyUtil.propertyAsBoolean(config, ParquetOutputFormat.ENABLE_DICTIONARY, true);
+
         return new Context(
             rowGroupSize,
             pageSize,
             pageRowLimit,
-            dataContext.dictionaryEnabled(),
             dictionaryPageSize,
             codec,
             compressionLevel,
             rowGroupCheckMinRecordCount,
             rowGroupCheckMaxRecordCount,
             bloomFilterMaxBytes,
-            columnBloomFilterEnabled);
+            columnBloomFilterEnabled,
+            dictionaryEnabled);
       }
 
       private static CompressionCodecName toCodec(String codecAsString) {
@@ -543,10 +544,6 @@ public class Parquet {
         return pageRowLimit;
       }
 
-      boolean dictionaryEnabled() {
-        return dictionaryEnabled;
-      }
-
       int dictionaryPageSize() {
         return dictionaryPageSize;
       }
@@ -574,6 +571,10 @@ public class Parquet {
       Map<String, String> columnBloomFilterEnabled() {
         return columnBloomFilterEnabled;
       }
+
+      boolean dictionaryEnabled() {
+        return dictionaryEnabled;
+      }
     }
   }
 
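Both dataContext and deleteContext now resolve the flag identically, and deleteContext no longer inherits it from dataContext. A self-contained sketch of that resolution, using the same PropertyUtil call as the diff above (the class name is a placeholder):

    import java.util.Map;
    import org.apache.iceberg.util.PropertyUtil;
    import org.apache.parquet.hadoop.ParquetOutputFormat;

    public class DictionaryFlagCheck {
      public static void main(String[] args) {
        // Same call as the new contexts: read "parquet.enable.dictionary",
        // defaulting to true when the key is absent from the config map.
        Map<String, String> config = Map.of(ParquetOutputFormat.ENABLE_DICTIONARY, "false");
        boolean dictionaryEnabled =
            PropertyUtil.propertyAsBoolean(config, ParquetOutputFormat.ENABLE_DICTIONARY, true);
        System.out.println(dictionaryEnabled); // prints false; true when unset
      }
    }
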
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
index acb318a232..756639ab12 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
@@ -19,7 +19,6 @@
 package org.apache.iceberg.parquet;
 
 import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX;
-import static org.apache.iceberg.TableProperties.PARQUET_DICT_ENABLED;
 import static org.apache.iceberg.avro.AvroSchemaUtil.convert;
 import static org.apache.iceberg.expressions.Expressions.and;
 import static org.apache.iceberg.expressions.Expressions.equal;
@@ -71,6 +70,7 @@ import org.apache.iceberg.types.Types.UUIDType;
 import org.apache.parquet.column.values.bloomfilter.BloomFilter;
 import org.apache.parquet.hadoop.BloomFilterReader;
 import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.schema.MessageType;
@@ -198,7 +198,7 @@ public class TestBloomRowGroupFilter {
     try (FileAppender<Record> appender =
         Parquet.write(outFile)
             .schema(FILE_SCHEMA)
-            .set(PARQUET_DICT_ENABLED, "false")
+            .set(ParquetOutputFormat.ENABLE_DICTIONARY, "false")
             .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_id", "true")
             .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long", "true")
             .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_double", "true")