You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2018/08/20 20:36:07 UTC

[4/4] impala git commit: IMPALA-7453. Intern HdfsStorageDescriptors

IMPALA-7453. Intern HdfsStorageDescriptors

The number of unique HdfsStorageDescriptors in a warehouse is typically
much smaller than the number of partitions. Each object takes 32/40 bytes
(with/without compressed OOPs respectively). So, by interning these
objects, we can save that amount of memory as well as one object per
partition.

The overall savings aren't huge (on the order of tens of MBs) but the
change is pretty simple so seems worthwhile.

This patch also pulls in the errorprone annotations into the pom so that
errorprone can ensure that the class can be annotated as Immutable.
errorprone checks that classes annotated as Immutable only contain
immutable fields.

I tested this change by comparing 'jmap -histo:live' on a catalogd
before/after. For my local dev environment test warehouse, I had 12055
instances (385kb) before the change and 24 instances (768 bytes) after.

Change-Id: I9ef93148d629b060fa9f67c631e9c3d904a0ccf9
Reviewed-on: http://gerrit.cloudera.org:8080/11236
Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d2930028
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d2930028
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d2930028

Branch: refs/heads/master
Commit: d29300281b5d07c1ed98032536c008b076a7baa5
Parents: dccc2de
Author: Todd Lipcon <to...@apache.org>
Authored: Tue Aug 7 19:01:43 2018 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Mon Aug 20 20:32:42 2018 +0000

----------------------------------------------------------------------
 fe/pom.xml                                      |  6 ++
 .../apache/impala/catalog/HdfsPartition.java    | 16 ++---
 .../impala/catalog/HdfsStorageDescriptor.java   | 71 +++++++++++++++++---
 .../impala/catalog/local/LocalKuduTable.java    |  3 +-
 .../catalog/local/LocalPartitionSpec.java       |  6 +-
 .../org/apache/impala/catalog/CatalogTest.java  | 10 +++
 6 files changed, 86 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/pom.xml
----------------------------------------------------------------------
diff --git a/fe/pom.xml b/fe/pom.xml
index a120536..c695a5b 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -454,6 +454,12 @@ under the License.
     </dependency>
 
     <dependency>
+        <groupId>com.google.errorprone</groupId>
+        <artifactId>error_prone_annotations</artifactId>
+        <version>2.3.1</version>
+    </dependency>
+
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <version>${junit.version}</version>

http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java b/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
index 05b66f7..14ce4e3 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
@@ -459,7 +459,7 @@ public class HdfsPartition implements FeFsPartition, PrunablePartition {
    * we. We should therefore treat mixing formats inside one partition as user error.
    * It's easy to add per-file metadata to FileDescriptor if this changes.
    */
-  private final HdfsStorageDescriptor fileFormatDescriptor_;
+  private HdfsStorageDescriptor fileFormatDescriptor_;
 
   /**
    * The file descriptors of this partition, encoded as flatbuffers. Storing the raw
@@ -563,7 +563,8 @@ public class HdfsPartition implements FeFsPartition, PrunablePartition {
    * format classes.
    */
   public void setFileFormat(HdfsFileFormat fileFormat) {
-    fileFormatDescriptor_.setFileFormat(fileFormat);
+    fileFormatDescriptor_ = fileFormatDescriptor_.cloneWithChangedFileFormat(
+        fileFormat);
     cachedMsPartitionDescriptor_.sdInputFormat = fileFormat.inputFormat();
     cachedMsPartitionDescriptor_.sdOutputFormat = fileFormat.outputFormat();
     cachedMsPartitionDescriptor_.sdSerdeInfo.setSerializationLib(
@@ -811,15 +812,8 @@ public class HdfsPartition implements FeFsPartition, PrunablePartition {
 
   public static HdfsPartition fromThrift(HdfsTable table,
       long id, THdfsPartition thriftPartition) {
-    HdfsStorageDescriptor storageDesc = new HdfsStorageDescriptor(table.getName(),
-        HdfsFileFormat.fromThrift(thriftPartition.getFileFormat()),
-        thriftPartition.lineDelim,
-        thriftPartition.fieldDelim,
-        thriftPartition.collectionDelim,
-        thriftPartition.mapKeyDelim,
-        thriftPartition.escapeChar,
-        (byte) '"', // TODO: We should probably add quoteChar to THdfsPartition.
-        thriftPartition.blockSize);
+    HdfsStorageDescriptor storageDesc = HdfsStorageDescriptor.fromThriftPartition(
+        thriftPartition, table.getName());
 
     List<LiteralExpr> literalExpr = Lists.newArrayList();
     if (id != CatalogObjectsConstants.PROTOTYPE_PARTITION_ID) {

http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
index f51b10e..d1c0f9d 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
@@ -17,22 +17,32 @@
 
 package org.apache.impala.catalog;
 
-import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 import org.apache.hadoop.hive.metastore.api.SerDeInfo;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.impala.thrift.THdfsPartition;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Interner;
+import com.google.common.collect.Interners;
 import com.google.common.collect.Maps;
+import com.google.errorprone.annotations.Immutable;
 
 /**
  * Represents the file format metadata for files stored in a table or partition.
+ *
+ * This class is immutable, and instances are stored in a global interner, since
+ * the number of distinct combinations of file formats and other parameters
+ * is typically quite low relative to the total number of partitions (e.g. almost
+ * all partitions will use the default quoting).
  */
+@Immutable
 public class HdfsStorageDescriptor {
   public static final char DEFAULT_LINE_DELIM = '\n';
   // hive by default uses ctrl-a as field delim
@@ -46,14 +56,14 @@ public class HdfsStorageDescriptor {
   // Important: don't change the ordering of these keys - if e.g. FIELD_DELIM is not
   // found, the value of LINE_DELIM is used, so LINE_DELIM must be found first.
   // Package visible for testing.
-  final static List<String> DELIMITER_KEYS = ImmutableList.of(
+  final static ImmutableList<String> DELIMITER_KEYS = ImmutableList.of(
       serdeConstants.LINE_DELIM, serdeConstants.FIELD_DELIM,
       serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM,
       serdeConstants.ESCAPE_CHAR, serdeConstants.QUOTE_CHAR);
 
   // The Parquet serde shows up multiple times as the location of the implementation
   // has changed between Impala versions.
-  final static List<String> COMPATIBLE_SERDES = ImmutableList.of(
+  final static ImmutableList<String> COMPATIBLE_SERDES = ImmutableList.of(
       "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", // (seq / text / parquet)
       "org.apache.hadoop.hive.serde2.avro.AvroSerDe", // (avro)
       "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe", // (rc)
@@ -65,7 +75,10 @@ public class HdfsStorageDescriptor {
 
   private final static Logger LOG = LoggerFactory.getLogger(HdfsStorageDescriptor.class);
 
-  private HdfsFileFormat fileFormat_;
+  private final static Interner<HdfsStorageDescriptor> INTERNER =
+      Interners.newWeakInterner();
+
+  private final HdfsFileFormat fileFormat_;
   private final byte lineDelim_;
   private final byte fieldDelim_;
   private final byte collectionDelim_;
@@ -74,10 +87,6 @@ public class HdfsStorageDescriptor {
   private final byte quoteChar_;
   private final int blockSize_;
 
-  public void setFileFormat(HdfsFileFormat fileFormat) {
-    fileFormat_ = fileFormat;
-  }
-
   /**
    * Returns a map from delimiter key to a single delimiter character,
    * filling in defaults if explicit values are not found in the supplied
@@ -151,7 +160,7 @@ public class HdfsStorageDescriptor {
     return null;
   }
 
-  public HdfsStorageDescriptor(String tblName, HdfsFileFormat fileFormat, byte lineDelim,
+  private HdfsStorageDescriptor(String tblName, HdfsFileFormat fileFormat, byte lineDelim,
       byte fieldDelim, byte collectionDelim, byte mapKeyDelim, byte escapeChar,
       byte quoteChar, int blockSize) {
     this.fileFormat_ = fileFormat;
@@ -216,7 +225,7 @@ public class HdfsStorageDescriptor {
     }
 
     try {
-      return new HdfsStorageDescriptor(tblName,
+      return INTERNER.intern(new HdfsStorageDescriptor(tblName,
           HdfsFileFormat.fromJavaClassName(sd.getInputFormat()),
           delimMap.get(serdeConstants.LINE_DELIM),
           delimMap.get(serdeConstants.FIELD_DELIM),
@@ -224,13 +233,30 @@ public class HdfsStorageDescriptor {
           delimMap.get(serdeConstants.MAPKEY_DELIM),
           delimMap.get(serdeConstants.ESCAPE_CHAR),
           delimMap.get(serdeConstants.QUOTE_CHAR),
-          blockSize);
+          blockSize));
     } catch (IllegalArgumentException ex) {
       // Thrown by fromJavaClassName
       throw new InvalidStorageDescriptorException(ex);
     }
   }
 
+  public static HdfsStorageDescriptor fromThriftPartition(THdfsPartition thriftPartition,
+      String tableName) {
+    return INTERNER.intern(new HdfsStorageDescriptor(tableName,
+        HdfsFileFormat.fromThrift(thriftPartition.getFileFormat()),
+        thriftPartition.lineDelim, thriftPartition.fieldDelim,
+        thriftPartition.collectionDelim, thriftPartition.mapKeyDelim,
+        thriftPartition.escapeChar,
+        (byte) '"', // TODO: We should probably add quoteChar to THdfsPartition.
+        thriftPartition.blockSize));
+  }
+
+  public HdfsStorageDescriptor cloneWithChangedFileFormat(HdfsFileFormat newFormat) {
+    return INTERNER.intern(new HdfsStorageDescriptor(
+        "<unknown>", newFormat, lineDelim_, fieldDelim_, collectionDelim_, mapKeyDelim_,
+        escapeChar_, quoteChar_, blockSize_));
+  }
+
   public byte getLineDelim() { return lineDelim_; }
   public byte getFieldDelim() { return fieldDelim_; }
   public byte getCollectionDelim() { return collectionDelim_; }
@@ -238,4 +264,27 @@ public class HdfsStorageDescriptor {
   public byte getEscapeChar() { return escapeChar_; }
   public HdfsFileFormat getFileFormat() { return fileFormat_; }
   public int getBlockSize() { return blockSize_; }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(blockSize_, collectionDelim_, escapeChar_, fieldDelim_,
+        fileFormat_, lineDelim_, mapKeyDelim_, quoteChar_);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    HdfsStorageDescriptor other = (HdfsStorageDescriptor) obj;
+    if (blockSize_ != other.blockSize_) return false;
+    if (collectionDelim_ != other.collectionDelim_) return false;
+    if (escapeChar_ != other.escapeChar_) return false;
+    if (fieldDelim_ != other.fieldDelim_) return false;
+    if (fileFormat_ != other.fileFormat_) return false;
+    if (lineDelim_ != other.lineDelim_) return false;
+    if (mapKeyDelim_ != other.mapKeyDelim_) return false;
+    if (quoteChar_ != other.quoteChar_) return false;
+    return true;
+  }
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/src/main/java/org/apache/impala/catalog/local/LocalKuduTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalKuduTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalKuduTable.java
index 8a6bb67..2095449 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/LocalKuduTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalKuduTable.java
@@ -22,8 +22,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-import javax.annotation.concurrent.Immutable;
-
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.impala.analysis.ColumnDef;
@@ -46,6 +44,7 @@ import org.apache.kudu.client.KuduException;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
+import com.google.errorprone.annotations.Immutable;
 
 public class LocalKuduTable extends LocalTable implements FeKuduTable {
   private final TableParams tableParams_;

http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/src/main/java/org/apache/impala/catalog/local/LocalPartitionSpec.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalPartitionSpec.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalPartitionSpec.java
index f48f54e..690a4bf 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/LocalPartitionSpec.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalPartitionSpec.java
@@ -19,8 +19,6 @@ package org.apache.impala.catalog.local;
 
 import java.util.List;
 
-import javax.annotation.concurrent.Immutable;
-
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.impala.analysis.LiteralExpr;
 import org.apache.impala.catalog.CatalogException;
@@ -30,6 +28,7 @@ import org.apache.impala.util.MetaStoreUtil;
 
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
+import com.google.errorprone.annotations.Immutable;
 
 /**
  * Specification of a partition of a {@link LocalFsTable} containing
@@ -39,6 +38,9 @@ import com.google.common.collect.ImmutableList;
 class LocalPartitionSpec implements PrunablePartition {
   private final long id_;
   private final String name_;
+
+  // LiteralExprs are technically mutable prior to analysis.
+  @SuppressWarnings("Immutable")
   private final ImmutableList<LiteralExpr> partitionValues_;
 
   LocalPartitionSpec(LocalFsTable table, String partName, long id) {

http://git-wip-us.apache.org/repos/asf/impala/blob/d2930028/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
index ad1595b..bc90820 100644
--- a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
@@ -27,8 +27,10 @@ import static org.junit.Assert.assertTrue;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -359,6 +361,8 @@ public class CatalogTest {
     // check that partition keys cover the date range 1/1/2009-12/31/2010
     // and that we have one file per partition.
     assertEquals(24, partitions.size());
+    Set<HdfsStorageDescriptor> uniqueSds = Collections.newSetFromMap(
+        new IdentityHashMap<HdfsStorageDescriptor, Boolean>());
     Set<Long> months = Sets.newHashSet();
     for (FeFsPartition p: partitions) {
       assertEquals(2, p.getPartitionValues().size());
@@ -380,8 +384,14 @@ public class CatalogTest {
         // no need for this boolean anymore.
         assertEquals(p.getFileDescriptors().size(), 1);
       }
+
+      uniqueSds.add(p.getInputFormatDescriptor());
     }
     assertEquals(months.size(), 24);
+
+    // We intern storage descriptors, so we should only have a single instance across
+    // all of the partitions.
+    assertEquals(1, uniqueSds.size());
   }
 
   // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,