Posted to commits@impala.apache.org by bo...@apache.org on 2020/10/07 14:07:08 UTC

[impala] 06/07: IMPALA-10205: Replace MD5 with Murmur3 for generating datafile path hash

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit b907a1498d798ccf4ac89089b947d08d21b84814
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Thu Oct 1 16:59:52 2020 -0700

    IMPALA-10205: Replace MD5 with Murmur3 for generating datafile path hash
    
    Current code generates the data file path hash with MD5 for Iceberg
    tables, but MD5 is one of the algorithms forbidden by FIPS. Even for
    non-security purposes, such as hash map keys, MD5 must not be used.
    This patch replaces MD5 with the non-cryptographic hash function
    murmur3_128, which produces a hash value of the same length as MD5
    (128 bits).
    
    Testing:
     - Passed core tests.
    
    Change-Id: If7c805f2fdf0cf5a69738579c7e55f4bd047ed59
    Reviewed-on: http://gerrit.cloudera.org:8080/16534
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
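
[Editor's note: for reference, the substance of the change boils down to
swapping one Guava hash function for another; both render as a 32-character
hex string, so the map keys keep the same shape. A minimal standalone sketch,
not part of the patch (class name and sample path are made up):

    import com.google.common.hash.Hashing;

    public class PathHashSketch {
      public static void main(String[] args) {
        String path = "/warehouse/db/tbl/data/00000-0-data.parquet";

        // Old key: an MD5 hex digest (MD5 is forbidden under FIPS, even
        // for non-cryptographic uses such as map keys).
        String md5Key = Hashing.md5().newHasher()
            .putUnencodedChars(path).hash().toString();

        // New key: a 128-bit Murmur3 hex digest of the same length.
        String murmurKey = Hashing.murmur3_128().newHasher()
            .putUnencodedChars(path).hash().toString();

        System.out.println(md5Key.length() + " " + murmurKey.length()); // 32 32
      }
    }
]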
---
 common/thrift/CatalogObjects.thrift                        |  4 ++--
 .../java/org/apache/impala/catalog/FeIcebergTable.java     |  8 ++++----
 .../main/java/org/apache/impala/catalog/IcebergTable.java  | 14 +++++++-------
 .../org/apache/impala/catalog/local/LocalIcebergTable.java |  8 ++++----
 .../java/org/apache/impala/planner/IcebergScanNode.java    |  4 ++--
 fe/src/main/java/org/apache/impala/util/IcebergUtil.java   |  6 +++---
 6 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/common/thrift/CatalogObjects.thrift b/common/thrift/CatalogObjects.thrift
index fed739b..457d0dd 100644
--- a/common/thrift/CatalogObjects.thrift
+++ b/common/thrift/CatalogObjects.thrift
@@ -540,8 +540,8 @@ struct TIcebergTable {
   1: required string table_location
   2: required list<TIcebergPartitionSpec> partition_spec
   3: required i32 default_partition_spec_id
-  // Data file path md5 and it's file descriptor
-  4: optional map<string,THdfsFileDesc> path_md5_to_file_descriptor
+  // Map from 128-bit Murmur3 hash of data file path to its file descriptor
+  4: optional map<string,THdfsFileDesc> path_hash_to_file_descriptor
 }
 
 // Represents a table or view.
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index bdc189f..11dddd1 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -66,7 +66,7 @@ public interface FeIcebergTable extends FeFsTable {
   /**
    * FileDescriptor map
    */
-  Map<String, HdfsPartition.FileDescriptor> getPathMD5ToFileDescMap();
+  Map<String, HdfsPartition.FileDescriptor> getPathHashToFileDescMap();
 
   /**
    * Return the hdfs table transformed from iceberg table
@@ -304,8 +304,8 @@ public interface FeIcebergTable extends FeFsTable {
           icebergTable.getDefaultPartitionSpecId());
 
       for (Map.Entry<String, HdfsPartition.FileDescriptor> entry :
-          icebergTable.getPathMD5ToFileDescMap().entrySet()) {
-        tIcebergTable.putToPath_md5_to_file_descriptor(entry.getKey(),
+          icebergTable.getPathHashToFileDescMap().entrySet()) {
+        tIcebergTable.putToPath_hash_to_file_descriptor(entry.getKey(),
           entry.getValue().toThrift());
       }
       return tIcebergTable;
@@ -356,7 +356,7 @@ public interface FeIcebergTable extends FeFsTable {
         HdfsPartition.FileDescriptor fileDesc = getFileDescriptor(
             new Path(file.path().toString()),
             new Path(table.getIcebergTableLocation()), table.getHostIndex());
-        fileDescMap.put(IcebergUtil.getDataFileMD5(file), fileDesc);
+        fileDescMap.put(IcebergUtil.getDataFilePathHash(file), fileDesc);
       }
       return fileDescMap;
     }
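
[Editor's note: the loop above keys each file descriptor by the Murmur3
digest of its path. A sketch of that pattern, with plain strings standing in
for HdfsPartition.FileDescriptor and made-up paths:

    import com.google.common.hash.Hashing;
    import java.util.HashMap;
    import java.util.Map;

    public class FileDescMapSketch {
      static String pathHash(String path) {
        return Hashing.murmur3_128().newHasher()
            .putUnencodedChars(path).hash().toString();
      }

      public static void main(String[] args) {
        Map<String, String> fileDescMap = new HashMap<>();
        for (String p : new String[] {"/tbl/data/a.parquet", "/tbl/data/b.parquet"}) {
          // Key: 128-bit Murmur3 hex digest of the path; value: descriptor.
          fileDescMap.put(pathHash(p), "fd-for-" + p);
        }
        System.out.println(fileDescMap.size()); // 2
      }
    }
]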
diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
index 44f6d10..0845c4f 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
@@ -99,8 +99,8 @@ public class IcebergTable extends Table implements FeIcebergTable {
   // Schema of the iceberg table.
   private org.apache.iceberg.Schema icebergSchema_;
 
-  // Key is the DataFile md5, value is FileDescriptor transformed from DataFile
-  private Map<String, FileDescriptor> pathMD5ToFileDescMap_;
+  // Key is the DataFile path hash, value is FileDescriptor transformed from DataFile
+  private Map<String, FileDescriptor> pathHashToFileDescMap_;
 
   // Treat iceberg table as a non-partitioned hdfs table in backend
   private HdfsTable hdfsTable_;
@@ -194,8 +194,8 @@ public class IcebergTable extends Table implements FeIcebergTable {
   public int getDefaultPartitionSpecId() { return defaultPartitionSpecId_; }
 
   @Override
-  public Map<String, FileDescriptor> getPathMD5ToFileDescMap() {
-    return pathMD5ToFileDescMap_;
+  public Map<String, FileDescriptor> getPathHashToFileDescMap() {
+    return pathHashToFileDescMap_;
   }
 
   @Override
@@ -231,7 +231,7 @@ public class IcebergTable extends Table implements FeIcebergTable {
         // Loading hdfs table after loaded schema from Iceberg,
         // in case we create external Iceberg table skipping column info in sql.
         hdfsTable_.load(false, msClient, msTable_, true, true, false, null, reason);
-        pathMD5ToFileDescMap_ = Utils.loadAllPartition(this);
+        pathHashToFileDescMap_ = Utils.loadAllPartition(this);
         loadAllColumnStats(msClient);
       } catch (Exception e) {
         throw new TableLoadingException("Error loading metadata for Iceberg table " +
@@ -298,8 +298,8 @@ public class IcebergTable extends Table implements FeIcebergTable {
     icebergTableLocation_ = ticeberg.getTable_location();
     partitionSpecs_ = loadPartitionBySpecsFromThrift(ticeberg.getPartition_spec());
     defaultPartitionSpecId_ = ticeberg.getDefault_partition_spec_id();
-    pathMD5ToFileDescMap_ = loadFileDescFromThrift(
-        ticeberg.getPath_md5_to_file_descriptor());
+    pathHashToFileDescMap_ = loadFileDescFromThrift(
+        ticeberg.getPath_hash_to_file_descriptor());
     hdfsTable_.loadFromThrift(thriftTable);
   }
 
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java
index 69b46f8..fa7e1f0 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalIcebergTable.java
@@ -53,7 +53,7 @@ public class LocalIcebergTable extends LocalTable implements FeIcebergTable {
   private TIcebergFileFormat icebergFileFormat_;
   private List<IcebergPartitionSpec> partitionSpecs_;
   private int defaultPartitionSpecId_;
-  private Map<String, FileDescriptor> pathMD5ToFileDescMap_;
+  private Map<String, FileDescriptor> pathHashToFileDescMap_;
   private LocalFsTable localFsTable_;
 
   static LocalTable loadFromIceberg(LocalDb db, Table msTable,
@@ -86,7 +86,7 @@ public class LocalIcebergTable extends LocalTable implements FeIcebergTable {
     defaultPartitionSpecId_ = metadata.defaultSpecId();
     localFsTable_ = LocalFsTable.load(db, msTable, ref);
     try {
-      pathMD5ToFileDescMap_ = Utils.loadAllPartition(this);
+      pathHashToFileDescMap_ = Utils.loadAllPartition(this);
     } catch (IOException e) {
       throw new TableLoadingException(e.getMessage());
     }
@@ -134,8 +134,8 @@ public class LocalIcebergTable extends LocalTable implements FeIcebergTable {
   }
 
   @Override
-  public Map<String, FileDescriptor> getPathMD5ToFileDescMap() {
-    return pathMD5ToFileDescMap_;
+  public Map<String, FileDescriptor> getPathHashToFileDescMap() {
+    return pathHashToFileDescMap_;
   }
 
   @Override
diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
index 0be0ebb..ec1e9f3 100644
--- a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
@@ -94,8 +94,8 @@ public class IcebergScanNode extends HdfsScanNode {
 
     List<FileDescriptor> fileDescList = new ArrayList<>();
     for (DataFile dataFile : dataFileList) {
-      FileDescriptor fileDesc = icebergTable_.getPathMD5ToFileDescMap()
-          .get(IcebergUtil.getDataFileMD5(dataFile));
+      FileDescriptor fileDesc = icebergTable_.getPathHashToFileDescMap()
+          .get(IcebergUtil.getDataFilePathHash(dataFile));
       fileDescList.add(fileDesc);
       //Todo: how to deal with iceberg metadata update, we need to invalidate manually now
       if (fileDesc == null) {
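
[Editor's note: the scan node recomputes the same digest from the Iceberg
DataFile path and probes the map; a null hit means the cached catalog
metadata is out of sync with the Iceberg snapshot, which is what the TODO
above refers to. A minimal sketch of that lookup, again with made-up paths
and string stand-ins for the file descriptors:

    import com.google.common.hash.Hashing;
    import java.util.Map;

    public class ScanLookupSketch {
      static String pathHash(String path) {
        return Hashing.murmur3_128().newHasher()
            .putUnencodedChars(path).hash().toString();
      }

      public static void main(String[] args) {
        Map<String, String> fileDescMap =
            Map.of(pathHash("/tbl/data/a.parquet"), "fd-a");

        // Recompute the digest for a file returned by the Iceberg scan.
        String fd = fileDescMap.get(pathHash("/tbl/data/a.parquet"));
        // A null here would indicate stale cached metadata, which currently
        // requires a manual invalidate.
        System.out.println(fd); // fd-a
      }
    }
]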
diff --git a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
index 6b206c7..af4ac31 100644
--- a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
+++ b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
@@ -342,10 +342,10 @@ public class IcebergUtil {
   }
 
   /**
-   * Use DataFile path to construct md5 as map key, cached in memory
+   * Use DataFile path to generate 128-bit Murmur3 hash as map key, cached in memory
    */
-  public static String getDataFileMD5(DataFile dataFile) {
-    Hasher hasher = Hashing.md5().newHasher();
+  public static String getDataFilePathHash(DataFile dataFile) {
+    Hasher hasher = Hashing.murmur3_128().newHasher();
     hasher.putUnencodedChars(dataFile.path().toString());
     return hasher.hash().toString();
   }
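
[Editor's note: one property worth noting about the function above:
putUnencodedChars hashes the string's UTF-16 code units directly, so every
component that builds or probes the map must compute the key the same way.
A small sketch checking that the digest is deterministic and 32 hex
characters long:

    import com.google.common.hash.Hashing;

    public class DigestStabilitySketch {
      public static void main(String[] args) {
        String path = "/tbl/data/part-00000.parquet";
        // putUnencodedChars consumes the UTF-16 code units of the string,
        // so identical inputs always yield identical digests.
        String a = Hashing.murmur3_128().newHasher()
            .putUnencodedChars(path).hash().toString();
        String b = Hashing.murmur3_128().newHasher()
            .putUnencodedChars(path).hash().toString();
        System.out.println(a.equals(b) + " " + a.length()); // true 32
      }
    }
]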