You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vi...@apache.org on 2019/01/02 19:30:54 UTC

hive git commit: HIVE-21040 : msck does unnecessary file listing at last level of directory tree (Vihang Karajgaonkar, reviewed by Prasanth Jayachandran)

Repository: hive
Updated Branches:
  refs/heads/branch-3 490041dd3 -> 4c73511f3


HIVE-21040 : msck does unnecessary file listing at last level of directory tree (Vihang Karajgaonkar, reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4c73511f
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4c73511f
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4c73511f

Branch: refs/heads/branch-3
Commit: 4c73511f3f59144fb8cc306117a1bf1f3d6dd071
Parents: 490041d
Author: Vihang Karajgaonkar <vi...@apache.org>
Authored: Mon Dec 17 17:13:56 2018 -0800
Committer: Vihang Karajgaonkar <vi...@apache.org>
Committed: Wed Jan 2 11:16:18 2019 -0800

----------------------------------------------------------------------
 .../hive/ql/metadata/HiveMetaStoreChecker.java  |  18 ++--
 .../ql/metadata/TestHiveMetaStoreChecker.java   | 106 +++++++++++++++++++
 2 files changed, 116 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
index 598bb2e..9339094 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
@@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.log.PerfLogger;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -470,10 +471,13 @@ public class HiveMetaStoreChecker {
         throws IOException, HiveException, InterruptedException {
       final Path currentPath = pd.p;
       final int currentDepth = pd.depth;
+      if (currentDepth == partColNames.size()) {
+        return currentPath;
+      }
       FileStatus[] fileStatuses = fs.listStatus(currentPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
       // found no files under a sub-directory under table base path; it is possible that the table
       // is empty and hence there are no partition sub-directories created under base path
-      if (fileStatuses.length == 0 && currentDepth > 0 && currentDepth < partColNames.size()) {
+      if (fileStatuses.length == 0 && currentDepth > 0) {
         // since maxDepth is not yet reached, we are missing partition
         // columns in currentPath
         logOrThrowExceptionWithMsg(
@@ -481,12 +485,12 @@ public class HiveMetaStoreChecker {
       } else {
         // found files under currentPath add them to the queue if it is a directory
         for (FileStatus fileStatus : fileStatuses) {
-          if (!fileStatus.isDirectory() && currentDepth < partColNames.size()) {
+          if (!fileStatus.isDirectory()) {
             // found a file at depth which is less than number of partition keys
             logOrThrowExceptionWithMsg(
                 "MSCK finds a file rather than a directory when it searches for "
                     + fileStatus.getPath().toString());
-          } else if (fileStatus.isDirectory() && currentDepth < partColNames.size()) {
+          } else {
             // found a sub-directory at a depth less than number of partition keys
             // validate if the partition directory name matches with the corresponding
             // partition colName at currentDepth
@@ -503,9 +507,6 @@ public class HiveMetaStoreChecker {
             }
           }
         }
-        if (currentDepth == partColNames.size()) {
-          return currentPath;
-        }
       }
       return null;
     }
@@ -528,7 +529,8 @@ public class HiveMetaStoreChecker {
     }
   }
 
-  private void checkPartitionDirs(final ExecutorService executor,
+  @VisibleForTesting
+  void checkPartitionDirs(final ExecutorService executor,
       final Path basePath, final Set<Path> result,
       final FileSystem fs, final List<String> partColNames) throws HiveException {
     try {
@@ -559,7 +561,7 @@ public class HiveMetaStoreChecker {
         nextLevel = tempQueue;
       }
     } catch (InterruptedException | ExecutionException e) {
-      LOG.error(e.getMessage());
+      LOG.error("Exception received while listing partition directories", e);
       executor.shutdownNow();
       throw new HiveException(e.getCause());
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
index ff411f6..46f6ad8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
@@ -18,16 +18,29 @@
 package org.apache.hadoop.hive.ql.metadata;
 
 import static org.junit.Assert.*;
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.AlreadyExistsException;
 import org.apache.hadoop.hive.metastore.api.Database;
@@ -39,10 +52,12 @@ import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.thrift.TException;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
 import com.google.common.collect.Lists;
+import org.mockito.Mockito;
 
 /**
  * TestHiveMetaStoreChecker.
@@ -575,6 +590,94 @@ public class TestHiveMetaStoreChecker {
     }
     assertTrue("Expected HiveException", exception!=null && exception instanceof HiveException);
   }
+
+  /**
+   * Test counts the number of listStatus calls in the msck core method of
+   * listing sub-directories. This is important to check since it unnecessary
+   * listStatus calls could cause performance degradation in remote filesystems
+   * like S3. The test creates a mock FileSystem object and a mock directory structure
+   * to simulate a table which has 2 partition keys and 2 partition values at each level.
+   * In the end it counts how many times the listStatus is called on the mock filesystem
+   * and confirm its equal to the current theoretical value.
+   *
+   * @throws IOException
+   * @throws HiveException
+   */
+  @Test
+  public void testNumberOfListStatusCalls() throws IOException, HiveException {
+    LocalFileSystem mockFs = Mockito.mock(LocalFileSystem.class);
+    Path tableLocation = new Path("mock:///tmp/testTable");
+
+    Path countryUS = new Path(tableLocation, "country=US");
+    Path countryIND = new Path(tableLocation, "country=IND");
+
+    Path cityPA = new Path(countryUS, "city=PA");
+    Path citySF = new Path(countryUS, "city=SF");
+    Path cityBOM = new Path(countryIND, "city=BOM");
+    Path cityDEL = new Path(countryIND, "city=DEL");
+
+    Path paData = new Path(cityPA, "datafile");
+    Path sfData = new Path(citySF, "datafile");
+    Path bomData = new Path(cityBOM, "datafile");
+    Path delData = new Path(cityDEL, "datafile");
+
+    //level 1 listing
+    FileStatus[] allCountries = getMockFileStatus(countryUS, countryIND);
+    when(mockFs.listStatus(tableLocation, FileUtils.HIDDEN_FILES_PATH_FILTER))
+        .thenReturn(allCountries);
+
+    //level 2 listing
+    FileStatus[] filesInUS = getMockFileStatus(cityPA, citySF);
+    when(mockFs.listStatus(countryUS, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInUS);
+
+    FileStatus[] filesInInd = getMockFileStatus(cityBOM, cityDEL);
+    when(mockFs.listStatus(countryIND, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInInd);
+
+    //level 3 listing
+    FileStatus[] paFiles = getMockFileStatus(paData);
+    when(mockFs.listStatus(cityPA, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(paFiles);
+
+    FileStatus[] sfFiles = getMockFileStatus(sfData);
+    when(mockFs.listStatus(citySF, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(sfFiles);
+
+    FileStatus[] bomFiles = getMockFileStatus(bomData);
+    when(mockFs.listStatus(cityBOM, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(bomFiles);
+
+    FileStatus[] delFiles = getMockFileStatus(delData);
+    when(mockFs.listStatus(cityDEL, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(delFiles);
+
+    HiveMetaStoreChecker checker = new HiveMetaStoreChecker(hive);
+    ExecutorService executorService = Executors.newFixedThreadPool(2);
+    Set<Path> result = new HashSet<>();
+    checker.checkPartitionDirs(executorService, tableLocation, result, mockFs,
+        Arrays.asList("country", "city"));
+    // if there are n partition columns, then number of times listStatus should be called
+    // must be equal
+    // to (numDirsAtLevel1) + (numDirsAtLevel2) + ... + (numDirAtLeveln-1)
+    // in this case it should 1 (table level) + 2 (US, IND)
+    verify(mockFs, times(3)).listStatus(any(Path.class), any(PathFilter.class));
+    Assert.assertEquals("msck should have found 4 unknown partitions", 4, result.size());
+  }
+
+  private FileStatus[] getMockFileStatus(Path... paths) throws IOException {
+    FileStatus[] result = new FileStatus[paths.length];
+    int i = 0;
+    for (Path p : paths) {
+      result[i++] = createMockFileStatus(p);
+    }
+    return result;
+  }
+
+  private FileStatus createMockFileStatus(Path p) {
+    FileStatus mock = Mockito.mock(FileStatus.class);
+    when(mock.getPath()).thenReturn(p);
+    if (p.toString().contains("datafile")) {
+      when(mock.isDirectory()).thenReturn(false);
+    } else {
+      when(mock.isDirectory()).thenReturn(true);
+    }
+    return mock;
+  }
   /**
    * Creates a test partitioned table with the required level of nested partitions and number of
    * partitions
@@ -702,6 +805,9 @@ public class TestHiveMetaStoreChecker {
   private void createDirectory(String partPath) throws IOException {
     Path part = new Path(partPath);
     fs.mkdirs(part);
+    // create files under partitions to simulate real partitions
+    fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata1"));
+    fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata2"));
     fs.deleteOnExit(part);
   }
 }