You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vi...@apache.org on 2019/01/02 19:30:54 UTC
hive git commit: HIVE-21040 : msck does unnecessary file listing at
last level of directory tree (Vihang Karajgaonkar,
reviewed by Prasanth Jayachandran)
Repository: hive
Updated Branches:
refs/heads/branch-3 490041dd3 -> 4c73511f3
HIVE-21040 : msck does unnecessary file listing at last level of directory tree (Vihang Karajgaonkar, reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4c73511f
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4c73511f
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4c73511f
Branch: refs/heads/branch-3
Commit: 4c73511f3f59144fb8cc306117a1bf1f3d6dd071
Parents: 490041d
Author: Vihang Karajgaonkar <vi...@apache.org>
Authored: Mon Dec 17 17:13:56 2018 -0800
Committer: Vihang Karajgaonkar <vi...@apache.org>
Committed: Wed Jan 2 11:16:18 2019 -0800
----------------------------------------------------------------------
.../hive/ql/metadata/HiveMetaStoreChecker.java | 18 ++--
.../ql/metadata/TestHiveMetaStoreChecker.java | 106 +++++++++++++++++++
2 files changed, 116 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
index 598bb2e..9339094 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreChecker.java
@@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -470,10 +471,13 @@ public class HiveMetaStoreChecker {
throws IOException, HiveException, InterruptedException {
final Path currentPath = pd.p;
final int currentDepth = pd.depth;
+ if (currentDepth == partColNames.size()) {
+ return currentPath;
+ }
FileStatus[] fileStatuses = fs.listStatus(currentPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
// found no files under a sub-directory under table base path; it is possible that the table
// is empty and hence there are no partition sub-directories created under base path
- if (fileStatuses.length == 0 && currentDepth > 0 && currentDepth < partColNames.size()) {
+ if (fileStatuses.length == 0 && currentDepth > 0) {
// since maxDepth is not yet reached, we are missing partition
// columns in currentPath
logOrThrowExceptionWithMsg(
@@ -481,12 +485,12 @@ public class HiveMetaStoreChecker {
} else {
// found files under currentPath add them to the queue if it is a directory
for (FileStatus fileStatus : fileStatuses) {
- if (!fileStatus.isDirectory() && currentDepth < partColNames.size()) {
+ if (!fileStatus.isDirectory()) {
// found a file at depth which is less than number of partition keys
logOrThrowExceptionWithMsg(
"MSCK finds a file rather than a directory when it searches for "
+ fileStatus.getPath().toString());
- } else if (fileStatus.isDirectory() && currentDepth < partColNames.size()) {
+ } else {
// found a sub-directory at a depth less than number of partition keys
// validate if the partition directory name matches with the corresponding
// partition colName at currentDepth
@@ -503,9 +507,6 @@ public class HiveMetaStoreChecker {
}
}
}
- if (currentDepth == partColNames.size()) {
- return currentPath;
- }
}
return null;
}
@@ -528,7 +529,8 @@ public class HiveMetaStoreChecker {
}
}
- private void checkPartitionDirs(final ExecutorService executor,
+ @VisibleForTesting
+ void checkPartitionDirs(final ExecutorService executor,
final Path basePath, final Set<Path> result,
final FileSystem fs, final List<String> partColNames) throws HiveException {
try {
@@ -559,7 +561,7 @@ public class HiveMetaStoreChecker {
nextLevel = tempQueue;
}
} catch (InterruptedException | ExecutionException e) {
- LOG.error(e.getMessage());
+ LOG.error("Exception received while listing partition directories", e);
executor.shutdownNow();
throw new HiveException(e.getCause());
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4c73511f/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
index ff411f6..46f6ad8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
@@ -18,16 +18,29 @@
package org.apache.hadoop.hive.ql.metadata;
import static org.junit.Assert.*;
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.AlreadyExistsException;
import org.apache.hadoop.hive.metastore.api.Database;
@@ -39,10 +52,12 @@ import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.thrift.TException;
import org.junit.After;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
+import org.mockito.Mockito;
/**
* TestHiveMetaStoreChecker.
@@ -575,6 +590,94 @@ public class TestHiveMetaStoreChecker {
}
assertTrue("Expected HiveException", exception!=null && exception instanceof HiveException);
}
+
+ /**
+ * Test counts the number of listStatus calls in the msck core method of
+ * listing sub-directories. This is important to check since it unnecessary
+ * listStatus calls could cause performance degradation in remote filesystems
+ * like S3. The test creates a mock FileSystem object and a mock directory structure
+ * to simulate a table which has 2 partition keys and 2 partition values at each level.
+ * In the end it counts how many times the listStatus is called on the mock filesystem
+ * and confirm its equal to the current theoretical value.
+ *
+ * @throws IOException
+ * @throws HiveException
+ */
+ @Test
+ public void testNumberOfListStatusCalls() throws IOException, HiveException {
+ LocalFileSystem mockFs = Mockito.mock(LocalFileSystem.class);
+ Path tableLocation = new Path("mock:///tmp/testTable");
+
+ Path countryUS = new Path(tableLocation, "country=US");
+ Path countryIND = new Path(tableLocation, "country=IND");
+
+ Path cityPA = new Path(countryUS, "city=PA");
+ Path citySF = new Path(countryUS, "city=SF");
+ Path cityBOM = new Path(countryIND, "city=BOM");
+ Path cityDEL = new Path(countryIND, "city=DEL");
+
+ Path paData = new Path(cityPA, "datafile");
+ Path sfData = new Path(citySF, "datafile");
+ Path bomData = new Path(cityBOM, "datafile");
+ Path delData = new Path(cityDEL, "datafile");
+
+ //level 1 listing
+ FileStatus[] allCountries = getMockFileStatus(countryUS, countryIND);
+ when(mockFs.listStatus(tableLocation, FileUtils.HIDDEN_FILES_PATH_FILTER))
+ .thenReturn(allCountries);
+
+ //level 2 listing
+ FileStatus[] filesInUS = getMockFileStatus(cityPA, citySF);
+ when(mockFs.listStatus(countryUS, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInUS);
+
+ FileStatus[] filesInInd = getMockFileStatus(cityBOM, cityDEL);
+ when(mockFs.listStatus(countryIND, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInInd);
+
+ //level 3 listing
+ FileStatus[] paFiles = getMockFileStatus(paData);
+ when(mockFs.listStatus(cityPA, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(paFiles);
+
+ FileStatus[] sfFiles = getMockFileStatus(sfData);
+ when(mockFs.listStatus(citySF, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(sfFiles);
+
+ FileStatus[] bomFiles = getMockFileStatus(bomData);
+ when(mockFs.listStatus(cityBOM, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(bomFiles);
+
+ FileStatus[] delFiles = getMockFileStatus(delData);
+ when(mockFs.listStatus(cityDEL, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(delFiles);
+
+ HiveMetaStoreChecker checker = new HiveMetaStoreChecker(hive);
+ ExecutorService executorService = Executors.newFixedThreadPool(2);
+ Set<Path> result = new HashSet<>();
+ checker.checkPartitionDirs(executorService, tableLocation, result, mockFs,
+ Arrays.asList("country", "city"));
+ // if there are n partition columns, then number of times listStatus should be called
+ // must be equal
+ // to (numDirsAtLevel1) + (numDirsAtLevel2) + ... + (numDirAtLeveln-1)
+ // in this case it should 1 (table level) + 2 (US, IND)
+ verify(mockFs, times(3)).listStatus(any(Path.class), any(PathFilter.class));
+ Assert.assertEquals("msck should have found 4 unknown partitions", 4, result.size());
+ }
+
+ private FileStatus[] getMockFileStatus(Path... paths) throws IOException {
+ FileStatus[] result = new FileStatus[paths.length];
+ int i = 0;
+ for (Path p : paths) {
+ result[i++] = createMockFileStatus(p);
+ }
+ return result;
+ }
+
+ private FileStatus createMockFileStatus(Path p) {
+ FileStatus mock = Mockito.mock(FileStatus.class);
+ when(mock.getPath()).thenReturn(p);
+ if (p.toString().contains("datafile")) {
+ when(mock.isDirectory()).thenReturn(false);
+ } else {
+ when(mock.isDirectory()).thenReturn(true);
+ }
+ return mock;
+ }
/**
* Creates a test partitioned table with the required level of nested partitions and number of
* partitions
@@ -702,6 +805,9 @@ public class TestHiveMetaStoreChecker {
private void createDirectory(String partPath) throws IOException {
Path part = new Path(partPath);
fs.mkdirs(part);
+ // create files under partitions to simulate real partitions
+ fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata1"));
+ fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata2"));
fs.deleteOnExit(part);
}
}