Posted to commits@hive.apache.org by an...@apache.org on 2021/01/05 11:09:00 UTC
[hive] branch master updated: HIVE-24526: Get grouped locations of
external table data using metatool. (Arko Sharma,
reviewed by Pravin Kumar Sinha )
This is an automated email from the ASF dual-hosted git repository.
anishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new d5ea2f3 HIVE-24526: Get grouped locations of external table data using metatool. (Arko Sharma, reviewed by Pravin Kumar Sinha )
d5ea2f3 is described below
commit d5ea2f3bb81cd992ce2cf6ad1da23fc4db67c471
Author: Anishek Agarwal <an...@gmail.com>
AuthorDate: Tue Jan 5 16:38:42 2021 +0530
HIVE-24526: Get grouped locations of external table data using metatool. (Arko Sharma, reviewed by Pravin Kumar Sinha )
---
.../metastore/tools/metatool/TestHiveMetaTool.java | 312 +++++++++-
.../metastore/tools/metatool/HiveMetaTool.java | 4 +
.../tools/metatool/HiveMetaToolCommandLine.java | 59 +-
.../tools/metatool/MetaToolTaskDiffExtTblLocs.java | 161 +++++
.../tools/metatool/MetaToolTaskListExtTblLocs.java | 668 +++++++++++++++++++++
.../metatool/TestHiveMetaToolCommandLine.java | 30 +-
.../metatool/TestMetaToolTaskListExtTblLocs.java | 291 +++++++++
7 files changed, 1518 insertions(+), 7 deletions(-)
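The two metatool tasks introduced here are exercised end-to-end by the tests in this patch. A minimal, hedged sketch of the programmatic invocation, mirroring how TestHiveMetaTool drives it below (the database pattern and the input/output paths are illustrative, not taken from the commit):

    // Group external table data locations for databases matching "db*" and
    // write the resulting JSON listing into /tmp/extTblOut (illustrative path).
    HiveMetaTool.main(new String[] {"-listExtTblLocs", "db*", "/tmp/extTblOut"});

    // Diff two files previously produced by -listExtTblLocs and write the
    // result into /tmp/extTblDiff (illustrative paths).
    HiveMetaTool.main(new String[] {"-diffExtTblLocs", "/tmp/out1/listing1", "/tmp/out2/listing2", "/tmp/extTblDiff"});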
diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java
index 81b7ff0..22e3fe0 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java
@@ -19,15 +19,24 @@
package org.apache.hadoop.hive.metastore.tools.metatool;
import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
-
+import org.json.JSONObject;
+import org.json.JSONArray;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Database;
@@ -35,12 +44,25 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil;
+import org.apache.hadoop.hive.metastore.txn.TxnStore;
+import org.apache.hadoop.hive.metastore.txn.TxnUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.hive.ql.processors.CommandProcessorException;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.QueryState;
+import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.AvroTableProperties;
import org.apache.hadoop.util.StringUtils;
import org.apache.thrift.TException;
+import org.junit.Assert;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertEquals;
+import com.google.gson.JsonParser;
+import org.json.JSONObject;
import org.junit.Before;
import org.junit.After;
import org.junit.Test;
@@ -57,7 +79,12 @@ public class TestHiveMetaTool {
private HiveMetaStoreClient client;
private OutputStream os;
-
+ protected Driver d;
+ protected TxnStore txnHandler;
+ private static HiveConf hiveConf;
+ private static final String TEST_DATA_DIR = new File(System.getProperty("java.io.tmpdir") +
+ File.separator + TestHiveMetaTool.class.getCanonicalName() + "-" + System.currentTimeMillis()
+ ).getPath().replaceAll("\\\\", "/");
@Before
public void setUp() throws Exception {
@@ -66,19 +93,60 @@ public class TestHiveMetaTool {
os = new ByteArrayOutputStream();
System.setOut(new PrintStream(os));
- HiveConf hiveConf = new HiveConf(HiveMetaTool.class);
+ hiveConf = new HiveConf(HiveMetaTool.class);
client = new HiveMetaStoreClient(hiveConf);
createDatabase();
createTable();
client.close();
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ hiveConf.set("mapred.local.dir", workDir + File.separator + this.getClass().getSimpleName()
+ + File.separator + "mapred" + File.separator + "local");
+ hiveConf.set("mapred.system.dir", workDir + File.separator + this.getClass().getSimpleName()
+ + File.separator + "mapred" + File.separator + "system");
+ hiveConf.set("mapreduce.jobtracker.staging.root.dir", workDir + File.separator + this.getClass().getSimpleName()
+ + File.separator + "mapred" + File.separator + "staging");
+ hiveConf.set("mapred.temp.dir", workDir + File.separator + this.getClass().getSimpleName()
+ + File.separator + "mapred" + File.separator + "temp");
+ hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
+ hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
+ hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, getWarehouseDir());
+ hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName());
+ hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER,
+ "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
+ hiveConf.setBoolVar(HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK, true);
+ HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.MERGE_SPLIT_UPDATE, true);
+ hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSCOLAUTOGATHER, false);
+ hiveConf.setBoolean("mapred.input.dir.recursive", true);
+ TestTxnDbUtil.setConfValues(hiveConf);
+ txnHandler = TxnUtils.getTxnStore(hiveConf);
+ TestTxnDbUtil.prepDb(hiveConf);
+ File f = new File(getWarehouseDir());
+ if (f.exists()) {
+ FileUtil.fullyDelete(f);
+ }
+ if (!(new File(getWarehouseDir()).mkdirs())) {
+ throw new RuntimeException("Could not create " + getWarehouseDir());
+ }
+ SessionState ss = SessionState.start(hiveConf);
+ ss.applyAuthorizationPolicy();
+ d = new Driver(new QueryState.Builder().withHiveConf(hiveConf).nonIsolated().build());
+ d.setMaxRows(10000);
} catch (Exception e) {
System.err.println("Unable to setup the hive metatool test");
System.err.println(StringUtils.stringifyException(e));
throw new Exception(e);
}
}
+ protected String getWarehouseDir() {
+ return getTestDataDir() + "/warehouse";
+ }
+
+ private String getTestDataDir() {
+ return TEST_DATA_DIR;
+ }
private void createDatabase() throws Exception {
if (client.getAllDatabases().contains(DB_NAME)) {
@@ -142,17 +210,255 @@ public class TestHiveMetaTool {
checkAvroSchemaURLProps(AVRO_URI);
}
+ /*
+ * Tests -listExtTblLocs option on various input combinations.
+ */
+ @Test
+ public void testListExtTblLocs() throws Exception {
+ String extTblLocation = getTestDataDir() + "/ext";
+ String outLocation = getTestDataDir() + "/extTblOutput/";
+ Configuration conf = MetastoreConf.newMetastoreConf();
+ MetastoreConf.setVar(conf, MetastoreConf.ConfVars.WAREHOUSE_EXTERNAL, getWarehouseDir());
+ MetaToolTaskListExtTblLocs.msConf = conf;
+
+ // Case 1 : Check default locations
+ // Inputs : db1, db2 in default locations, db3 in custom location
+ // Expected outputs: default locations for db1, db2 and custom location for db3 after aggregation
+ runStatementOnDriver("create database db1");
+ runStatementOnDriver("create database db2");
+ runStatementOnDriver("create database db3");
+ runStatementOnDriver("create external table db1.ext(a int) partitioned by (p int)");
+ runStatementOnDriver("create external table db2.ext(a int) partitioned by (p int)");
+ runStatementOnDriver("create external table db3.ext(a int) partitioned by (p int) " +
+ "location '" + getTestDataDir() + "/ext/tblLoc'");
+ runStatementOnDriver("alter table db3.ext add partition(p = 0) location '" + getTestDataDir() + "/part'" );
+ runStatementOnDriver("alter table db3.ext add partition(p = 1) location '" + getTestDataDir() + "/part'" );
+ JSONObject outJS = getListExtTblLocs("db*", outLocation);
+ //confirm default locations
+ Set<String> outLocationSet = outJS.keySet();
+ String expectedOutLoc1 = getAbsolutePath(getWarehouseDir() + "/db1.db");
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc1));
+ Assert.assertEquals(outLocationSet.size(), 4);
+ JSONArray outArr = outJS.getJSONArray(expectedOutLoc1);
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("db1.ext"));
+ String expectedOutLoc2 = getAbsolutePath(getWarehouseDir() + "/db2.db");
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc2));
+ outArr = outJS.getJSONArray(expectedOutLoc2);
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("db2.ext"));
+ String expectedOutLoc3 = getAbsolutePath(getTestDataDir() + "/part");
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc3));
+ outArr = outJS.getJSONArray(expectedOutLoc3);
+ Assert.assertEquals(outArr.length(), 2);
+ Assert.assertTrue(outArr.getString(0).equals("db3.ext.p=0"));
+ Assert.assertTrue(outArr.getString(1).equals("db3.ext.p=1"));
+ String expectedOutLoc4 = getAbsolutePath(getTestDataDir() + "/ext/tblLoc");
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc4));
+ outArr = outJS.getJSONArray(expectedOutLoc4);
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("db3.ext p(0/2)"));
+
+
+ // Case 2 : Check with special chars in partition-names : including quotes, timestamp formats, spaces, backslash etc.
+ // Also checks count of partitions in tbl-location.
+ // inputs (default database)
+ // ../ext/t1 - table1 location containing 3/5 partitions
+ // ../ext/t2 - table2 location containing 2/4 partitions
+ // ../ext/dir1/dir2/dir3 - 2 partitions of table1, 1 partition of table2, table loc of table3 with 0 partitions.
+ // ../ext - partitions of table3
+ // expected output : [../ext/t1, ../ext/t2, ../ext/dir1/dir2/dir3/t1_parts (2 partitions), ../ext/dir1/dir2/dir3/t2_parts (2 partitions),
+ // .../ext/dir1/dir2/dir3/t3 (0 partitions), ../ext/t3_parts (3 partitions) ]
+ // Doesn't contain default database location as there are no entities in default location in this case,
+ // all data is under some custom location (../ext)
+ runStatementOnDriver("drop table ext");
+ runStatementOnDriver("create external table ext(a int) partitioned by (p varchar(3)) " +
+ "location '" + getTestDataDir() + "/ext/t1'");
+ runStatementOnDriver("create external table ext2(a int) partitioned by (flt string, dbl string) " +
+ "location '" + getTestDataDir() + "/ext/t2'");
+ runStatementOnDriver("create external table ext3(a int) partitioned by (dt string, timeSt string) "
+ + "location '" + getTestDataDir() + "/ext/dir1/dir2/dir3/t3'");
+ runStatementOnDriver("alter table ext add partition(p = 'A')");
+ runStatementOnDriver("alter table ext add partition(p = 'B')");
+ runStatementOnDriver("alter table ext add partition(p = 'UK')" );
+ runStatementOnDriver("alter table ext2 add partition(flt = '0.0', dbl = '0')");
+ runStatementOnDriver("alter table ext2 add partition(flt = '0.1', dbl = '1.1')");
+ runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-01', timeSt = '23:23:23') location '"
+ + getTestDataDir() + "/ext/t3_parts'" );
+ runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-02', timeSt = '22:22:22') location '"
+ + getTestDataDir() + "/ext/t3_parts'" );
+ runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-03', timeSt = '21:21:21.1234') location '"
+ + getTestDataDir() + "/ext/t3_parts'" );
+ runStatementOnDriver("alter table ext add partition(p = \'A\\\\\') location '"
+ + getTestDataDir() + "/ext/dir1/dir2/dir3/t1_parts'" );
+ runStatementOnDriver("alter table ext add partition(p = \' A\"\') location '"
+ + getTestDataDir() + "/ext/dir1/dir2/dir3/t1_parts'" );
+ runStatementOnDriver("alter table ext2 add partition(flt = '0.1', dbl='3.22') location '"
+ + getTestDataDir() + "/ext/dir1/dir2/dir3/t2_parts'");
+ runStatementOnDriver("alter table ext2 add partition(flt = '0.22', dbl = '2.22') location '"
+ + getTestDataDir() + "/ext/dir1/dir2/dir3/t2_parts'");
+
+
+ outJS = getListExtTblLocs("default", outLocation);
+ expectedOutLoc1 = getAbsolutePath(extTblLocation + "/t1");
+ expectedOutLoc2 = getAbsolutePath(extTblLocation + "/t2");
+ expectedOutLoc3 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t1_parts");
+ expectedOutLoc4 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t2_parts");
+ String expectedOutLoc5 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t3");
+ String expectedOutLoc6 = getAbsolutePath(extTblLocation + "/t3_parts");
+
+ outLocationSet = outJS.keySet();
+ Assert.assertEquals(outLocationSet.size(), 6);
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc1));
+ outArr = outJS.getJSONArray(expectedOutLoc1); //t1
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext p(3/5)"));
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc2));
+ outArr = outJS.getJSONArray(expectedOutLoc2); //t2
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext2 p(2/4)"));
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc3)); //t1_parts
+ outArr = outJS.getJSONArray(expectedOutLoc3);
+ Assert.assertEquals(outArr.length(), 2);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext.p= A%22")); //spaces, quotes
+ Assert.assertTrue(outArr.getString(1).equals("default.ext.p=A%5C")); //backslash
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc4)); //t2_parts
+ outArr = outJS.getJSONArray(expectedOutLoc4);
+ Assert.assertEquals(outArr.length(), 2);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext2.flt=0.1/dbl=3.22")); //periods, slash
+ Assert.assertTrue(outArr.getString(1).equals("default.ext2.flt=0.22/dbl=2.22"));
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc5)); //t3
+ outArr = outJS.getJSONArray(expectedOutLoc5);
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext3 p(0/3)"));
+ Assert.assertTrue(outLocationSet.contains(expectedOutLoc6)); //t3_parts
+ outArr = outJS.getJSONArray(expectedOutLoc6);
+ Assert.assertEquals(outArr.length(), 3);
+ Assert.assertTrue(outArr.getString(0).equals("default.ext3.dt=2020-12-01/timest=23%3A23%3A23")); //date, timestamp formats
+ Assert.assertTrue(outArr.getString(1).equals("default.ext3.dt=2020-12-02/timest=22%3A22%3A22"));
+ Assert.assertTrue(outArr.getString(2).equals("default.ext3.dt=2020-12-03/timest=21%3A21%3A21.1234"));
+ }
+
+ /*
+ * Tests -diffExtTblLocs option on various input combinations.
+ */
+ @Test
+ public void testDiffExtTblLocs() throws Exception {
+ String extTblLocation = getTestDataDir() + "/ext";
+ String outLocation = getTestDataDir() + "/extTblOutput";
+ Configuration conf = MetastoreConf.newMetastoreConf();
+ MetastoreConf.setVar(conf, MetastoreConf.ConfVars.WAREHOUSE_EXTERNAL, getWarehouseDir());
+ MetaToolTaskListExtTblLocs.msConf = conf;
+
+ //create first file using -listExtTblLocs
+ runStatementOnDriver("create database diffDb");
+ runStatementOnDriver("create external table diffDb.ext1(a int) partitioned by (p int)");
+ runStatementOnDriver("create external table diffDb.ext2(a int) partitioned by (p int)");
+ runStatementOnDriver("create external table diffDb.ext3(a int) partitioned by (p int) " +
+ "location '" + getTestDataDir() + "/ext/tblLoc'");
+ runStatementOnDriver("alter table diffDb.ext1 add partition(p = 0) location '" + getTestDataDir() + "/part'" );
+ runStatementOnDriver("alter table diffDb.ext1 add partition(p = 1) location '" + getTestDataDir() + "/part'" );
+ String outLocation1 = outLocation + "1";
+ getListExtTblLocs("diffDb", outLocation1);
+
+ //create second file using -listExtTblLocs after dropping a table, dropping a partition and adding a different partition
+ runStatementOnDriver("drop table diffDb.ext2");
+ runStatementOnDriver("alter table diffDb.ext1 drop partition(p = 0)" );
+ runStatementOnDriver("alter table diffDb.ext1 add partition(p = 3) location '" + getTestDataDir() + "/part'" );
+ String outLocation2 = outLocation + "2";
+ getListExtTblLocs("diffDb", outLocation2);
+
+ //run diff on the above two files
+ JSONObject outJS = getDiffExtTblLocs(outLocation1, outLocation2, outLocation);
+ Set<String> outLocationSet = outJS.keySet();
+ String defaultDbLoc = getAbsolutePath(getWarehouseDir() + "/diffdb.db");
+ Assert.assertEquals(outLocationSet.size(), 2);
+ Assert.assertTrue(outLocationSet.contains(defaultDbLoc));
+ JSONArray outArr = outJS.getJSONArray(defaultDbLoc);
+ Assert.assertEquals(outArr.length(), 1);
+ Assert.assertTrue(outArr.getString(0).equals("- diffdb.ext2")); // dropped ext2 from default location
+ String partLoc = getAbsolutePath(getTestDataDir() + "/part");
+ Assert.assertTrue(outLocationSet.contains(partLoc));
+ outArr = outJS.getJSONArray(partLoc);
+ Assert.assertEquals(outArr.length(), 2); //two entries - 1 for added partition and 1 for dropped partition
+ Assert.assertTrue(outArr.getString(0).equals("+ diffdb.ext1.p=3"));
+ Assert.assertTrue(outArr.getString(1).equals("- diffdb.ext1.p=0"));
+ }
+
+ private String getAbsolutePath(String extTblLocation) {
+ return "file:" + extTblLocation;
+ }
+
+ private JSONObject getListExtTblLocs(String dbName, String outLocation) throws IOException {
+ File f = new File(outLocation);
+ if (f.exists()) {
+ FileUtil.fullyDelete(f);
+ }
+ if (!(new File(outLocation).mkdirs())) {
+ throw new RuntimeException("Could not create " + outLocation);
+ }
+ HiveMetaTool.main(new String[] {"-listExtTblLocs", dbName, outLocation});
+ for (File outFile : f.listFiles()) {
+ String contents = new String(Files.readAllBytes(Paths.get(outFile.getAbsolutePath())));
+ return new JSONObject(contents);
+ }
+ return null;
+ }
+
+ private JSONObject getDiffExtTblLocs(String fileLoc1, String fileLoc2, String outLocation) throws IOException {
+ File f = new File(outLocation);
+ if (f.exists()) {
+ FileUtil.fullyDelete(f);
+ }
+ if (!(new File(outLocation).mkdirs())) {
+ throw new RuntimeException("Could not create " + outLocation);
+ }
+ File f1 = new File(fileLoc1);
+ File f2 = new File(fileLoc2);
+ for (File outFile1 : f1.listFiles()) {
+ for (File outFile2 : f2.listFiles()) {
+ HiveMetaTool.main(new String[] {"-diffExtTblLocs", outFile1.getAbsolutePath(), outFile2.getAbsolutePath(), outLocation});
+ for(File outFile : f.listFiles()) {
+ String contents = new String(Files.readAllBytes(Paths.get(outFile.getAbsolutePath())));
+ return new JSONObject(contents);
+ }
+ }
+ }
+ return null;
+ }
+
private void checkAvroSchemaURLProps(String expectedUri) throws TException {
Table table = client.getTable(DB_NAME, TABLE_NAME);
assertEquals(expectedUri, table.getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName()));
assertEquals(expectedUri, table.getSd().getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName()));
}
+ protected List<String> runStatementOnDriver(String stmt) throws Exception {
+ try {
+ d.run(stmt);
+ } catch (CommandProcessorException e) {
+ throw new RuntimeException(stmt + " failed: " + e);
+ }
+ List<String> rs = new ArrayList<>();
+ d.getResults(rs);
+ return rs;
+ }
+
@After
public void tearDown() throws Exception {
try {
client.dropTable(DB_NAME, TABLE_NAME);
client.dropDatabase(DB_NAME);
+ try {
+ if (d != null) {
+ d.close();
+ d.destroy();
+ d = null;
+ }
+ } finally {
+ TestTxnDbUtil.cleanDb(hiveConf);
+ FileUtils.deleteDirectory(new File(getTestDataDir()));
+ }
client.close();
} catch (Throwable e) {
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java
index 760d78d..913146e 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java
@@ -50,6 +50,10 @@ public final class HiveMetaTool {
task = new MetaToolTaskExecuteJDOQLQuery();
} else if (cl.isUpdateLocation()) {
task = new MetaToolTaskUpdateLocation();
+ } else if (cl.isListExtTblLocs()) {
+ task = new MetaToolTaskListExtTblLocs();
+ } else if (cl.isDiffExtTblLocs()) {
+ task = new MetaToolTaskDiffExtTblLocs();
} else {
throw new IllegalArgumentException("No task was specified!");
}
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java
index 1223f0d..ce43a8c 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java
@@ -58,6 +58,25 @@ class HiveMetaToolCommandLine {
.create("updateLocation");
@SuppressWarnings("static-access")
+ private static final Option LIST_EXT_TBL_LOCS = OptionBuilder
+ .withArgName("dbName> " + " <output-loc")
+ .hasArgs(2)
+ .withDescription("Generates a file containing a list of directories which cover external table data locations " +
+ "for the specified database. A database name or pattern must be specified, on which the tool will be run." +
+ "The output is generated at the specified location."
+ )
+ .create("listExtTblLocs");
+
+ @SuppressWarnings("static-access")
+ private static final Option DIFF_EXT_TBL_LOCS = OptionBuilder
+ .withArgName("file1> " + " <file2> " + "<output-loc")
+ .hasArgs(3)
+ .withDescription("Generates the difference between two output-files created using -listExtTblLocs option at the" +
+ " specified location. Output contains locations(keys) unique to each input file. For keys common to both " +
+ "input-files, those entities are listed which are deleted from the first file and introduced in the second."
+ )
+ .create("diffExtTblLocs");
+
private static final Option DRY_RUN = OptionBuilder
.withDescription("Perform a dry run of updateLocation changes.When run with the dryRun option updateLocation " +
"changes are displayed but not persisted. dryRun is valid only with the updateLocation option.")
@@ -93,6 +112,8 @@ class HiveMetaToolCommandLine {
OPTIONS.addOption(LIST_FS_ROOT);
OPTIONS.addOption(EXECUTE_JDOQL);
OPTIONS.addOption(UPDATE_LOCATION);
+ OPTIONS.addOption(LIST_EXT_TBL_LOCS);
+ OPTIONS.addOption(DIFF_EXT_TBL_LOCS);
OPTIONS.addOption(DRY_RUN);
OPTIONS.addOption(SERDE_PROP_KEY);
OPTIONS.addOption(TABLE_PROP_KEY);
@@ -102,6 +123,8 @@ class HiveMetaToolCommandLine {
private boolean listFSRoot;
private String jdoqlQuery;
private String[] updateLocationParams;
+ private String[] listExtTblLocsParams;
+ private String[] diffExtTblLocsParams;
private boolean dryRun;
private String serdePropKey;
private String tablePropKey;
@@ -137,14 +160,18 @@ class HiveMetaToolCommandLine {
listFSRoot = cl.hasOption(LIST_FS_ROOT.getOpt());
jdoqlQuery = cl.getOptionValue(EXECUTE_JDOQL.getOpt());
updateLocationParams = cl.getOptionValues(UPDATE_LOCATION.getOpt());
+ listExtTblLocsParams = cl.getOptionValues(LIST_EXT_TBL_LOCS.getOpt());
+ diffExtTblLocsParams = cl.getOptionValues(DIFF_EXT_TBL_LOCS.getOpt());
dryRun = cl.hasOption(DRY_RUN.getOpt());
serdePropKey = cl.getOptionValue(SERDE_PROP_KEY.getOpt());
tablePropKey = cl.getOptionValue(TABLE_PROP_KEY.getOpt());
help = cl.hasOption(HELP.getOpt());
- int commandCount = (isListFSRoot() ? 1 : 0) + (isExecuteJDOQL() ? 1 : 0) + (isUpdateLocation() ? 1 : 0);
+ int commandCount = (isListFSRoot() ? 1 : 0) + (isExecuteJDOQL() ? 1 : 0) + (isUpdateLocation() ? 1 : 0) +
+ (isListExtTblLocs() ? 1 : 0) + (isDiffExtTblLocs() ? 1 : 0);
if (commandCount != 1) {
- throw new IllegalArgumentException("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set");
+ throw new IllegalArgumentException("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, " +
+ "-listExtTblLocs, -diffExtTblLocs must be set");
}
if (updateLocationParams != null && updateLocationParams.length != 2) {
@@ -152,6 +179,16 @@ class HiveMetaToolCommandLine {
updateLocationParams.length + " arguments");
}
+ if (listExtTblLocsParams != null && listExtTblLocsParams.length != 2) {
+ throw new IllegalArgumentException("HiveMetaTool:listExtTblLocs takes in 2 arguments but was passed " +
+ listExtTblLocsParams.length + " arguments");
+ }
+
+ if (diffExtTblLocsParams != null && diffExtTblLocsParams.length != 3) {
+ throw new IllegalArgumentException("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed " +
+ diffExtTblLocsParams.length + " arguments");
+ }
+
if ((dryRun || serdePropKey != null || tablePropKey != null) && !isUpdateLocation()) {
throw new IllegalArgumentException("-dryRun, -serdePropKey, -tablePropKey may be used only for the " +
"-updateLocation command");
@@ -176,6 +213,8 @@ class HiveMetaToolCommandLine {
"\tlistFSRoot : " + listFSRoot + "\n" +
"\tjdoqlQuery : " + jdoqlQuery + "\n" +
"\tupdateLocation: " + Arrays.toString(updateLocationParams) + "\n" +
+ "\tlistExtTblLocs: " + Arrays.toString(listExtTblLocsParams) + "\n" +
+ "\tdiffExtTblLocs: " + Arrays.toString(diffExtTblLocsParams) + "\n" +
"\tdryRun : " + dryRun + "\n" +
"\tserdePropKey : " + serdePropKey + "\n" +
"\ttablePropKey : " + tablePropKey);
@@ -197,10 +236,26 @@ class HiveMetaToolCommandLine {
return updateLocationParams != null;
}
+ boolean isListExtTblLocs() {
+ return listExtTblLocsParams != null;
+ }
+
+ boolean isDiffExtTblLocs() {
+ return diffExtTblLocsParams != null;
+ }
+
String[] getUpddateLocationParams() {
return updateLocationParams;
}
+ String[] getListExtTblLocsParams() {
+ return listExtTblLocsParams;
+ }
+
+ String[] getDiffExtTblLocsParams() {
+ return diffExtTblLocsParams;
+ }
+
boolean isDryRun() {
return dryRun;
}
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java
new file mode 100644
index 0000000..90b676d
--- /dev/null
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.tools.metatool;
+
+import org.codehaus.jettison.json.JSONArray;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class MetaToolTaskDiffExtTblLocs extends MetaToolTask {
+ @Override
+ void execute() {
+ String[] args = getCl().getDiffExtTblLocsParams();
+ try {
+ File file1 = new File(args[0]);
+ File file2 = new File(args[1]);
+ String outputDir = args[2];
+ String outFileName = "diff_" + System.currentTimeMillis();
+ System.out.println("Writing diff to " + outFileName);
+ if (!file1.exists()) {
+ System.out.println("Input " + args[0] + " does not exist.");
+ return;
+ }
+ if (!file2.exists()) {
+ System.out.println("Input " + args[1] + " does not exist.");
+ return;
+ }
+ JSONObject jsonObject = getDiffJson(file1, file2);
+ FileWriter fw = new FileWriter(outputDir + "/" + outFileName);
+ PrintWriter pw = new PrintWriter(fw);
+ pw.println(jsonObject.toString(4).replace("\\", ""));
+ pw.close();
+ } catch (Exception e) {
+ System.out.println("Generating diff failed: \n" + e.getMessage());
+ }
+ }
+
+ private JSONObject getDiffJson(File file1, File file2) throws IOException, JSONException {
+ JSONObject inJson1 = new JSONObject(new String(Files.readAllBytes(Paths.get(file1.getAbsolutePath()))));
+ JSONObject inJson2 = new JSONObject(new String(Files.readAllBytes(Paths.get(file2.getAbsolutePath()))));
+ Map<String, HashSet<String>> modifiedLocations = new HashMap<>();
+ Set<String> keySet1 = getKeySet(inJson1);
+ Set<String> keySet2 = getKeySet(inJson2);
+ Set<String> uniqueLocationsFile1 = getSetDifference(keySet1, keySet2);
+ Set<String> uniqueLocationsFile2 = getSetDifference(keySet2, keySet1);
+ for (String loc : keySet1) {
+ if (!uniqueLocationsFile1.contains(loc)) {
+ //common key, we need to compare the values
+ JSONArray valArr1 = inJson1.getJSONArray(loc);
+ JSONArray valArr2 = inJson2.getJSONArray(loc);
+ for (int i = 0; i < valArr1.length(); i++) {
+ String val1 = valArr1.getString(i);
+ boolean absentFromSecondKey = true;
+ for (int j = 0; j < valArr2.length(); j++) {
+ String val2 = valArr2.getString(j);
+ if (val1.equalsIgnoreCase(val2)) {
+ absentFromSecondKey = false;
+ break;
+ }
+ }
+ if (absentFromSecondKey) {
+ if (modifiedLocations.containsKey(loc)) {
+ modifiedLocations.get(loc).add(asDeleted(val1));
+ } else {
+ modifiedLocations.put(loc, new HashSet<>());
+ modifiedLocations.get(loc).add(asDeleted(val1));
+ }
+ }
+ }
+ for (int i = 0; i < valArr2.length(); i++) {
+ String val2 = valArr2.getString(i);
+ boolean absentFromFirstKey = true;
+ for (int j = 0; j < valArr1.length(); j++) {
+ String val1 = valArr1.getString(j);
+ if (val1.equalsIgnoreCase(val2)) {
+ absentFromFirstKey = false;
+ break;
+ }
+ }
+ if (absentFromFirstKey) {
+ if (modifiedLocations.containsKey(loc)) {
+ modifiedLocations.get(loc).add(asAdded(val2));
+ } else {
+ modifiedLocations.put(loc, new HashSet<>());
+ modifiedLocations.get(loc).add(asAdded(val2));
+ }
+ }
+ }
+ }
+ }
+ JSONObject jsonObject = new JSONObject();
+ if(!uniqueLocationsFile1.isEmpty() || !uniqueLocationsFile2.isEmpty()) {
+ jsonObject.put("Locations only in " + file1.getName(), uniqueLocationsFile1);
+ jsonObject.put("Locations only in " + file2.getName(), uniqueLocationsFile2);
+ }
+ for(String commonLoc : modifiedLocations.keySet()) {
+ List<String> modifiedEntries = new ArrayList<>();
+ for (String entry : modifiedLocations.get(commonLoc)) {
+ modifiedEntries.add(entry);
+ }
+ Collections.sort(modifiedEntries);
+ jsonObject.put(commonLoc, modifiedEntries);
+ }
+ return jsonObject;
+ }
+
+ private Set<String> getKeySet(JSONObject jsonObject) {
+ Iterator<String> keyIter = jsonObject.keys();
+ Set<String> keySet = new HashSet<>();
+ while (keyIter.hasNext()) {
+ keySet.add(keyIter.next());
+ }
+ return keySet;
+ }
+
+ private Set<String> getSetDifference(Set<String> keySet1, Set<String> keySet2) {
+ Set<String> diffSet = new HashSet<>();
+ for(String elem : keySet1) {
+ if(!keySet2.contains(elem)) {
+ diffSet.add(elem);
+ }
+ }
+ return diffSet;
+ }
+
+ private String asDeleted(String str) {
+ return "- " + str;
+ }
+
+ private String asAdded(String str) {
+ return "+ " + str;
+ }
+}
\ No newline at end of file
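Based on getDiffJson above and the assertions in testDiffExtTblLocs, the diff file written by -diffExtTblLocs is a JSON object: locations present in only one input appear under "Locations only in <fileName>" keys, and locations common to both inputs map to sorted arrays whose entries carry a "+" prefix (introduced in the second file) or a "-" prefix (removed since the first file). A hedged illustration with invented paths:

    {
        "file:/data/warehouse/diffdb.db": ["- diffdb.ext2"],
        "file:/data/part": ["+ diffdb.ext1.p=3", "- diffdb.ext1.p=0"]
    }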
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java
new file mode 100644
index 0000000..f9d34ee
--- /dev/null
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java
@@ -0,0 +1,668 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.tools.metatool;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.metastore.ObjectStore;
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
+import org.apache.thrift.TException;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONArray;
+import org.codehaus.jettison.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.TreeSet;
+
+public class MetaToolTaskListExtTblLocs extends MetaToolTask {
+ private static final Logger LOG = LoggerFactory.getLogger(MetaToolTaskListExtTblLocs.class);
+ private static Configuration conf;
+ private final Map<String, HashSet<String>> coverageList = new HashMap<>(); //maps each output-location to the set of input-locations covered by it
+ private final Map<String, DataLocation> inputLocations = new HashMap<>(); //maps each input-location to a DataLocation object which specifies its properties
+
+ @Override
+ void execute() {
+ String[] loc = getCl().getListExtTblLocsParams();
+ try{
+ generateExternalTableInfo(loc[0], loc[1]);
+ } catch (IOException | TException | JSONException e) {
+ System.out.println("Generating external table locations failed: \n" + e.getMessage());
+ }
+ }
+
+ private void generateExternalTableInfo(String dbPattern, String outputDir) throws TException, IOException,
+ JSONException {
+ ObjectStore objectStore = getObjectStore();
+ conf = msConf != null ? msConf : objectStore.getConf();
+ Warehouse wh = new Warehouse(conf);
+ String defaultCatalog = MetaStoreUtils.getDefaultCatalog(conf);
+ List<String> databases = objectStore.getDatabases(defaultCatalog, dbPattern);
+ System.out.println("Number of databases found for given pattern: " + databases.size());
+ //maintain the set of leaves of the tree as a sorted set
+ Set<String> leafLocations = new TreeSet<>();
+ for (String db : databases) {
+ List<String> tables = objectStore.getAllTables(defaultCatalog, db);
+ Path defaultDbExtPath = wh.getDefaultExternalDatabasePath(db);
+ String defaultDbExtLocation = defaultDbExtPath.toString();
+ boolean isDefaultPathEmpty = true;
+ for(String tblName : tables) {
+ Table t = objectStore.getTable(defaultCatalog, db, tblName);
+ if(TableType.EXTERNAL_TABLE.name().equalsIgnoreCase(t.getTableType())) {
+ String tblLocation = t.getSd().getLocation();
+ Path tblPath = new Path(tblLocation);
+ if(isPathWithinSubtree(tblPath, defaultDbExtPath)) {
+ if(isDefaultPathEmpty) {
+ isDefaultPathEmpty = false;
+ //default paths should always be included, so we add them as special leaves to the tree
+ addDefaultPath(defaultDbExtLocation, db);
+ leafLocations.add(defaultDbExtLocation);
+ }
+ HashSet<String> coveredByDefault = coverageList.get(defaultDbExtLocation);
+ coveredByDefault.add(tblLocation);
+ } else if (!isCovered(leafLocations, tblPath)) {
+ leafLocations.add(tblLocation);
+ }
+ DataLocation dataLocation = new DataLocation(db, tblName, 0, 0,
+ null);
+ inputLocations.put(tblLocation, dataLocation);
+ dataLocation.setSizeExtTblData(getDataSize(tblPath, conf));
+ //retrieving partition locations outside table-location
+ Map<String, String> partitionLocations = objectStore.getPartitionLocations(defaultCatalog, db, tblName,
+ tblLocation, -1);
+ dataLocation.setTotalPartitions(partitionLocations.size());
+ for (String partitionName : partitionLocations.keySet()) {
+ String partLocation = partitionLocations.get(partitionName);
+ //null value means partition is in table location, we do not add it to input in this case.
+ if(partLocation == null) {
+ dataLocation.incrementNumPartsInTblLoc();
+ }
+ else {
+ partLocation = partLocation + Path.SEPARATOR +
+ Warehouse.makePartName(Warehouse.makeSpecFromName(partitionName), false);
+ Path partPath = new Path(partLocation);
+ long partDataSize = getDataSize(partPath, conf);
+ if (isPathWithinSubtree(partPath, defaultDbExtPath)) {
+ if (isDefaultPathEmpty) {
+ isDefaultPathEmpty = false;
+ addDefaultPath(defaultDbExtLocation, db);
+ leafLocations.add(defaultDbExtLocation);
+ }
+ if (isPathWithinSubtree(partPath, tblPath)) {
+ //even in non-null case, handle the corner case where location is set to table-location
+ //In this case, partition would be covered by table location itself, so we need not add to input
+ dataLocation.incrementNumPartsInTblLoc();
+ } else {
+ DataLocation partObj = new DataLocation(db, tblName, 0, 0, partitionName);
+ partObj.setSizeExtTblData(partDataSize);
+ inputLocations.put(partLocation, partObj);
+ coverageList.get(defaultDbExtLocation).add(partLocation);
+ }
+ } else {
+ if (isPathWithinSubtree(partPath, tblPath)) {
+ dataLocation.incrementNumPartsInTblLoc();
+ } else {
+ //only in this case, partition location is neither inside table nor in default location.
+ //So we add it to the graph as a separate leaf.
+ DataLocation partObj = new DataLocation(db, tblName, 0, 0, partitionName);
+ partObj.setSizeExtTblData(partDataSize);
+ inputLocations.put(partLocation, partObj);
+ if(!isCovered(leafLocations, partPath)) {
+ leafLocations.add(partLocation);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ if(!leafLocations.isEmpty()) {
+ removeNestedStructure(leafLocations);
+ createOutputList(leafLocations, outputDir, dbPattern);
+ }
+ else {
+ System.out.println("No external tables found to process.");
+ }
+ }
+
+ private void addDefaultPath(String defaultDbExtLocation, String dbName) {
+ coverageList.put(defaultDbExtLocation, new HashSet<>());
+ DataLocation defaultDatalocation = new DataLocation(dbName, null, 0, 0, null);
+ //mark default leaves to always be included in output-list
+ defaultDatalocation.setIncludeByDefault(true);
+ inputLocations.put(defaultDbExtLocation, defaultDatalocation);
+ }
+
+ private long getDataSize(Path location, Configuration conf) throws IOException {
+ if(location == null) {
+ return 0;
+ }
+ if(MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST)) {
+ return testDatasizes == null ? 0 : testDatasizes.containsKey(location.toString()) ?
+ testDatasizes.get(location.toString()) : 0;
+ }
+ FileSystem fs = location.getFileSystem(conf);
+ if (fs != null && fs.getUri().getScheme().equals("hdfs")) {
+ try {
+ ContentSummary cs = fs.getContentSummary(location);
+ return cs.getLength();
+ } catch (FileNotFoundException e) {
+ //no data yet in data location but we proceed since data may be added later.
+ }
+ }
+ return 0;
+ }
+
+ private boolean isPathWithinSubtree(Path path, Path subtree) {
+ int subtreeDepth = subtree.depth();
+ while(path != null){
+ if (subtreeDepth > path.depth()) {
+ return false;
+ }
+ if(subtree.equals(path)){
+ return true;
+ }
+ path = path.getParent();
+ }
+ return false;
+ }
+
+
+ /*
+ * Method to determine if an existing location covers the given location and record the coverage in output.
+ */
+ private boolean isCovered(Set<String> locations, Path path) {
+ Path originalPath = new Path(path.toString());
+ while(path != null){
+ if(locations.contains(path.toString())){
+ addCoverage(path, originalPath, true);
+ return true;
+ }
+ path = path.getParent();
+ }
+ return false;
+ }
+
+ /*
+ * Method to cover a child node using a parent.
+ * Removes the child and marks all nodes covered by the child as being covered by the parent.
+ */
+ private void addCoverage(Path parentPath, Path childPath, boolean addChild) {
+ String childLoc = childPath.toString();
+ String parentLoc = parentPath.toString();
+ //If the path to be covered should be included by default, then we do not cover it.
+ //This is because default paths should be individually listed, not covered under some parent.
+ if(inputLocations.containsKey(childLoc) && inputLocations.get(childLoc).shouldIncludeByDefault()) {
+ return;
+ }
+ HashSet<String> pathsUnderChild = coverageList.get(childLoc);
+ coverageList.remove(childLoc);
+ if(coverageList.get(parentLoc) == null) {
+ coverageList.put(parentLoc, new HashSet<>());
+ }
+ HashSet<String> pathsUnderParent = coverageList.get(parentLoc);
+ if(addChild) {
+ pathsUnderParent.add(childPath.toString());
+ }
+ if(pathsUnderChild != null) {
+ pathsUnderParent.addAll(pathsUnderChild);
+ }
+ }
+
+ /*
+ * Transforms a collection so that no element is an ancestor of another.
+ */
+ private void removeNestedStructure(Set<String> locations) {
+ List<String> locationList = new ArrayList<>();
+ locationList.addAll(locations);
+ for(int i = 0; i < locationList.size(); i++) {
+ String currLoc = locationList.get(i);
+ Path currPath = new Path(currLoc);
+ for(int j = i + 1; j < locationList.size(); j++) {
+ String nextLoc = locationList.get(j);
+ Path nextPath = new Path (nextLoc);
+ if(isPathWithinSubtree(nextPath, currPath)) {
+ addCoverage(currPath, nextPath, true);
+ locations.remove(nextLoc);
+ i = j;
+ }
+ else {
+ i = j - 1;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Method to write the output to the given location.
+ * We construct a tree out of external table locations and use it to determine suitable directories covering all locations.
+ */
+ private void createOutputList(Set<String> locations, String outputDir, String dbPattern) throws IOException, JSONException {
+ ExternalTableGraphNode rootNode = constructTree(locations);
+ //Traverse through the tree in breadth-first manner and decide which nodes to include.
+ //For every node, either cover all leaves in its subtree using itself
+ // or delegate this duty to its child nodes.
+ Queue<ExternalTableGraphNode> queue = new LinkedList<>();
+ queue.add(rootNode);
+ while(!queue.isEmpty()){
+ ExternalTableGraphNode current = queue.remove();
+ if(current.isLeaf()) {
+ // in this case, the leaf needs to be added to the solution, i.e. marked as being covered.
+ // This was done during graph construction, so we continue.
+ continue;
+ }
+ int nonTrivialCoverage = 0;
+ List<ExternalTableGraphNode> childNodes = current.getChildNodes();
+ boolean processChildrenByDefault = false;
+ for(ExternalTableGraphNode child : childNodes) {
+ if (child.getNumLeavesCovered() > 1) {
+ nonTrivialCoverage += child.getNumLeavesCovered();
+ }
+ if (child.shouldIncludeByDefault()) {
+ processChildrenByDefault = true;
+ break;
+ }
+ }
+ boolean addCurrToSolution = false;
+ if(!processChildrenByDefault) {
+ addCurrToSolution = true;
+ if (!current.shouldIncludeByDefault()) {
+ //ensure that we do not have extra data in the current node for it to be included.
+ long currDataSize = getDataSize(new Path(current.getLocation()), conf);
+ int numLeavesCovered = current.getNumLeavesCovered();
+ //only add current node if it doesn't have extra data and non-trivial coverage is less than half.
+ //Also we do not add current node if there is just a single path (numLeavesCovered = 1); in this case we proceed to the leaf.
+ addCurrToSolution &= currDataSize == current.getChildDataSizes() &&
+ ((nonTrivialCoverage < (numLeavesCovered + 1) / 2) && numLeavesCovered != 1);
+ }
+ }
+ if(processChildrenByDefault) {
+ queue.addAll(childNodes);
+ } else if (addCurrToSolution) {
+ addToSolution(current);
+ } else {
+ queue.addAll(childNodes);
+ }
+ }
+ String outFileName = "externalTableLocations_" + dbPattern + "_" + System.currentTimeMillis() + ".txt";
+ System.out.println("Writing output to " + outFileName);
+ FileWriter fw = new FileWriter(outputDir + "/" + outFileName);
+ PrintWriter pw = new PrintWriter(fw);
+ JSONObject jsonObject = new JSONObject();
+ for(String outputLocation : coverageList.keySet()) {
+ HashSet<String> coveredLocations = coverageList.get(outputLocation);
+ JSONArray outputEntities = listOutputEntities(coveredLocations);
+ jsonObject.put(outputLocation, outputEntities);
+ }
+ String result = jsonObject.toString(4).replace("\\","");
+ pw.println(result);
+ pw.close();
+ }
+
+ /*
+ * Returns a comma-separated list of entities (tables or partition names) covered by a location.
+ * Table-name followed by "*" indicates that all partitions are inside table location.
+ * Otherwise, we record the number of partitions covered by table location.
+ */
+ private JSONArray listOutputEntities(HashSet<String> locations) {
+ List<String> listEntities = new ArrayList<>();
+ for(String loc : locations) {
+ DataLocation data = inputLocations.get(loc);
+ String tblName = data.getTblName();
+ if(tblName == null) {
+ continue;
+ }
+ String out = data.getDbName() + "." + tblName;
+ String partName = data.getPartName();
+ if (partName == null) {
+ int numPartInTblLoc = data.getNumPartitionsInTblLoc();
+ int totPartitions = data.getTotalPartitions();
+ if (totPartitions > 0 && numPartInTblLoc == totPartitions) {
+ out = out + ".*";
+ }
+ else if (totPartitions > 0) {
+ out = out + " p(" + numPartInTblLoc + "/" + totPartitions + ")";
+ }
+ }
+ else {
+ out = out + "." + partName;
+ }
+ listEntities.add(out);
+ }
+ Collections.sort(listEntities);
+ return new JSONArray(listEntities);
+ }
+
+ private ExternalTableGraphNode constructTree(Set<String> locations) {
+ ExternalTableGraphNode rootNode = null;
+ Map<String, ExternalTableGraphNode> locationGraph = new HashMap<>();
+ // Every location is represented by a leaf in the tree.
+ // We traverse through the input locations and construct the tree.
+ for (String leaf : locations) {
+ ExternalTableGraphNode currNode = new ExternalTableGraphNode(leaf, new ArrayList<>(), true, 0);
+ if(inputLocations.containsKey(leaf)) {
+ if(inputLocations.get(leaf).shouldIncludeByDefault()) {
+ currNode.setIncludeByDefault(true);
+ }
+ currNode.setDataSize(inputLocations.get(leaf).getSizeExtTblData());
+ }
+ locationGraph.put(leaf, currNode);
+ //initialize coverage-lists of leaves
+ if (coverageList.get(leaf) == null) {
+ coverageList.put(leaf, new HashSet<>());
+ }
+ //mark the leaf as being covered by itself
+ HashSet<String> currCoverage = coverageList.get(leaf);
+ currCoverage.add(leaf);
+ //set the number of leaves covered. Nested locations could have been covered earlier during preprocessing,
+ //so we set it to the size of its coverage set.
+ currNode.setNumLeavesCovered(currCoverage.size());
+ Path parent = new Path(leaf).getParent();
+ ExternalTableGraphNode parNode;
+ //traverse upward to the root in order to construct the graph
+ while (parent != null) {
+ String parentLoc = parent.toString();
+ if (!locationGraph.containsKey(parentLoc)) {
+ //if parent doesn't exist in graph then create it
+ parNode = new ExternalTableGraphNode(parentLoc, new ArrayList<>(), false, 0);
+ locationGraph.put(parentLoc, parNode);
+ }
+ else {
+ parNode = locationGraph.get(parentLoc);
+ parNode.setIsLeaf(false);
+ }
+ if(currNode.getParent() == null) {
+ parNode.addChild(currNode);
+ currNode.setParent(parNode);
+ }
+ else {
+ break;
+ }
+ currNode = parNode;
+ parent = parent.getParent();
+ }
+ if (parent == null && rootNode == null) {
+ rootNode = currNode;
+ rootNode.setParent(rootNode);
+ }
+ }
+ rootNode.updateNumLeavesCovered();
+ rootNode.updateIncludeByDefault();
+ rootNode.updateDataSize();
+ return rootNode;
+ }
+
+ private void addToSolution(ExternalTableGraphNode node) {
+ //since this node is in the solution, all its children should be covered using this node.
+ if(!node.isLeaf()) {
+ addCoverageRecursive(node);
+ }
+ }
+
+ private void addCoverageRecursive(ExternalTableGraphNode node) {
+ for(ExternalTableGraphNode child : node.getChildNodes()) {
+ if(child.isLeaf()) {
+ addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), true);
+ }
+ else {
+ addCoverageRecursive(child);
+ addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), false);
+ }
+ }
+ }
+
+ @VisibleForTesting
+ static Configuration msConf = null;
+
+ @VisibleForTesting
+ Map<String, Long> testDatasizes = null;
+
+ @VisibleForTesting
+ public Map<String, HashSet<String>> runTest(Set<String> inputList, Map<String, Long> sizes) {
+ try {
+ conf = msConf;
+ testDatasizes = sizes;
+ coverageList.clear();
+ removeNestedStructure(inputList);
+ createOutputList(inputList, "test", "test");
+ } catch (Exception e) {
+ LOG.error("MetaToolTask failed on ListExtTblLocs test: ", e);
+ }
+ return coverageList;
+ }
+
+ /*
+ * Class representing an external table data location.
+ * Each location is either a table location (in which case partition-name is not set) or
+ * a partition location outside the table location.
+ * If the location is a table location, we additionally record how many partitions the table has
+ * and how many of them reside in the table location itself.
+ */
+ private class DataLocation {
+ private String dbName;
+ private String tblName;
+ private int numPartitionsInTblLoc;
+ private String partName;
+ private int totalPartitions;
+ // 'sizeExtTblData' stores the size of useful data in a directory.
+ // This can be compared with the total directory size to ascertain the amount of extra data in it.
+ long sizeExtTblData;
+ boolean includeByDefault;
+
+ private DataLocation (String dbName, String tblName, int totalPartitions, int numPartitionsInTblLoc,
+ String partName) {
+ this.dbName = dbName;
+ this.tblName = tblName;
+ this.totalPartitions = totalPartitions;
+ this.numPartitionsInTblLoc = numPartitionsInTblLoc;
+ this.partName = partName;
+ this.sizeExtTblData = 0;
+ }
+
+ private void incrementNumPartsInTblLoc() {
+ this.numPartitionsInTblLoc++;
+ }
+
+ private String getPartName() {
+ return this.partName;
+ }
+
+ private String getDbName() {
+ return this.dbName;
+ }
+
+ private String getTblName() {
+ return this.tblName;
+ }
+
+ private int getNumPartitionsInTblLoc() {
+ return this.numPartitionsInTblLoc;
+ }
+
+ private int getTotalPartitions() {
+ return this.totalPartitions;
+ }
+
+ private long getSizeExtTblData() {
+ return this.sizeExtTblData;
+ }
+
+ private boolean shouldIncludeByDefault() {
+ return this.includeByDefault;
+ }
+
+ private void setTotalPartitions(int totalPartitions) {
+ this.totalPartitions = totalPartitions;
+ }
+
+ private void setSizeExtTblData(long sizeExtTblData) {
+ this.sizeExtTblData = sizeExtTblData;
+ }
+
+ private void setIncludeByDefault(boolean includeByDefault) {
+ this.includeByDefault = includeByDefault;
+ }
+ }
+
+ private class ExternalTableGraphNode {
+ private String location;
+ private List<ExternalTableGraphNode> childNodes;
+ private ExternalTableGraphNode parent;
+ private boolean isLeaf;
+ private boolean includeByDefault;
+ private int numLeavesCovered;
+ private long dataSize;
+
+ private ExternalTableGraphNode(String location, List<ExternalTableGraphNode> childNodes, boolean isLeaf, long dataSize) {
+ this.location = location;
+ this.childNodes = childNodes;
+ this.isLeaf = isLeaf;
+ this.parent = null;
+ this.includeByDefault = false;
+ this.dataSize = dataSize;
+ }
+
+ private void addChild(ExternalTableGraphNode child) {
+ this.childNodes.add(child);
+ }
+
+ private List<ExternalTableGraphNode> getChildNodes() {
+ return this.childNodes;
+ }
+
+ private boolean isLeaf() {
+ return this.isLeaf;
+ }
+
+ public void setIsLeaf(boolean isLeaf) {
+ this.isLeaf = isLeaf;
+ }
+
+ private void setNumLeavesCovered(int numLeavesCovered) {
+ this.numLeavesCovered = numLeavesCovered;
+ }
+
+ private int getNumLeavesCovered() {
+ return this.numLeavesCovered;
+ }
+
+ private String getLocation() {
+ return this.location;
+ }
+
+ private void setParent(ExternalTableGraphNode node) {
+ this.parent = node;
+ }
+
+ private ExternalTableGraphNode getParent() {
+ return this.parent;
+ }
+
+ private boolean shouldIncludeByDefault() {
+ return this.includeByDefault;
+ }
+
+ private void setIncludeByDefault(boolean includeByDefault) {
+ this.includeByDefault = includeByDefault;
+ }
+
+ private void setDataSize(long dataSize) {
+ this.dataSize = dataSize;
+ }
+
+ private long getDataSize() {
+ return this.dataSize;
+ }
+
+ private void updateNumLeavesCovered() {
+ if(this.isLeaf) {
+ return;
+ }
+ this.numLeavesCovered = 0;
+ for(ExternalTableGraphNode currChild : childNodes) {
+ currChild.updateNumLeavesCovered();
+ this.numLeavesCovered += currChild.getNumLeavesCovered();
+ }
+ }
+
+ /*
+ * Method to mark all the paths in the subtree rooted at current node which need to be included by default.
+ * If some leaf has this property, then we mark the path from root to that leaf.
+ */
+ private void updateIncludeByDefault() {
+ if(this.isLeaf) {
+ return;
+ }
+ for(ExternalTableGraphNode currChild : childNodes) {
+ currChild.updateIncludeByDefault();
+ }
+ for(ExternalTableGraphNode currChild : childNodes) {
+ if(currChild.shouldIncludeByDefault()) {
+ this.includeByDefault = true;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Method to update the datasize of subtree rooted at a particular node recursively.
+ */
+ private void updateDataSize() {
+ if(this.isLeaf) {
+ return;
+ }
+ for(ExternalTableGraphNode currChild : childNodes) {
+ currChild.updateDataSize();
+ }
+ this.dataSize += this.getChildDataSizes();
+ }
+
+ /*
+ * Method to return sum of data-sizes of child nodes of a particular node
+ */
+ private long getChildDataSizes() {
+ long sumChildDataSizes = 0;
+ for(ExternalTableGraphNode currChild : childNodes) {
+ sumChildDataSizes += currChild.getDataSize();
+ }
+ return sumChildDataSizes;
+ }
+ }
+}
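As encoded in listOutputEntities above and asserted in TestHiveMetaTool, each key of the -listExtTblLocs output is a covering directory and its value is a sorted array of entities: "db.table" for a table whose data sits entirely under that key, "db.table.*" when all of a table's partitions live inside the table location, "db.table p(x/y)" when only x of the table's y partitions do, and "db.table.<partitionName>" for a partition stored outside its table location. A hedged sample with illustrative paths:

    {
        "file:/warehouse/db1.db": ["db1.ext"],
        "file:/data/ext/t1": ["default.ext p(3/5)"],
        "file:/data/part": ["db3.ext.p=0", "db3.ext.p=1"]
    }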
diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
index 9563bd6..ab090c9 100644
--- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
+++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
@@ -44,6 +44,8 @@ public class TestHiveMetaToolCommandLine {
assertNull(cl.getJDOQLQuery());
assertFalse(cl.isUpdateLocation());
assertNull(cl.getUpddateLocationParams());
+ assertFalse(cl.isListExtTblLocs());
+ assertNull(cl.getListExtTblLocsParams());
assertFalse(cl.isDryRun());
assertNull(cl.getSerdePropKey());
assertNull(cl.getTablePropKey());
@@ -57,6 +59,8 @@ public class TestHiveMetaToolCommandLine {
assertEquals("select a from b", cl.getJDOQLQuery());
assertFalse(cl.isUpdateLocation());
assertNull(cl.getUpddateLocationParams());
+ assertFalse(cl.isListExtTblLocs());
+ assertNull(cl.getListExtTblLocsParams());
assertFalse(cl.isDryRun());
assertNull(cl.getSerdePropKey());
assertNull(cl.getTablePropKey());
@@ -73,6 +77,8 @@ public class TestHiveMetaToolCommandLine {
assertTrue(cl.isUpdateLocation());
assertEquals("hdfs://new.loc", cl.getUpddateLocationParams()[0]);
assertEquals("hdfs://old.loc", cl.getUpddateLocationParams()[1]);
+ assertFalse(cl.isListExtTblLocs());
+ assertNull(cl.getListExtTblLocsParams());
assertTrue(cl.isDryRun());
assertEquals("abc", cl.getSerdePropKey());
assertEquals("def", cl.getTablePropKey());
@@ -81,7 +87,7 @@ public class TestHiveMetaToolCommandLine {
@Test
public void testNoTask() throws ParseException {
exception.expect(IllegalArgumentException.class);
- exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set");
+ exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set");
new HiveMetaToolCommandLine(new String[] {});
}
@@ -89,7 +95,7 @@ public class TestHiveMetaToolCommandLine {
@Test
public void testMultipleTask() throws ParseException {
exception.expect(IllegalArgumentException.class);
- exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set");
+ exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set");
new HiveMetaToolCommandLine(new String[] {"-listFSRoot", "-executeJDOQL", "select a from b"});
}
@@ -103,6 +109,26 @@ public class TestHiveMetaToolCommandLine {
}
@Test
+ public void testListExtTblLocsOneArgument() throws ParseException {
+ exception.expect(IllegalArgumentException.class);
+ exception.expectMessage("HiveMetaTool:listExtTblLocs takes in 2 arguments but was passed 1 arguments");
+
+ new HiveMetaToolCommandLine(new String[] {"-listExtTblLocs", "db1"});
+ }
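+
+ // Note: a well-formed -listExtTblLocs invocation passes exactly two arguments; the second is
+ // assumed here to be the output location for the generated file, e.g.:
+ // new HiveMetaToolCommandLine(new String[] {"-listExtTblLocs", "db1", "/tmp/extTblLocsOut"});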
+
+ @Test
+ public void testDiffExtTblLocsOneArgument() throws ParseException {
+ exception.expect(IllegalArgumentException.class);
+ exception.expectMessage("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed 1 arguments");
+ new HiveMetaToolCommandLine(new String[] {"-diffExtTblLocs", "file1"});
+ }
+
+ @Test
+ public void testDiffExtTblLocsTwoArguments() throws ParseException {
+ exception.expect(IllegalArgumentException.class);
+ exception.expectMessage("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed 2 arguments");
+ new HiveMetaToolCommandLine(new String[] {"-diffExtTblLocs", "file1", "file2"});
+ }
+
+ @Test
public void testDryRunNotAllowed() throws ParseException {
exception.expect(IllegalArgumentException.class);
exception.expectMessage("-dryRun, -serdePropKey, -tablePropKey may be used only for the -updateLocation command");
diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java
new file mode 100644
index 0000000..4eb3111
--- /dev/null
+++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.tools.metatool;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Set;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.TreeSet;
+
+
+/* Unit tests for MetaToolTaskListExtTblLocs. */
+@Category(MetastoreUnitTest.class)
+public class TestMetaToolTaskListExtTblLocs {
+
+ /*
+ * Test grouping of locations. No extra data assumed.
+ */
+ @Test
+ public void testGroupLocations() {
+ Set<String> inputLocations = new TreeSet<>();
+ Configuration conf = MetastoreConf.newMetastoreConf();
+ MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST, true);
+ MetaToolTaskListExtTblLocs.msConf = conf;
+ MetaToolTaskListExtTblLocs task = new MetaToolTaskListExtTblLocs();
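+ // runTest returns a map from each suggested (grouped) location to the set of input locations it covers.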
+
+ //Case 1: Multiple unpartitioned external tables, expected o/p: 1 location
+ inputLocations.add("/warehouse/customLocation/t1");
+ inputLocations.add("/warehouse/customLocation/t2");
+ inputLocations.add("/warehouse/customLocation/t3");
+ Map<String, HashSet<String>> output = task.runTest(inputLocations, null);
+ Assert.assertEquals(1, output.size());
+ String expectedOutput = "/warehouse/customLocation";
+ Assert.assertTrue(output.containsKey(expectedOutput));
+ HashSet<String> coveredLocs = output.get(expectedOutput);
+ Assert.assertEquals(3, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t1"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t2"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t3"));
+
+ //Case 2 : inputs at multiple depths
+ // inputs ../ext/b0 - contains 1 location
+ // ../ext/p=0 - contains 1 location
+ // ../ext/b1/b2/b3 - contains 3 locations (p=1, p=2, p=3)
+ // expected output : [../ext/b1/b2/b3 covering the 3 locations, ../ext/b0, ../ext/p=0]
+ inputLocations.clear();
+ inputLocations.add("/warehouse/customLocation/ext/b0");
+ inputLocations.add("/warehouse/customLocation/ext/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=3");
+ output = task.runTest(inputLocations, null);
+ Assert.assertEquals(3, output.size());
+ String expectedOutput1 = "/warehouse/customLocation/ext/b0";
+ Assert.assertTrue(output.containsKey(expectedOutput1));
+ coveredLocs = output.get(expectedOutput1);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0"));
+ String expectedOutput2 = "/warehouse/customLocation/ext/p=0";
+ Assert.assertTrue(output.containsKey(expectedOutput2));
+ coveredLocs = output.get(expectedOutput2);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=0"));
+ String expectedOutput3 = "/warehouse/customLocation/ext/b1/b2/b3";
+ Assert.assertTrue(output.containsKey(expectedOutput3));
+ coveredLocs = output.get(expectedOutput3);
+ Assert.assertEquals(3, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=1"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=2"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=3"));
+
+ //Case 3 : root with a lot of leaves
+ // inputs ../ext/ - contains 4 locations
+ // ../ext/b1 - contains 3 locations
+ // expected output : [../ext covering all locations] since the root (ext) directly contains more than half of the locations
+ inputLocations.clear();
+ inputLocations.add("/warehouse/customLocation/ext/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/p=3");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=6");
+ output = task.runTest(inputLocations, null);
+ Assert.assertEquals(1, output.size());
+ expectedOutput = "/warehouse/customLocation/ext";
+ Assert.assertTrue(output.containsKey(expectedOutput));
+ coveredLocs = output.get(expectedOutput);
+ Assert.assertEquals(7, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.containsAll(inputLocations));
+
+ //Case 4 : root with a lot of trivial locations (non-leaf directories each covering a single leaf)
+ // inputs ../ext/ - contains 4 leaves, each nested under its own chain of intermediate (trivial) directories
+ // ../ext/b1 - contains 3 locations
+ // expected output : [../ext covering all locations] since fewer than half of the locations under ext fall into non-trivial (grouped) subdirectories
+ inputLocations.clear();
+ inputLocations.add("/warehouse/customLocation/ext/dir01/dir02/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/dir11/dir12/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/dir21/dir22/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/dir31/dir32/p=3");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=6");
+ output = task.runTest(inputLocations, null);
+ Assert.assertEquals(1, output.size());
+ expectedOutput = "/warehouse/customLocation/ext";
+ Assert.assertTrue(output.containsKey(expectedOutput));
+ coveredLocs = output.get(expectedOutput);
+ Assert.assertEquals(7, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.containsAll(inputLocations));
+
+ //Case 5 : several grouped locations and 1 outlier at root
+ // inputs ../ext/b0 - contains 4 locations
+ // ../ext/b1 - contains 3 locations
+ // expected output : [../ext/b0, ../ext/b1, ../ext/p=7]
+ inputLocations.clear();
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=3");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=6");
+ inputLocations.add("/warehouse/customLocation/ext/p=7");
+ output = task.runTest(inputLocations, null);
+ Assert.assertEquals(3, output.size());
+ expectedOutput1 = "/warehouse/customLocation/ext/b0";
+ Assert.assertTrue(output.containsKey(expectedOutput1));
+ coveredLocs = output.get(expectedOutput1);
+ Assert.assertEquals(4, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=0"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=1"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=2"));
+ expectedOutput2 = "/warehouse/customLocation/ext/b1";
+ Assert.assertTrue(output.containsKey(expectedOutput2));
+ coveredLocs = output.get(expectedOutput2);
+ Assert.assertEquals(3, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=4"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=5"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=6"));
+ expectedOutput3 = "/warehouse/customLocation/ext/p=7";
+ Assert.assertTrue(output.containsKey(expectedOutput3));
+ coveredLocs = output.get(expectedOutput3);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=7"));
+
+ //Case 6 : inputs with nested structure
+ // inputs ../ext/b0 - contains 4 locations
+ // ../ext/b1
+ // ../ext/b1/b2 - contains 4 locations
+ // expected output : [../ext/b0, ../ext/b1] (no separate entry for b2 since it is already covered by b1 itself)
+ inputLocations.clear();
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/b0/p=3");
+ inputLocations.add("/warehouse/customLocation/ext/b1");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=7");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=8");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=9");
+ output = task.runTest(inputLocations, null);
+ Assert.assertEquals(2, output.size());
+ expectedOutput1 = "/warehouse/customLocation/ext/b0";
+ Assert.assertTrue(output.containsKey(expectedOutput1));
+ coveredLocs = output.get(expectedOutput1);
+ Assert.assertEquals(4, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=0"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=1"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=2"));
+ expectedOutput2 = "/warehouse/customLocation/ext/b1";
+ Assert.assertTrue(output.containsKey(expectedOutput2));
+ coveredLocs = output.get(expectedOutput2);
+ Assert.assertEquals(4, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=7"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=8"));
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=9"));
+ }
+
+ @Test
+ public void testGroupLocationsDummyDataSizes() {
+ Set<String> inputLocations = new TreeSet<>();
+ Configuration conf = MetastoreConf.newMetastoreConf();
+ MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST, true);
+ MetaToolTaskListExtTblLocs.msConf = conf;
+ MetaToolTaskListExtTblLocs task = new MetaToolTaskListExtTblLocs();
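+ // dataSizes simulates extra (non-table) data found under a directory; a directory holding such
+ // extra data is not reported as a single grouped location for the tables beneath it.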
+
+ //Case 1: Multiple unpartitioned external tables, expected o/p without extra data: 1 location (tested in testGroupLocations#1)
+ // But if there is some extra data at ../customLocation, then all 3 paths are listed individually
+ inputLocations.add("/warehouse/customLocation/t1");
+ inputLocations.add("/warehouse/customLocation/t2");
+ inputLocations.add("/warehouse/customLocation/t3");
+ Map<String, Long> dataSizes = new HashMap<>();
+ dataSizes.put("/warehouse/customLocation", Long.valueOf(100)); //Simulate 100 bytes extra data at customLocation
+ Map<String, HashSet<String>> output = task.runTest(inputLocations, dataSizes);
+ Assert.assertEquals(3, output.size());
+ String expectedOutput1 = "/warehouse/customLocation/t1";
+ Assert.assertTrue(output.containsKey(expectedOutput1));
+ HashSet<String> coveredLocs = output.get(expectedOutput1);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t1"));
+
+ //Case 2 : inputs at multiple depths
+ // inputs ../ext/b0 - contains 1 location
+ // ../ext/p=0 - contains 1 location
+ // ../ext/b1/b2/b3 - contains 3 locations (p1, p2, p3)
+ // expected output without extra data : [../ext/b1/b2/b3 covering the 3 locations, ../ext/b0, ../ext/p=0] (tested in testGroupLocations#2)
+ // expected output with extra data at ../ext/b1/b2/b3 : [p=1, p=2, p=3, ../ext/b0, ../ext/p=0]
+ inputLocations.clear();
+ dataSizes.clear();
+ inputLocations.add("/warehouse/customLocation/ext/b0");
+ inputLocations.add("/warehouse/customLocation/ext/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=3");
+ dataSizes.put("/warehouse/customLocation/ext/b1/b2/b3", Long.valueOf(100)); // simulate 100 bytes of extra data at ../b3
+ output = task.runTest(inputLocations, dataSizes);
+ Assert.assertEquals(5, output.size());
+ expectedOutput1 = "/warehouse/customLocation/ext/b0";
+ Assert.assertTrue(output.containsKey(expectedOutput1));
+ coveredLocs = output.get(expectedOutput1);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0"));
+ String expectedOutput2 = "/warehouse/customLocation/ext/p=0";
+ Assert.assertTrue(output.containsKey(expectedOutput2));
+ coveredLocs = output.get(expectedOutput2);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=0"));
+ String expectedOutput3 = "/warehouse/customLocation/ext/b1/b2/b3/p=1";
+ Assert.assertTrue(output.containsKey(expectedOutput3));
+ coveredLocs = output.get(expectedOutput3);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=1"));
+ String expectedOutput4 = "/warehouse/customLocation/ext/b1/b2/b3/p=2";
+ Assert.assertTrue(output.containsKey(expectedOutput4));
+ coveredLocs = output.get(expectedOutput4);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=2"));
+ String expectedOutput5 = "/warehouse/customLocation/ext/b1/b2/b3/p=3";
+ Assert.assertTrue(output.containsKey(expectedOutput5));
+ coveredLocs = output.get(expectedOutput5);
+ Assert.assertEquals(1, coveredLocs.size());
+ Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=3"));
+
+ //Case 3 : intermediate directory has extra data
+ // inputs ../ext/ - contains 4 locations
+ // ../ext/b1 - contains 3 locations
+ // expected output without extra data : [../ext covering all locations] (tested in testGroupLocations#3)
+ // We simulate extra data at ../ext/b1, so the expected output is the list of all individual locations.
+ inputLocations.clear();
+ dataSizes.clear();
+ inputLocations.add("/warehouse/customLocation/ext/p=0");
+ inputLocations.add("/warehouse/customLocation/ext/p=1");
+ inputLocations.add("/warehouse/customLocation/ext/p=2");
+ inputLocations.add("/warehouse/customLocation/ext/p=3");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
+ inputLocations.add("/warehouse/customLocation/ext/b1/p=6");
+ dataSizes.put("/warehouse/customLocation/ext/b1", Long.valueOf(100)); // simulate 100 bytes of extra data at ..ext/b1
+ dataSizes.put("/warehouse/customLocation/ext", Long.valueOf(100));// since ext/b1 contains 100 bytes, ../ext also has 100 bytes
+ output = task.runTest(inputLocations, dataSizes);
+ Assert.assertEquals(7, output.size());
+ Assert.assertTrue(output.keySet().containsAll(inputLocations));
+ for (String outLoc : output.keySet()) {
+ Assert.assertTrue(output.get(outLoc).contains(outLoc));
+ }
+ }
+}
+