Posted to commits@hudi.apache.org by yi...@apache.org on 2022/03/25 02:05:25 UTC
[hudi] branch master updated: [HUDI-1180] Upgrade HBase to 2.4.9 (#5004)
This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new eaa4c4f [HUDI-1180] Upgrade HBase to 2.4.9 (#5004)
eaa4c4f is described below
commit eaa4c4f2e2ccdb42f62d91c001a0350de183e4dc
Author: Y Ethan Guo <et...@gmail.com>
AuthorDate: Thu Mar 24 19:04:53 2022 -0700
[HUDI-1180] Upgrade HBase to 2.4.9 (#5004)
Co-authored-by: Sagar Sumit <sa...@gmail.com>
---
.../org/apache/hudi/io/HoodieAppendHandle.java | 3 +-
.../hudi/io/storage/HoodieFileWriterFactory.java | 26 +-
.../apache/hudi/io/storage/HoodieHFileConfig.java | 10 +-
.../apache/hudi/io/storage/HoodieHFileWriter.java | 6 +-
.../io/storage/TestHoodieHFileReaderWriter.java | 217 +-
.../hudi/io/storage/TestHoodieOrcReaderWriter.java | 233 +--
.../io/storage/TestHoodieReaderWriterBase.java | 250 +++
...10_hbase_1_2_3_bootstrap_index_partitions.hfile | Bin 0 -> 19745 bytes
.../resources/hudi_0_10_hbase_1_2_3_complex.hfile | Bin 0 -> 6612 bytes
.../resources/hudi_0_10_hbase_1_2_3_simple.hfile | Bin 0 -> 6115 bytes
...11_hbase_2_4_9_bootstrap_index_partitions.hfile | Bin 0 -> 19745 bytes
.../resources/hudi_0_11_hbase_2_4_9_complex.hfile | Bin 0 -> 6612 bytes
.../resources/hudi_0_11_hbase_2_4_9_simple.hfile | Bin 0 -> 6115 bytes
..._9_hbase_1_2_3_bootstrap_index_partitions.hfile | Bin 0 -> 19745 bytes
.../resources/hudi_0_9_hbase_1_2_3_complex.hfile | Bin 0 -> 6612 bytes
.../resources/hudi_0_9_hbase_1_2_3_simple.hfile | Bin 0 -> 6115 bytes
hudi-client/hudi-spark-client/pom.xml | 6 +
.../hudi/testutils/HoodieClientTestUtils.java | 10 +-
hudi-common/pom.xml | 5 +-
.../bootstrap/index/HFileBootstrapIndex.java | 18 +-
.../table/log/AbstractHoodieLogRecordReader.java | 3 +
.../hudi/common/table/log/HoodieLogFileReader.java | 2 +-
.../table/log/block/HoodieHFileDataBlock.java | 40 +-
.../hudi/io/storage/HoodieHBaseKVComparator.java | 4 +-
.../apache/hudi/io/storage/HoodieHFileReader.java | 109 +-
.../apache/hudi/io/storage/HoodieHFileUtils.java | 87 +
hudi-common/src/main/resources/hbase-site.xml | 2185 ++++++++++++++++++++
.../inline/TestInLineFileSystemHFileInLining.java | 74 +-
.../common/functional/TestHoodieLogFormat.java | 7 +-
.../hudi/hadoop/testutils/InputFormatTestUtil.java | 9 +-
.../java/org/apache/hudi/integ/ITTestBase.java | 2 +-
packaging/hudi-flink-bundle/pom.xml | 180 +-
packaging/hudi-hadoop-mr-bundle/pom.xml | 147 +-
packaging/hudi-integ-test-bundle/pom.xml | 112 +-
packaging/hudi-kafka-connect-bundle/pom.xml | 127 +-
packaging/hudi-presto-bundle/pom.xml | 160 +-
packaging/hudi-spark-bundle/pom.xml | 154 +-
packaging/hudi-timeline-server-bundle/pom.xml | 103 +-
packaging/hudi-trino-bundle/pom.xml | 168 +-
packaging/hudi-utilities-bundle/pom.xml | 147 +-
pom.xml | 8 +-
41 files changed, 3866 insertions(+), 746 deletions(-)
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
index a58e4d6..f808101 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java
@@ -548,7 +548,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
case AVRO_DATA_BLOCK:
return new HoodieAvroDataBlock(recordList, header, keyField);
case HFILE_DATA_BLOCK:
- return new HoodieHFileDataBlock(recordList, header, writeConfig.getHFileCompressionAlgorithm());
+ return new HoodieHFileDataBlock(
+ recordList, header, writeConfig.getHFileCompressionAlgorithm(), new Path(writeConfig.getBasePath()));
case PARQUET_DATA_BLOCK:
return new HoodieParquetDataBlock(recordList, header, keyField, writeConfig.getParquetCompressionCodec());
default:
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
index 38db1cd..7d0c307 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java
@@ -30,6 +30,7 @@ import org.apache.hudi.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
@@ -53,10 +54,12 @@ public class HoodieFileWriterFactory {
return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, config.populateMetaFields());
}
if (HFILE.getFileExtension().equals(extension)) {
- return newHFileFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
+ return newHFileFileWriter(
+ instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier);
}
if (ORC.getFileExtension().equals(extension)) {
- return newOrcFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier);
+ return newOrcFileWriter(
+ instantTime, path, config, schema, hoodieTable.getHadoopConf(), taskContextSupplier);
}
throw new UnsupportedOperationException(extension + " format not supported yet.");
}
@@ -64,28 +67,29 @@ public class HoodieFileWriterFactory {
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
- return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, taskContextSupplier, populateMetaFields, populateMetaFields);
+ return newParquetFileWriter(instantTime, path, config, schema, hoodieTable.getHadoopConf(),
+ taskContextSupplier, populateMetaFields, populateMetaFields);
}
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
- String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
+ String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
- HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
+ HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter);
HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
- hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
+ conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
}
- private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(
- String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
+ static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newHFileFileWriter(
+ String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier) throws IOException {
BloomFilter filter = createBloomFilter(config);
- HoodieHFileConfig hfileConfig = new HoodieHFileConfig(hoodieTable.getHadoopConf(),
+ HoodieHFileConfig hfileConfig = new HoodieHFileConfig(conf,
config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(),
HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION,
filter, HFILE_COMPARATOR);
@@ -94,10 +98,10 @@ public class HoodieFileWriterFactory {
}
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newOrcFileWriter(
- String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
+ String instantTime, Path path, HoodieWriteConfig config, Schema schema, Configuration conf,
TaskContextSupplier taskContextSupplier) throws IOException {
BloomFilter filter = createBloomFilter(config);
- HoodieOrcConfig orcConfig = new HoodieOrcConfig(hoodieTable.getHadoopConf(), config.getOrcCompressionCodec(),
+ HoodieOrcConfig orcConfig = new HoodieOrcConfig(conf, config.getOrcCompressionCodec(),
config.getOrcStripeSize(), config.getOrcBlockSize(), config.getOrcMaxFileSize(), filter);
return new HoodieOrcWriter<>(instantTime, path, orcConfig, schema, taskContextSupplier);
}
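
Note: HoodieFileWriterFactory now threads a plain Hadoop Configuration into the HFile/ORC/Parquet writer constructors instead of a HoodieTable. A minimal sketch of the new call pattern for the (now package-visible) newHFileFileWriter, mirroring the test added later in this commit; the class name, base path, and config values below are illustrative:

package org.apache.hudi.io.storage;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Illustrative helper; it lives in the same package because newHFileFileWriter is package-visible.
public class ExampleHFileWriterFactoryUsage {
  public static HoodieFileWriter<GenericRecord> createHFileWriter(
      Path filePath, Schema schema, TaskContextSupplier contextSupplier) throws Exception {
    // Write config values mirror the new TestHoodieHFileReaderWriter setup.
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/example_base_path")
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .bloomFilterNumEntries(1000).bloomFilterFPP(0.00001).build())
        .withPopulateMetaFields(true)
        .build();
    // A plain Hadoop Configuration is enough now; no HoodieTable handle is needed.
    Configuration conf = new Configuration();
    return HoodieFileWriterFactory.newHFileFileWriter(
        "000", filePath, writeConfig, schema, conf, contextSupplier);
  }
}
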
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java
index 1079566..5ce3779 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java
@@ -21,14 +21,14 @@ package org.apache.hudi.io.storage;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.HColumnDescriptor;
-import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
public class HoodieHFileConfig {
- public static final KeyValue.KVComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator();
+ public static final CellComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator();
public static final boolean PREFETCH_ON_OPEN = CacheConfig.DEFAULT_PREFETCH_ON_OPEN;
public static final boolean CACHE_DATA_IN_L1 = HColumnDescriptor.DEFAULT_CACHE_DATA_IN_L1;
// This is private in CacheConfig so have been copied here.
@@ -42,12 +42,12 @@ public class HoodieHFileConfig {
private final boolean dropBehindCacheCompaction;
private final Configuration hadoopConf;
private final BloomFilter bloomFilter;
- private final KeyValue.KVComparator hfileComparator;
+ private final CellComparator hfileComparator;
private final String keyFieldName;
public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize,
long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1,
- boolean dropBehindCacheCompaction, BloomFilter bloomFilter, KeyValue.KVComparator hfileComparator) {
+ boolean dropBehindCacheCompaction, BloomFilter bloomFilter, CellComparator hfileComparator) {
this.hadoopConf = hadoopConf;
this.compressionAlgorithm = compressionAlgorithm;
this.blockSize = blockSize;
@@ -96,7 +96,7 @@ public class HoodieHFileConfig {
return bloomFilter;
}
- public KeyValue.KVComparator getHfileComparator() {
+ public CellComparator getHFileComparator() {
return hfileComparator;
}
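
Note: HBase 2.x removes KeyValue.KVComparator, so the config now carries a CellComparator. A hypothetical sketch of what such a comparator looks like on HBase 2.4.x, assuming it extends CellComparatorImpl (the actual HoodieHBaseKVComparator change appears in a later hunk of this diff):

import org.apache.hadoop.hbase.CellComparatorImpl;

import java.io.Serializable;

// Hypothetical sketch: under HBase 2.x a custom HFile comparator implements the
// CellComparator interface, typically by extending CellComparatorImpl, and is
// exposed through a constant such as HoodieHFileConfig.HFILE_COMPARATOR above.
public class ExampleHBaseKVComparator extends CellComparatorImpl implements Serializable {
}
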
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java
index 2ad6d7f..5dcd2e0 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java
@@ -25,6 +25,8 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
@@ -38,8 +40,6 @@ import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.io.Writable;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.StringUtils;
import java.io.DataInput;
import java.io.DataOutput;
@@ -95,6 +95,7 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
HFileContext context = new HFileContextBuilder().withBlockSize(hfileConfig.getBlockSize())
.withCompression(hfileConfig.getCompressionAlgorithm())
+ .withCellComparator(hfileConfig.getHFileComparator())
.build();
conf.set(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, String.valueOf(hfileConfig.shouldPrefetchBlocksOnOpen()));
@@ -104,7 +105,6 @@ public class HoodieHFileWriter<T extends HoodieRecordPayload, R extends IndexedR
this.writer = HFile.getWriterFactory(conf, cacheConfig)
.withPath(this.fs, this.file)
.withFileContext(context)
- .withComparator(hfileConfig.getHfileComparator())
.create();
writer.appendFileInfo(HoodieHFileReader.KEY_SCHEMA.getBytes(), schema.toString().getBytes());
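
Note: with HBase 2.x the comparator is attached to the HFileContext via withCellComparator, and the writer factory's withComparator call goes away. A minimal sketch of the resulting writer wiring, reusing the public HFILE_COMPARATOR constant; block size, codec, and the class/method names are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hudi.io.storage.HoodieHFileConfig;

import java.io.IOException;

public class ExampleHFileWriterWiring {
  public static HFile.Writer openWriter(Configuration conf, Path file) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    // HBase 2.x: the comparator rides on the HFileContext; the writer factory
    // no longer exposes withComparator.
    HFileContext context = new HFileContextBuilder()
        .withBlockSize(1024 * 1024)                 // illustrative block size
        .withCompression(Compression.Algorithm.GZ)  // illustrative codec
        .withCellComparator(HoodieHFileConfig.HFILE_COMPARATOR)
        .build();
    return HFile.getWriterFactory(conf, new CacheConfig(conf))
        .withPath(fs, file)
        .withFileContext(context)
        .create();
  }
}
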
diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
index fd25d92..e1f9794 100644
--- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
+++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java
@@ -18,71 +18,113 @@
package org.apache.hudi.io.storage;
-import org.apache.hudi.common.bloom.BloomFilter;
-import org.apache.hudi.common.bloom.BloomFilterFactory;
-import org.apache.hudi.common.bloom.BloomFilterTypeCode;
+import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
import org.apache.hudi.common.engine.TaskContextSupplier;
+import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.util.FileIOUtils;
+import org.apache.hudi.config.HoodieIndexConfig;
+import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.util.Pair;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
import org.mockito.Mockito;
-import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
import java.util.stream.Stream;
import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
-import static org.apache.hudi.io.storage.HoodieHFileConfig.CACHE_DATA_IN_L1;
-import static org.apache.hudi.io.storage.HoodieHFileConfig.DROP_BEHIND_CACHE_COMPACTION;
import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR;
-import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN;
+import static org.apache.hudi.io.storage.HoodieHFileReader.KEY_SCHEMA;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.mockito.Mockito.when;
-public class TestHoodieHFileReaderWriter {
- @TempDir File tempDir;
- private Path filePath;
+public class TestHoodieHFileReaderWriter extends TestHoodieReaderWriterBase {
+ private static final String DUMMY_BASE_PATH = "dummy_base_path";
+ // Number of records in HFile fixtures for compatibility tests
+ private static final int NUM_RECORDS_FIXTURE = 50;
+ private static final String SIMPLE_SCHEMA_HFILE_SUFFIX = "_simple.hfile";
+ private static final String COMPLEX_SCHEMA_HFILE_SUFFIX = "_complex.hfile";
+ private static final String BOOTSTRAP_INDEX_HFILE_SUFFIX = "_bootstrap_index_partitions.hfile";
- @BeforeEach
- public void setup() throws IOException {
- filePath = new Path(tempDir.toString() + "tempFile.txt");
+ @Override
+ protected Path getFilePath() {
+ return new Path(tempDir.toString() + "/f1_1-0-1_000.hfile");
}
- @AfterEach
- public void clearTempFile() {
- File file = new File(filePath.toString());
- if (file.exists()) {
- file.delete();
- }
+ @Override
+ protected HoodieFileWriter<GenericRecord> createWriter(
+ Schema avroSchema, boolean populateMetaFields) throws Exception {
+ String instantTime = "000";
+ HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
+ .withPath(DUMMY_BASE_PATH)
+ .withIndexConfig(HoodieIndexConfig.newBuilder()
+ .bloomFilterNumEntries(1000).bloomFilterFPP(0.00001).build())
+ .withPopulateMetaFields(populateMetaFields)
+ .build();
+ Configuration conf = new Configuration();
+ TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
+ Supplier<Integer> partitionSupplier = Mockito.mock(Supplier.class);
+ when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier);
+ when(partitionSupplier.get()).thenReturn(10);
+
+ return HoodieFileWriterFactory.newHFileFileWriter(
+ instantTime, getFilePath(), writeConfig, avroSchema, conf, mockTaskContextSupplier);
+ }
+
+ @Override
+ protected HoodieFileReader<GenericRecord> createReader(
+ Configuration conf) throws Exception {
+ CacheConfig cacheConfig = new CacheConfig(conf);
+ return new HoodieHFileReader<>(conf, getFilePath(), cacheConfig, getFilePath().getFileSystem(conf));
+ }
+
+ @Override
+ protected void verifyMetadata(Configuration conf) throws IOException {
+ FileSystem fs = getFilePath().getFileSystem(conf);
+ HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf);
+ assertEquals(HFILE_COMPARATOR.getClass(), hfileReader.getComparator().getClass());
+ assertEquals(NUM_RECORDS, hfileReader.getEntries());
+ }
+
+ @Override
+ protected void verifySchema(Configuration conf, String schemaPath) throws IOException {
+ FileSystem fs = getFilePath().getFileSystem(conf);
+ HFile.Reader hfileReader = HoodieHFileUtils.createHFileReader(fs, getFilePath(), new CacheConfig(conf), conf);
+ assertEquals(getSchemaFromResource(TestHoodieHFileReaderWriter.class, schemaPath),
+ new Schema.Parser().parse(new String(hfileReader.getHFileInfo().get(KEY_SCHEMA.getBytes()))));
}
private static Stream<Arguments> populateMetaFieldsAndTestAvroWithMeta() {
@@ -94,25 +136,11 @@ public class TestHoodieHFileReaderWriter {
}).map(Arguments::of);
}
- private HoodieHFileWriter createHFileWriter(Schema avroSchema, boolean populateMetaFields) throws Exception {
- BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
- Configuration conf = new Configuration();
- TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
- Supplier<Integer> partitionSupplier = Mockito.mock(Supplier.class);
- when(mockTaskContextSupplier.getPartitionIdSupplier()).thenReturn(partitionSupplier);
- when(partitionSupplier.get()).thenReturn(10);
- String instantTime = "000";
-
- HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024,
- HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR);
- return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier, populateMetaFields);
- }
-
@ParameterizedTest
@MethodSource("populateMetaFieldsAndTestAvroWithMeta")
- public void testWriteReadHFile(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
+ public void testWriteReadHFileWithMetaFields(boolean populateMetaFields, boolean testAvroWithMeta) throws Exception {
Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithMetaFields.avsc");
- HoodieHFileWriter writer = createHFileWriter(avroSchema, populateMetaFields);
+ HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, populateMetaFields);
List<String> keys = new ArrayList<>();
Map<String, GenericRecord> recordMap = new HashMap<>();
for (int i = 0; i < 100; i++) {
@@ -134,8 +162,7 @@ public class TestHoodieHFileReaderWriter {
writer.close();
Configuration conf = new Configuration();
- CacheConfig cacheConfig = new CacheConfig(conf);
- HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
+ HoodieHFileReader hoodieHFileReader = (HoodieHFileReader) createReader(conf);
List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
records.forEach(entry -> assertEquals(entry.getSecond(), recordMap.get(entry.getFirst())));
hoodieHFileReader.close();
@@ -145,7 +172,7 @@ public class TestHoodieHFileReaderWriter {
Set<String> rowsToFetch = getRandomKeys(randomRowstoFetch, keys);
List<String> rowsList = new ArrayList<>(rowsToFetch);
Collections.sort(rowsList);
- hoodieHFileReader = new HoodieHFileReader(conf, filePath, cacheConfig, filePath.getFileSystem(conf));
+ hoodieHFileReader = (HoodieHFileReader) createReader(conf);
List<Pair<String, GenericRecord>> result = hoodieHFileReader.readRecords(rowsList);
assertEquals(result.size(), randomRowstoFetch);
result.forEach(entry -> {
@@ -160,6 +187,90 @@ public class TestHoodieHFileReaderWriter {
}
}
+ @Override
+ @Test
+ public void testWriteReadWithEvolvedSchema() throws Exception {
+ // Disable the test with evolved schema for HFile since it's not supported
+ // TODO(HUDI-3683): fix the schema evolution for HFile
+ }
+
+ @Test
+ public void testReadHFileFormatRecords() throws Exception {
+ writeFileWithSimpleSchema();
+ FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration());
+ byte[] content = FileIOUtils.readAsByteArray(
+ fs.open(getFilePath()), (int) fs.getFileStatus(getFilePath()).getLen());
+ // Reading byte array in HFile format, without actual file path
+ HoodieHFileReader<GenericRecord> hfileReader =
+ new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
+ assertEquals(NUM_RECORDS, hfileReader.getTotalRecords());
+ verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
+ }
+
+ @Test
+ public void testReaderGetRecordIterator() throws Exception {
+ writeFileWithSimpleSchema();
+ HoodieHFileReader<GenericRecord> hfileReader =
+ (HoodieHFileReader<GenericRecord>) createReader(new Configuration());
+ List<String> keys =
+ IntStream.concat(IntStream.range(40, NUM_RECORDS * 2), IntStream.range(10, 20))
+ .mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toList());
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
+ Iterator<GenericRecord> iterator = hfileReader.getRecordIterator(keys, avroSchema);
+
+ List<Integer> expectedIds =
+ IntStream.concat(IntStream.range(40, NUM_RECORDS), IntStream.range(10, 20))
+ .boxed().collect(Collectors.toList());
+ int index = 0;
+ while (iterator.hasNext()) {
+ GenericRecord record = iterator.next();
+ String key = "key" + String.format("%02d", expectedIds.get(index));
+ assertEquals(key, record.get("_row_key").toString());
+ assertEquals(Integer.toString(expectedIds.get(index)), record.get("time").toString());
+ assertEquals(expectedIds.get(index), record.get("number"));
+ index++;
+ }
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {
+ "/hudi_0_9_hbase_1_2_3", "/hudi_0_10_hbase_1_2_3", "/hudi_0_11_hbase_2_4_9"})
+ public void testHoodieHFileCompatibility(String hfilePrefix) throws IOException {
+ // This fixture is generated from TestHoodieReaderWriterBase#testWriteReadPrimitiveRecord()
+ // using different Hudi releases
+ String simpleHFile = hfilePrefix + SIMPLE_SCHEMA_HFILE_SUFFIX;
+ // This fixture is generated from TestHoodieReaderWriterBase#testWriteReadComplexRecord()
+ // using different Hudi releases
+ String complexHFile = hfilePrefix + COMPLEX_SCHEMA_HFILE_SUFFIX;
+ // This fixture is generated from TestBootstrapIndex#testBootstrapIndex()
+ // using different Hudi releases. The file is copied from .hoodie/.aux/.bootstrap/.partitions/
+ String bootstrapIndexFile = hfilePrefix + BOOTSTRAP_INDEX_HFILE_SUFFIX;
+
+ FileSystem fs = FSUtils.getFs(getFilePath().toString(), new Configuration());
+ byte[] content = readHFileFromResources(simpleHFile);
+ verifyHFileReader(
+ HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
+ hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
+ HoodieHFileReader<GenericRecord> hfileReader =
+ new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
+ assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
+ verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
+
+ content = readHFileFromResources(complexHFile);
+ verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
+ hfilePrefix, true, HFILE_COMPARATOR.getClass(), NUM_RECORDS_FIXTURE);
+ hfileReader = new HoodieHFileReader<>(fs, new Path(DUMMY_BASE_PATH), content);
+ avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchemaWithUDT.avsc");
+ assertEquals(NUM_RECORDS_FIXTURE, hfileReader.getTotalRecords());
+ verifySimpleRecords(hfileReader.getRecordIterator(avroSchema));
+
+ content = readHFileFromResources(bootstrapIndexFile);
+ verifyHFileReader(HoodieHFileUtils.createHFileReader(fs, new Path(DUMMY_BASE_PATH), content),
+ hfilePrefix, false, HFileBootstrapIndex.HoodieKVComparator.class, 4);
+ }
+
private Set<String> getRandomKeys(int count, List<String> keys) {
Set<String> rowKeys = new HashSet<>();
int totalKeys = keys.size();
@@ -171,4 +282,26 @@ public class TestHoodieHFileReaderWriter {
}
return rowKeys;
}
+
+ private byte[] readHFileFromResources(String filename) throws IOException {
+ long size = TestHoodieHFileReaderWriter.class
+ .getResource(filename).openConnection().getContentLength();
+ return FileIOUtils.readAsByteArray(
+ TestHoodieHFileReaderWriter.class.getResourceAsStream(filename), (int) size);
+ }
+
+ private void verifyHFileReader(
+ HFile.Reader reader, String hfileName, boolean mayUseDefaultComparator,
+ Class<?> clazz, int count) {
+ // HFile version is 3
+ assertEquals(3, reader.getTrailer().getMajorVersion());
+ if (mayUseDefaultComparator && hfileName.contains("hudi_0_9")) {
+ // Pre Hudi 0.10, the default comparator is used for metadata table HFiles
+ // For bootstrap index HFiles, the custom comparator is always used
+ assertEquals(CellComparatorImpl.class, reader.getComparator().getClass());
+ } else {
+ assertEquals(clazz, reader.getComparator().getClass());
+ }
+ assertEquals(count, reader.getEntries());
+ }
}
diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java
index 68143a2..282f102 100644
--- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java
+++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieOrcReaderWriter.java
@@ -18,53 +18,40 @@
package org.apache.hudi.io.storage;
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.config.HoodieStorageConfig;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
-import java.io.File;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.io.IOException;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER;
-import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.apache.hudi.io.storage.HoodieOrcConfig.AVRO_SCHEMA_METADATA_KEY;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class TestHoodieOrcReaderWriter {
- private final Path filePath = new Path(System.getProperty("java.io.tmpdir") + "/f1_1-0-1_000.orc");
+public class TestHoodieOrcReaderWriter extends TestHoodieReaderWriterBase {
- @BeforeEach
- @AfterEach
- public void clearTempFile() {
- File file = new File(filePath.toString());
- if (file.exists()) {
- file.delete();
- }
+ @Override
+ protected Path getFilePath() {
+ return new Path(tempDir.toString() + "/f1_1-0-1_000.orc");
}
- private HoodieOrcWriter createOrcWriter(Schema avroSchema) throws Exception {
+ @Override
+ protected HoodieFileWriter<GenericRecord> createWriter(
+ Schema avroSchema, boolean populateMetaFields) throws Exception {
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.00001, -1, BloomFilterTypeCode.SIMPLE.name());
Configuration conf = new Configuration();
int orcStripSize = Integer.parseInt(HoodieStorageConfig.ORC_STRIPE_SIZE.defaultValue());
@@ -73,189 +60,41 @@ public class TestHoodieOrcReaderWriter {
HoodieOrcConfig config = new HoodieOrcConfig(conf, CompressionKind.ZLIB, orcStripSize, orcBlockSize, maxFileSize, filter);
TaskContextSupplier mockTaskContextSupplier = Mockito.mock(TaskContextSupplier.class);
String instantTime = "000";
- return new HoodieOrcWriter(instantTime, filePath, config, avroSchema, mockTaskContextSupplier);
+ return new HoodieOrcWriter<>(instantTime, getFilePath(), config, avroSchema, mockTaskContextSupplier);
}
- @Test
- public void testWriteReadMetadata() throws Exception {
- Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
- HoodieOrcWriter writer = createOrcWriter(avroSchema);
- for (int i = 0; i < 3; i++) {
- GenericRecord record = new GenericData.Record(avroSchema);
- record.put("_row_key", "key" + i);
- record.put("time", Integer.toString(i));
- record.put("number", i);
- writer.writeAvro("key" + i, record);
- }
- writer.close();
+ @Override
+ protected HoodieFileReader<GenericRecord> createReader(
+ Configuration conf) throws Exception {
+ return HoodieFileReaderFactory.getFileReader(conf, getFilePath());
+ }
- Configuration conf = new Configuration();
- Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
+ @Override
+ protected void verifyMetadata(Configuration conf) throws IOException {
+ Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf));
assertEquals(4, orcReader.getMetadataKeys().size());
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MIN_RECORD_KEY_FOOTER));
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MAX_RECORD_KEY_FOOTER));
assertTrue(orcReader.getMetadataKeys().contains(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY));
assertTrue(orcReader.getMetadataKeys().contains(AVRO_SCHEMA_METADATA_KEY));
assertEquals(CompressionKind.ZLIB.name(), orcReader.getCompressionKind().toString());
-
- HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
- BloomFilter filter = hoodieReader.readBloomFilter();
- for (int i = 0; i < 3; i++) {
- assertTrue(filter.mightContain("key" + i));
- }
- assertFalse(filter.mightContain("non-existent-key"));
- assertEquals(3, hoodieReader.getTotalRecords());
- String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
- assertEquals(2, minMaxRecordKeys.length);
- assertEquals("key0", minMaxRecordKeys[0]);
- assertEquals("key2", minMaxRecordKeys[1]);
- }
-
- @Test
- public void testWriteReadPrimitiveRecord() throws Exception {
- Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
- HoodieOrcWriter writer = createOrcWriter(avroSchema);
- for (int i = 0; i < 3; i++) {
- GenericRecord record = new GenericData.Record(avroSchema);
- record.put("_row_key", "key" + i);
- record.put("time", Integer.toString(i));
- record.put("number", i);
- writer.writeAvro("key" + i, record);
- }
- writer.close();
-
- Configuration conf = new Configuration();
- Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
- assertEquals("struct<_row_key:string,time:string,number:int>", orcReader.getSchema().toString());
- assertEquals(3, orcReader.getNumberOfRows());
-
- HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
- Iterator<GenericRecord> iter = hoodieReader.getRecordIterator();
- int index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(index, record.get("number"));
- index++;
- }
+ assertEquals(NUM_RECORDS, orcReader.getNumberOfRows());
}
- @Test
- public void testWriteReadComplexRecord() throws Exception {
- Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchemaWithUDT.avsc");
- Schema udtSchema = avroSchema.getField("driver").schema().getTypes().get(1);
- HoodieOrcWriter writer = createOrcWriter(avroSchema);
- for (int i = 0; i < 3; i++) {
- GenericRecord record = new GenericData.Record(avroSchema);
- record.put("_row_key", "key" + i);
- record.put("time", Integer.toString(i));
- record.put("number", i);
- GenericRecord innerRecord = new GenericData.Record(udtSchema);
- innerRecord.put("driver_name", "driver" + i);
- innerRecord.put("list", Collections.singletonList(i));
- innerRecord.put("map", Collections.singletonMap("key" + i, "value" + i));
- record.put("driver", innerRecord);
- writer.writeAvro("key" + i, record);
- }
- writer.close();
-
- Configuration conf = new Configuration();
- Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
- assertEquals("struct<_row_key:string,time:string,number:int,driver:struct<driver_name:string,list:array<int>,map:map<string,string>>>",
- reader.getSchema().toString());
- assertEquals(3, reader.getNumberOfRows());
-
- HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
- Iterator<GenericRecord> iter = hoodieReader.getRecordIterator();
- int index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(index, record.get("number"));
- GenericRecord innerRecord = (GenericRecord) record.get("driver");
- assertEquals("driver" + index, innerRecord.get("driver_name").toString());
- assertEquals(1, ((List<?>)innerRecord.get("list")).size());
- assertEquals(index, ((List<?>)innerRecord.get("list")).get(0));
- assertEquals("value" + index, ((Map<?,?>)innerRecord.get("map")).get("key" + index).toString());
- index++;
+ @Override
+ protected void verifySchema(Configuration conf, String schemaPath) throws IOException {
+ Reader orcReader = OrcFile.createReader(getFilePath(), OrcFile.readerOptions(conf));
+ if ("/exampleSchema.avsc".equals(schemaPath)) {
+ assertEquals("struct<_row_key:string,time:string,number:int>",
+ orcReader.getSchema().toString());
+ } else if ("/exampleSchemaWithUDT.avsc".equals(schemaPath)) {
+ assertEquals("struct<_row_key:string,time:string,number:int,driver:struct<driver_name:string,list:array<int>,map:map<string,string>>>",
+ orcReader.getSchema().toString());
}
}
- @Test
- public void testWriteReadWithEvolvedSchema() throws Exception {
- Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
- HoodieOrcWriter writer = createOrcWriter(avroSchema);
- for (int i = 0; i < 3; i++) {
- GenericRecord record = new GenericData.Record(avroSchema);
- record.put("_row_key", "key" + i);
- record.put("time", Integer.toString(i));
- record.put("number", i);
- writer.writeAvro("key" + i, record);
- }
- writer.close();
-
- Configuration conf = new Configuration();
- HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
- Schema evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchema.avsc");
- Iterator<GenericRecord> iter = hoodieReader.getRecordIterator(evolvedSchema);
- int index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(index, record.get("number"));
- assertNull(record.get("added_field"));
- index++;
- }
-
- evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaChangeOrder.avsc");
- iter = hoodieReader.getRecordIterator(evolvedSchema);
- index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(index, record.get("number"));
- assertNull(record.get("added_field"));
- index++;
- }
-
- evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaColumnRequire.avsc");
- iter = hoodieReader.getRecordIterator(evolvedSchema);
- index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(index, record.get("number"));
- assertNull(record.get("added_field"));
- index++;
- }
-
- evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaColumnType.avsc");
- iter = hoodieReader.getRecordIterator(evolvedSchema);
- index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertEquals(Integer.toString(index), record.get("number").toString());
- assertNull(record.get("added_field"));
- index++;
- }
-
- evolvedSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleEvolvedSchemaDeleteColumn.avsc");
- iter = hoodieReader.getRecordIterator(evolvedSchema);
- index = 0;
- while (iter.hasNext()) {
- GenericRecord record = iter.next();
- assertEquals("key" + index, record.get("_row_key").toString());
- assertEquals(Integer.toString(index), record.get("time").toString());
- assertNull(record.get("number"));
- assertNull(record.get("added_field"));
- index++;
- }
+ @Override
+ public void testReaderFilterRowKeys() {
+ // TODO(HUDI-3682): fix filterRowKeys test for ORC due to a bug in ORC logic
}
}
diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java
new file mode 100644
index 0000000..19f9b93
--- /dev/null
+++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieReaderWriterBase.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hudi.common.bloom.BloomFilter;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Abstract class for unit tests of {@link HoodieFileReader} and {@link HoodieFileWriter}
+ * for different file formats.
+ */
+public abstract class TestHoodieReaderWriterBase {
+ protected static final int NUM_RECORDS = 50;
+ @TempDir
+ protected File tempDir;
+
+ protected abstract Path getFilePath();
+
+ protected abstract HoodieFileWriter<GenericRecord> createWriter(
+ Schema avroSchema, boolean populateMetaFields) throws Exception;
+
+ protected abstract HoodieFileReader<GenericRecord> createReader(
+ Configuration conf) throws Exception;
+
+ protected abstract void verifyMetadata(Configuration conf) throws IOException;
+
+ protected abstract void verifySchema(Configuration conf, String schemaPath) throws IOException;
+
+ @BeforeEach
+ @AfterEach
+ public void clearTempFile() {
+ File file = new File(getFilePath().toString());
+ if (file.exists()) {
+ file.delete();
+ }
+ }
+
+ @Test
+ public void testWriteReadMetadata() throws Exception {
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
+ writeFileWithSimpleSchema();
+
+ Configuration conf = new Configuration();
+ verifyMetadata(conf);
+
+ HoodieFileReader<GenericRecord> hoodieReader = createReader(conf);
+ BloomFilter filter = hoodieReader.readBloomFilter();
+ for (int i = 0; i < NUM_RECORDS; i++) {
+ String key = "key" + String.format("%02d", i);
+ assertTrue(filter.mightContain(key));
+ }
+ assertFalse(filter.mightContain("non-existent-key"));
+ assertEquals(avroSchema, hoodieReader.getSchema());
+ assertEquals(NUM_RECORDS, hoodieReader.getTotalRecords());
+ String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
+ assertEquals(2, minMaxRecordKeys.length);
+ assertEquals("key00", minMaxRecordKeys[0]);
+ assertEquals("key" + (NUM_RECORDS - 1), minMaxRecordKeys[1]);
+ }
+
+ @Test
+ public void testWriteReadPrimitiveRecord() throws Exception {
+ String schemaPath = "/exampleSchema.avsc";
+ writeFileWithSimpleSchema();
+
+ Configuration conf = new Configuration();
+ verifyMetadata(conf);
+ verifySchema(conf, schemaPath);
+ verifySimpleRecords(createReader(conf).getRecordIterator());
+ }
+
+ @Test
+ public void testWriteReadComplexRecord() throws Exception {
+ String schemaPath = "/exampleSchemaWithUDT.avsc";
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath);
+ Schema udtSchema = avroSchema.getField("driver").schema().getTypes().get(1);
+ HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, true);
+ for (int i = 0; i < NUM_RECORDS; i++) {
+ GenericRecord record = new GenericData.Record(avroSchema);
+ String key = "key" + String.format("%02d", i);
+ record.put("_row_key", key);
+ record.put("time", Integer.toString(i));
+ record.put("number", i);
+ GenericRecord innerRecord = new GenericData.Record(udtSchema);
+ innerRecord.put("driver_name", "driver" + i);
+ innerRecord.put("list", Collections.singletonList(i));
+ innerRecord.put("map", Collections.singletonMap(key, "value" + i));
+ record.put("driver", innerRecord);
+ writer.writeAvro(key, record);
+ }
+ writer.close();
+
+ Configuration conf = new Configuration();
+ verifyMetadata(conf);
+ verifySchema(conf, schemaPath);
+ verifyComplexRecords(createReader(conf).getRecordIterator());
+ }
+
+ @Test
+ public void testWriteReadWithEvolvedSchema() throws Exception {
+ writeFileWithSimpleSchema();
+
+ Configuration conf = new Configuration();
+ HoodieFileReader<GenericRecord> hoodieReader = createReader(conf);
+ String[] schemaList = new String[] {
+ "/exampleEvolvedSchema.avsc", "/exampleEvolvedSchemaChangeOrder.avsc",
+ "/exampleEvolvedSchemaColumnRequire.avsc", "/exampleEvolvedSchemaColumnType.avsc",
+ "/exampleEvolvedSchemaDeleteColumn.avsc"};
+
+ for (String evolvedSchemaPath : schemaList) {
+ verifyReaderWithSchema(evolvedSchemaPath, hoodieReader);
+ }
+ }
+
+ @Test
+ public void testReaderFilterRowKeys() throws Exception {
+ writeFileWithSimpleSchema();
+ Configuration conf = new Configuration();
+ verifyMetadata(conf);
+ verifyFilterRowKeys(createReader(conf));
+ }
+
+ protected void writeFileWithSimpleSchema() throws Exception {
+ Schema avroSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, "/exampleSchema.avsc");
+ HoodieFileWriter<GenericRecord> writer = createWriter(avroSchema, true);
+ for (int i = 0; i < NUM_RECORDS; i++) {
+ GenericRecord record = new GenericData.Record(avroSchema);
+ String key = "key" + String.format("%02d", i);
+ record.put("_row_key", key);
+ record.put("time", Integer.toString(i));
+ record.put("number", i);
+ writer.writeAvro(key, record);
+ }
+ writer.close();
+ }
+
+ protected void verifySimpleRecords(Iterator<GenericRecord> iterator) {
+ int index = 0;
+ while (iterator.hasNext()) {
+ GenericRecord record = iterator.next();
+ String key = "key" + String.format("%02d", index);
+ assertEquals(key, record.get("_row_key").toString());
+ assertEquals(Integer.toString(index), record.get("time").toString());
+ assertEquals(index, record.get("number"));
+ index++;
+ }
+ }
+
+ protected void verifyComplexRecords(Iterator<GenericRecord> iterator) {
+ int index = 0;
+ while (iterator.hasNext()) {
+ GenericRecord record = iterator.next();
+ String key = "key" + String.format("%02d", index);
+ assertEquals(key, record.get("_row_key").toString());
+ assertEquals(Integer.toString(index), record.get("time").toString());
+ assertEquals(index, record.get("number"));
+ GenericRecord innerRecord = (GenericRecord) record.get("driver");
+ assertEquals("driver" + index, innerRecord.get("driver_name").toString());
+ assertEquals(1, ((List<?>) innerRecord.get("list")).size());
+ assertEquals(index, ((List<?>) innerRecord.get("list")).get(0));
+ Map<?, ?> mapping = (Map<?, ?>) innerRecord.get("map");
+ boolean match = false;
+ for (Object innerKey : mapping.keySet()) {
+ // The innerKey may not be of type String, so we have to
+ // use the following logic for validation
+ if (innerKey.toString().equals(key)) {
+ assertEquals("value" + index, mapping.get(innerKey).toString());
+ match = true;
+ }
+ }
+ assertTrue(match);
+ index++;
+ }
+ }
+
+ private void verifyFilterRowKeys(HoodieFileReader<GenericRecord> hoodieReader) {
+ Set<String> candidateRowKeys = IntStream.range(40, NUM_RECORDS * 2)
+ .mapToObj(i -> "key" + String.format("%02d", i)).collect(Collectors.toSet());
+ List<String> expectedKeys = IntStream.range(40, NUM_RECORDS)
+ .mapToObj(i -> "key" + String.format("%02d", i)).sorted().collect(Collectors.toList());
+ assertEquals(expectedKeys, hoodieReader.filterRowKeys(candidateRowKeys)
+ .stream().sorted().collect(Collectors.toList()));
+ }
+
+ private void verifyReaderWithSchema(String schemaPath, HoodieFileReader<GenericRecord> hoodieReader) throws IOException {
+ Schema evolvedSchema = getSchemaFromResource(TestHoodieReaderWriterBase.class, schemaPath);
+ Iterator<GenericRecord> iter = hoodieReader.getRecordIterator(evolvedSchema);
+ int index = 0;
+ while (iter.hasNext()) {
+ verifyRecord(schemaPath, iter.next(), index);
+ index++;
+ }
+ }
+
+ private void verifyRecord(String schemaPath, GenericRecord record, int index) {
+ String numStr = String.format("%02d", index);
+ assertEquals("key" + numStr, record.get("_row_key").toString());
+ assertEquals(Integer.toString(index), record.get("time").toString());
+ if ("/exampleEvolvedSchemaColumnType.avsc".equals(schemaPath)) {
+ assertEquals(Integer.toString(index), record.get("number").toString());
+ } else if ("/exampleEvolvedSchemaDeleteColumn.avsc".equals(schemaPath)) {
+ assertNull(record.get("number"));
+ } else {
+ assertEquals(index, record.get("number"));
+ }
+ assertNull(record.get("added_field"));
+ }
+}
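
Note: the new base class concentrates the record generation and verification logic, so a reader/writer test for another file format only implements five hooks. A hypothetical skeleton (format name and path are illustrative):

package org.apache.hudi.io.storage;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

// Hypothetical skeleton: the five hooks a reader/writer test for another file format would implement.
public class TestHoodieFooReaderWriter extends TestHoodieReaderWriterBase {
  @Override
  protected Path getFilePath() {
    return new Path(tempDir.toString() + "/f1_1-0-1_000.foo");
  }

  @Override
  protected HoodieFileWriter<GenericRecord> createWriter(
      Schema avroSchema, boolean populateMetaFields) throws Exception {
    throw new UnsupportedOperationException("wire up the format-specific writer here");
  }

  @Override
  protected HoodieFileReader<GenericRecord> createReader(Configuration conf) throws Exception {
    throw new UnsupportedOperationException("wire up the format-specific reader here");
  }

  @Override
  protected void verifyMetadata(Configuration conf) throws IOException {
    // Assert format-specific metadata here, e.g. that the footer reports NUM_RECORDS entries.
  }

  @Override
  protected void verifySchema(Configuration conf, String schemaPath) throws IOException {
    // Assert that the schema stored in the file matches the resource at schemaPath.
  }
}
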
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile
new file mode 100644
index 0000000..91e9c76
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_bootstrap_index_partitions.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile
new file mode 100644
index 0000000..8ce3d0d
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_complex.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile
new file mode 100644
index 0000000..abe0b33
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_10_hbase_1_2_3_simple.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile
new file mode 100644
index 0000000..7f6c5bd
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_bootstrap_index_partitions.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile
new file mode 100644
index 0000000..f5293c5
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_complex.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile
new file mode 100644
index 0000000..2b57092
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_11_hbase_2_4_9_simple.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile
new file mode 100644
index 0000000..290af99
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_bootstrap_index_partitions.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile
new file mode 100644
index 0000000..5a16f0e
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_complex.hfile differ
diff --git a/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile
new file mode 100644
index 0000000..e52d3c5
Binary files /dev/null and b/hudi-client/hudi-client-common/src/test/resources/hudi_0_9_hbase_1_2_3_simple.hfile differ
diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml
index d6c60cb..0688fed 100644
--- a/hudi-client/hudi-spark-client/pom.xml
+++ b/hudi-client/hudi-spark-client/pom.xml
@@ -110,6 +110,12 @@
</exclusion>
</exclusions>
</dependency>
+ <dependency>
+ <groupId>org.apache.zookeeper</groupId>
+ <artifactId>zookeeper</artifactId>
+ <version>${zookeeper.version}</version>
+ <scope>test</scope>
+ </dependency>
<!-- Hive - Tests -->
<dependency>
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java
index 05d7f99..c1f05f9 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java
@@ -38,6 +38,7 @@ import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.io.storage.HoodieHFileUtils;
import org.apache.hudi.timeline.service.TimelineService;
import org.apache.avro.Schema;
@@ -66,6 +67,8 @@ import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static org.apache.hudi.io.storage.HoodieHFileReader.KEY_SCHEMA;
+
/**
* Utility methods to aid testing inside the HoodieClient module.
*/
@@ -241,9 +244,10 @@ public class HoodieClientTestUtils {
Schema schema = null;
for (String path : paths) {
try {
- HFile.Reader reader = HFile.createReader(fs, new Path(path), cacheConfig, fs.getConf());
+ HFile.Reader reader =
+ HoodieHFileUtils.createHFileReader(fs, new Path(path), cacheConfig, fs.getConf());
if (schema == null) {
- schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get("schema".getBytes())));
+ schema = new Schema.Parser().parse(new String(reader.getHFileInfo().get(KEY_SCHEMA.getBytes())));
}
HFileScanner scanner = reader.getScanner(false, false);
if (!scanner.seekTo()) {
@@ -252,7 +256,7 @@ public class HoodieClientTestUtils {
}
do {
- Cell c = scanner.getKeyValue();
+ Cell c = scanner.getCell();
byte[] value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength());
valuesAsList.add(HoodieAvroUtils.bytesToAvro(value, schema));
} while (scanner.next());
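
For reference, the hunk above swaps the HBase 1.x reader calls (loadFileInfo, getKeyValue) for
their HBase 2.4.x replacements (getHFileInfo, getCell). A minimal stand-alone sketch of the same
read path, not part of this patch and using a placeholder file path, could look like the
following (it assumes the HFile was written by Hudi and therefore carries a "schema" file-info
entry):

    import org.apache.avro.Schema;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.Cell;
    import org.apache.hadoop.hbase.io.hfile.CacheConfig;
    import org.apache.hadoop.hbase.io.hfile.HFile;
    import org.apache.hadoop.hbase.io.hfile.HFileScanner;
    import org.apache.hudi.io.storage.HoodieHFileUtils;

    import java.util.Arrays;

    public class HFileReadSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/sample.hfile");  // placeholder path
        HFile.Reader reader = HoodieHFileUtils.createHFileReader(fs, path, new CacheConfig(conf), conf);
        // File-info entries are read via getHFileInfo() in HBase 2.x (loadFileInfo() is gone)
        Schema schema = new Schema.Parser().parse(
            new String(reader.getHFileInfo().get("schema".getBytes())));
        HFileScanner scanner = reader.getScanner(false, false);
        if (scanner.seekTo()) {
          do {
            Cell c = scanner.getCell();  // replaces scanner.getKeyValue()
            byte[] value = Arrays.copyOfRange(
                c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength());
            System.out.println(schema.getFullName() + ": " + value.length + " value bytes");
          } while (scanner.next());
        }
        reader.close();
      }
    }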
diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml
index 1a558ae..a7de212 100644
--- a/hudi-common/pom.xml
+++ b/hudi-common/pom.xml
@@ -221,14 +221,13 @@
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
- <scope>test</scope>
</dependency>
-
+
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
- <!-- Unfortunately, HFile is packaged ONLY under hbase-server -->
+ <!-- Unfortunately, HFile is packaged ONLY under hbase-server -->
<scope>compile</scope>
<exclusions>
<exclusion>
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java
index 3700d01..b8a2c20 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java
@@ -33,10 +33,12 @@ import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.io.storage.HoodieHFileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.CellComparatorImpl;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
@@ -178,9 +180,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
private static HFile.Reader createReader(String hFilePath, Configuration conf, FileSystem fileSystem) {
try {
LOG.info("Opening HFile for reading :" + hFilePath);
- HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath),
- new CacheConfig(conf), conf);
- return reader;
+ return HoodieHFileUtils.createHFileReader(fileSystem, new HFilePathForReader(hFilePath), new CacheConfig(conf), conf);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
@@ -259,7 +259,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
private HoodieBootstrapIndexInfo fetchBootstrapIndexInfo() throws IOException {
return TimelineMetadataUtils.deserializeAvroMetadata(
- partitionIndexReader().loadFileInfo().get(INDEX_INFO_KEY),
+ partitionIndexReader().getHFileInfo().get(INDEX_INFO_KEY),
HoodieBootstrapIndexInfo.class);
}
@@ -306,7 +306,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
try {
boolean available = scanner.seekTo();
while (available) {
- keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue()))));
+ keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getCell()))));
available = scanner.next();
}
} catch (IOException ioe) {
@@ -528,13 +528,13 @@ public class HFileBootstrapIndex extends BootstrapIndex {
@Override
public void begin() {
try {
- HFileContext meta = new HFileContextBuilder().build();
+ HFileContext meta = new HFileContextBuilder().withCellComparator(new HoodieKVComparator()).build();
this.indexByPartitionWriter = HFile.getWriterFactory(metaClient.getHadoopConf(),
new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByPartitionPath)
- .withFileContext(meta).withComparator(new HoodieKVComparator()).create();
+ .withFileContext(meta).create();
this.indexByFileIdWriter = HFile.getWriterFactory(metaClient.getHadoopConf(),
new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByFileIdPath)
- .withFileContext(meta).withComparator(new HoodieKVComparator()).create();
+ .withFileContext(meta).create();
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
@@ -581,6 +581,6 @@ public class HFileBootstrapIndex extends BootstrapIndex {
* This class is explicitly used as Key Comparator to workaround hard coded
* legacy format class names inside HBase. Otherwise we will face issues with shading.
*/
- public static class HoodieKVComparator extends KeyValue.KVComparator {
+ public static class HoodieKVComparator extends CellComparatorImpl {
}
}
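
In the hunks above, the writer factory no longer accepts withComparator(); with HBase 2.4.x the
key comparator travels on the HFileContext instead. A minimal sketch of writing a small HFile
this way, not part of this patch and using a placeholder output path, could look like:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.KeyValue;
    import org.apache.hadoop.hbase.io.hfile.CacheConfig;
    import org.apache.hadoop.hbase.io.hfile.HFile;
    import org.apache.hadoop.hbase.io.hfile.HFileContext;
    import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
    import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex.HoodieKVComparator;

    public class HFileWriteSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // The key comparator is supplied via the file context, not the writer factory
        HFileContext context = new HFileContextBuilder()
            .withCellComparator(new HoodieKVComparator())
            .build();
        HFile.Writer writer = HFile.getWriterFactory(conf, new CacheConfig(conf))
            .withPath(fs, new Path("/tmp/bootstrap-index.hfile"))  // placeholder path
            .withFileContext(context)
            .create();
        // Cells must be appended in comparator order; a single cell is trivially ordered
        writer.append(new KeyValue("partition=2022-03-24".getBytes(), new byte[0], new byte[0],
            "serialized-index-entry".getBytes()));
        writer.close();
      }
    }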
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
index fa5117e..6a0b10f 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java
@@ -424,6 +424,9 @@ public abstract class AbstractHoodieLogRecordReader {
processDataBlock((HoodieAvroDataBlock) lastBlock, keys);
break;
case HFILE_DATA_BLOCK:
+ if (!keys.isPresent()) {
+ keys = Option.of(Collections.emptyList());
+ }
processDataBlock((HoodieHFileDataBlock) lastBlock, keys);
break;
case PARQUET_DATA_BLOCK:
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java
index 07cb36b..347187f 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java
@@ -208,7 +208,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
- Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
+ Option.ofNullable(readerSchema), header, footer, enableRecordLookups, logFile.getPath());
case PARQUET_DATA_BLOCK:
checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java
index 557a0db..cdff7ae 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java
@@ -18,6 +18,18 @@
package org.apache.hudi.common.table.log.block;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.fs.inline.InLineFSUtils;
+import org.apache.hudi.common.fs.inline.InLineFileSystem;
+import org.apache.hudi.common.util.ClosableIterator;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
+import org.apache.hudi.io.storage.HoodieHFileReader;
+
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
@@ -30,17 +42,6 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
-import org.apache.hudi.avro.HoodieAvroUtils;
-import org.apache.hudi.common.fs.inline.InLineFSUtils;
-import org.apache.hudi.common.fs.inline.InLineFileSystem;
-import org.apache.hudi.common.util.ClosableIterator;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.StringUtils;
-import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
-import org.apache.hudi.io.storage.HoodieHFileReader;
-
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -65,6 +66,9 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
private static final int DEFAULT_BLOCK_SIZE = 1024 * 1024;
private final Option<Compression.Algorithm> compressionAlgorithm;
+ // This path is used for constructing HFile reader context, which should not be
+ // interpreted as the actual file path for the HFile data blocks
+ private final Path pathForReader;
public HoodieHFileDataBlock(FSDataInputStream inputStream,
Option<byte[]> content,
@@ -73,16 +77,20 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
Option<Schema> readerSchema,
Map<HeaderMetadataType, String> header,
Map<HeaderMetadataType, String> footer,
- boolean enablePointLookups) {
+ boolean enablePointLookups,
+ Path pathForReader) {
super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, HoodieHFileReader.KEY_FIELD_NAME, enablePointLookups);
this.compressionAlgorithm = Option.empty();
+ this.pathForReader = pathForReader;
}
public HoodieHFileDataBlock(List<IndexedRecord> records,
Map<HeaderMetadataType, String> header,
- Compression.Algorithm compressionAlgorithm) {
+ Compression.Algorithm compressionAlgorithm,
+ Path pathForReader) {
super(records, header, new HashMap<>(), HoodieHFileReader.KEY_FIELD_NAME);
this.compressionAlgorithm = Option.of(compressionAlgorithm);
+ this.pathForReader = pathForReader;
}
@Override
@@ -95,6 +103,7 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
HFileContext context = new HFileContextBuilder()
.withBlockSize(DEFAULT_BLOCK_SIZE)
.withCompression(compressionAlgorithm.get())
+ .withCellComparator(new HoodieHBaseKVComparator())
.build();
Configuration conf = new Configuration();
@@ -128,7 +137,7 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
}
HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig)
- .withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create();
+ .withOutputStream(ostream).withFileContext(context).create();
// Write the records
sortedRecordsMap.forEach((recordKey, recordBytes) -> {
@@ -155,7 +164,8 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
// Read the content
- HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(content);
+ HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(
+ FSUtils.getFs(pathForReader.toString(), new Configuration()), pathForReader, content);
// Sets up the writer schema
reader.withSchema(writerSchema);
Iterator<IndexedRecord> recordIterator = reader.getRecordIterator(readerSchema);
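
Since the byte[]-based HoodieHFileReader constructor now also needs a file system and a path for
the HBase reader context, a minimal sketch of deserializing in-memory HFile content the same way
this block does, not part of this patch and with an assumed placeholder path, could look like:

    import org.apache.avro.Schema;
    import org.apache.avro.generic.IndexedRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.fs.FSUtils;
    import org.apache.hudi.io.storage.HoodieHFileReader;

    import java.util.Iterator;

    public class HFileBlockReadSketch {
      // content: serialized HFile bytes; schema: the writer schema taken from the block header
      public static Iterator<IndexedRecord> readBlock(byte[] content, Schema schema) throws Exception {
        Path pathForReader = new Path("/tmp/some-log-file");  // placeholder; only seeds the reader context
        HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(
            FSUtils.getFs(pathForReader.toString(), new Configuration()), pathForReader, content);
        reader.withSchema(schema);
        return reader.getRecordIterator(schema);
      }
    }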
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
index 2d4d969..aaf1dcd 100644
--- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
+++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java
@@ -19,11 +19,11 @@
package org.apache.hudi.io.storage;
-import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.CellComparatorImpl;
/**
* This class is explicitly used as Key Comparator to work around the hard coded
* legacy format class names inside HBase. Otherwise, we will face issues with shading.
*/
-public class HoodieHBaseKVComparator extends KeyValue.KVComparator {
+public class HoodieHBaseKVComparator extends CellComparatorImpl {
}
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
index 371da76..9044034 100644
--- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
+++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java
@@ -18,18 +18,16 @@
package org.apache.hudi.io.storage;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.stream.Collectors;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.bloom.BloomFilter;
+import org.apache.hudi.common.bloom.BloomFilterFactory;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.ClosableIterator;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
@@ -41,26 +39,37 @@ import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
-import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.io.hfile.HFileInfo;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
+import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.util.Pair;
-import org.apache.hudi.avro.HoodieAvroUtils;
-import org.apache.hudi.common.bloom.BloomFilter;
-import org.apache.hudi.common.bloom.BloomFilterFactory;
-import org.apache.hudi.common.fs.FSUtils;
-import org.apache.hudi.common.util.ClosableIterator;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ValidationUtils;
-import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
-import org.apache.hudi.exception.HoodieException;
-import org.apache.hudi.exception.HoodieIOException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileReader<R> {
+ public static final String KEY_FIELD_NAME = "key";
+ public static final String KEY_SCHEMA = "schema";
+ public static final String KEY_BLOOM_FILTER_META_BLOCK = "bloomFilter";
+ public static final String KEY_BLOOM_FILTER_TYPE_CODE = "bloomFilterTypeCode";
+ public static final String KEY_MIN_RECORD = "minRecordKey";
+ public static final String KEY_MAX_RECORD = "maxRecordKey";
+
private static final Logger LOG = LogManager.getLogger(HoodieHFileReader.class);
+
private Path path;
private Configuration conf;
private HFile.Reader reader;
@@ -70,55 +79,35 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
// key retrieval.
private HFileScanner keyScanner;
- public static final String KEY_FIELD_NAME = "key";
- public static final String KEY_SCHEMA = "schema";
- public static final String KEY_BLOOM_FILTER_META_BLOCK = "bloomFilter";
- public static final String KEY_BLOOM_FILTER_TYPE_CODE = "bloomFilterTypeCode";
- public static final String KEY_MIN_RECORD = "minRecordKey";
- public static final String KEY_MAX_RECORD = "maxRecordKey";
-
public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig) throws IOException {
this.conf = configuration;
this.path = path;
- this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf);
+ this.reader = HoodieHFileUtils.createHFileReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf);
}
public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem fs) throws IOException {
this.conf = configuration;
this.path = path;
this.fsDataInputStream = fs.open(path);
- this.reader = HFile.createReader(fs, path, cacheConfig, configuration);
+ this.reader = HoodieHFileUtils.createHFileReader(fs, path, cacheConfig, configuration);
}
- public HoodieHFileReader(byte[] content) throws IOException {
- Configuration conf = new Configuration();
- Path path = new Path("hoodie");
- SeekableByteArrayInputStream bis = new SeekableByteArrayInputStream(content);
- FSDataInputStream fsdis = new FSDataInputStream(bis);
- this.reader = HFile.createReader(FSUtils.getFs("hoodie", conf), path, new FSDataInputStreamWrapper(fsdis),
- content.length, new CacheConfig(conf), conf);
+ public HoodieHFileReader(FileSystem fs, Path dummyPath, byte[] content) throws IOException {
+ this.reader = HoodieHFileUtils.createHFileReader(fs, dummyPath, content);
}
@Override
public String[] readMinMaxRecordKeys() {
- try {
- Map<byte[], byte[]> fileInfo = reader.loadFileInfo();
- return new String[] { new String(fileInfo.get(KEY_MIN_RECORD.getBytes())),
- new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))};
- } catch (IOException e) {
- throw new HoodieException("Could not read min/max record key out of file information block correctly from path", e);
- }
+ HFileInfo fileInfo = reader.getHFileInfo();
+ return new String[] {new String(fileInfo.get(KEY_MIN_RECORD.getBytes())),
+ new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))};
}
@Override
public Schema getSchema() {
if (schema == null) {
- try {
- Map<byte[], byte[]> fileInfo = reader.loadFileInfo();
- schema = new Schema.Parser().parse(new String(fileInfo.get(KEY_SCHEMA.getBytes())));
- } catch (IOException e) {
- throw new HoodieException("Could not read schema of file from path", e);
- }
+ HFileInfo fileInfo = reader.getHFileInfo();
+ schema = new Schema.Parser().parse(new String(fileInfo.get(KEY_SCHEMA.getBytes())));
}
return schema;
@@ -133,10 +122,10 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
@Override
public BloomFilter readBloomFilter() {
- Map<byte[], byte[]> fileInfo;
+ HFileInfo fileInfo;
try {
- fileInfo = reader.loadFileInfo();
- ByteBuffer serializedFilter = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false);
+ fileInfo = reader.getHFileInfo();
+ ByteBuff serializedFilter = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false).getBufferWithoutHeader();
byte[] filterBytes = new byte[serializedFilter.remaining()];
serializedFilter.get(filterBytes); // read the bytes that were written
return BloomFilterFactory.fromString(new String(filterBytes),
@@ -206,7 +195,7 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
final HFileScanner scanner = reader.getScanner(false, false);
if (scanner.seekTo()) {
do {
- Cell c = scanner.getKeyValue();
+ Cell c = scanner.getCell();
final Pair<String, R> keyAndRecordPair = getRecordFromCell(c, writerSchema, readerSchema, keyFieldSchema);
recordList.add(keyAndRecordPair);
} while (scanner.next());
@@ -250,7 +239,6 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
*/
public List<Pair<String, R>> readRecords(List<String> keys, Schema schema) throws IOException {
this.schema = schema;
- reader.loadFileInfo();
List<Pair<String, R>> records = new ArrayList<>();
for (String key: keys) {
Option<R> value = getRecordByKey(key, schema);
@@ -263,7 +251,6 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
public ClosableIterator<R> getRecordIterator(List<String> keys, Schema schema) throws IOException {
this.schema = schema;
- reader.loadFileInfo();
Iterator<String> iterator = keys.iterator();
return new ClosableIterator<R>() {
private R next;
@@ -310,7 +297,7 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
// To handle when hasNext() is called multiple times for idempotency and/or the first time
if (this.next == null && !this.eof) {
if (!scanner.isSeeked() && scanner.seekTo()) {
- final Pair<String, R> keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema);
+ final Pair<String, R> keyAndRecordPair = getRecordFromCell(scanner.getCell(), getSchema(), readerSchema, keyFieldSchema);
this.next = keyAndRecordPair.getSecond();
}
}
@@ -331,7 +318,7 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
}
R retVal = this.next;
if (scanner.next()) {
- final Pair<String, R> keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema);
+ final Pair<String, R> keyAndRecordPair = getRecordFromCell(scanner.getCell(), getSchema(), readerSchema, keyFieldSchema);
this.next = keyAndRecordPair.getSecond();
} else {
this.next = null;
@@ -371,7 +358,7 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
}
if (keyScanner.seekTo(kv) == 0) {
- Cell c = keyScanner.getKeyValue();
+ Cell c = keyScanner.getCell();
// Extract the byte value before releasing the lock since we cannot hold on to the returned cell afterwards
value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength());
}
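
With these changes, the reader pulls file-info entries through getHFileInfo() and the bloom
filter meta block through a ByteBuff. A minimal usage sketch of the public HoodieHFileReader
API, not part of this patch and using a placeholder file path, could look like:

    import org.apache.avro.generic.IndexedRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.io.hfile.CacheConfig;
    import org.apache.hudi.common.bloom.BloomFilter;
    import org.apache.hudi.io.storage.HoodieHFileReader;

    public class HoodieHFileReaderSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/sample.hfile");  // placeholder path
        HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(conf, path, new CacheConfig(conf));
        String[] minMax = reader.readMinMaxRecordKeys();  // backed by getHFileInfo() after this change
        BloomFilter filter = reader.readBloomFilter();    // meta block now read via ByteBuff internally
        System.out.println("schema=" + reader.getSchema().getFullName()
            + ", minKey=" + minMax[0] + ", maxKey=" + minMax[1]
            + ", filterMightContainMinKey=" + filter.mightContain(minMax[0]));
      }
    }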
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java
new file mode 100644
index 0000000..3767ea1
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileUtils.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
+import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.io.hfile.HFileInfo;
+import org.apache.hadoop.hbase.io.hfile.ReaderContext;
+import org.apache.hadoop.hbase.io.hfile.ReaderContextBuilder;
+
+import java.io.IOException;
+
+/**
+ * Util class for HFile reading and writing in Hudi.
+ */
+public class HoodieHFileUtils {
+ // As of HBase 2.4.9, primaryReplicaReader is mainly used for constructing the
+ // block cache key, so if we do not use the block cache it is OK to set it to any
+ // value. We use true here.
+ private static final boolean USE_PRIMARY_REPLICA_READER = true;
+
+ /**
+ * Creates HFile reader for a file with default `primaryReplicaReader` as true.
+ *
+ * @param fs File system.
+ * @param path Path to file to read.
+ * @param cacheConfig Cache configuration.
+ * @param configuration Configuration
+ * @return HFile reader
+ * @throws IOException Upon error.
+ */
+ public static HFile.Reader createHFileReader(
+ FileSystem fs, Path path, CacheConfig cacheConfig, Configuration configuration) throws IOException {
+ return HFile.createReader(fs, path, cacheConfig, USE_PRIMARY_REPLICA_READER, configuration);
+ }
+
+ /**
+ * Creates HFile reader for byte array with default `primaryReplicaReader` as true.
+ *
+ * @param fs File system.
+ * @param dummyPath Dummy path to file to read.
+ * @param content Content in byte array.
+ * @return HFile reader
+ * @throws IOException Upon error.
+ */
+ public static HFile.Reader createHFileReader(
+ FileSystem fs, Path dummyPath, byte[] content) throws IOException {
+ Configuration conf = new Configuration();
+ HoodieHFileReader.SeekableByteArrayInputStream bis = new HoodieHFileReader.SeekableByteArrayInputStream(content);
+ FSDataInputStream fsdis = new FSDataInputStream(bis);
+ FSDataInputStreamWrapper stream = new FSDataInputStreamWrapper(fsdis);
+ ReaderContext context = new ReaderContextBuilder()
+ .withFilePath(dummyPath)
+ .withInputStreamWrapper(stream)
+ .withFileSize(content.length)
+ .withFileSystem(fs)
+ .withPrimaryReplicaReader(USE_PRIMARY_REPLICA_READER)
+ .withReaderType(ReaderContext.ReaderType.STREAM)
+ .build();
+ HFileInfo fileInfo = new HFileInfo(context, conf);
+ HFile.Reader reader = HFile.createReader(context, fileInfo, new CacheConfig(conf), conf);
+ fileInfo.initMetaAndIndex(reader);
+ return reader;
+ }
+}
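
As a quick illustration of the second factory method above (not part of this patch): the dummy
path only labels the reader context and is never resolved against storage; the scheme-less
"hoodie" path below mirrors what the old byte[] constructor used.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.io.hfile.HFile;
    import org.apache.hudi.common.fs.FSUtils;
    import org.apache.hudi.io.storage.HoodieHFileUtils;

    public class HFileUtilsSketch {
      // hfileBytes: a complete serialized HFile held in memory (e.g., an inlined log block)
      public static HFile.Reader openInMemory(byte[] hfileBytes) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FSUtils.getFs("hoodie", conf);
        return HoodieHFileUtils.createHFileReader(fs, new Path("hoodie"), hfileBytes);
      }
    }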
diff --git a/hudi-common/src/main/resources/hbase-site.xml b/hudi-common/src/main/resources/hbase-site.xml
new file mode 100644
index 0000000..ad680e6
--- /dev/null
+++ b/hudi-common/src/main/resources/hbase-site.xml
@@ -0,0 +1,2185 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one
+ ~ or more contributor license agreements. See the NOTICE file
+ ~ distributed with this work for additional information
+ ~ regarding copyright ownership. The ASF licenses this file
+ ~ to you under the Apache License, Version 2.0 (the
+ ~ "License"); you may not use this file except in compliance
+ ~ with the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing,
+ ~ software distributed under the License is distributed on an
+ ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ ~ KIND, either express or implied. See the License for the
+ ~ specific language governing permissions and limitations
+ ~ under the License.
+ -->
+
+<!--
+In Hudi bundles, we include this hbase-site.xml, which carries the HBase
+default configs from the hbase-common 2.4.9 that we build against, to
+override the defaults loaded from an older HBase version's
+hbase-default.xml and to ensure the correct defaults for Hudi's HBase
+usage. In Hive, for example, the Hive server loads all of its lib jars,
+including HBase jars with their own hbase-default.xml (e.g., HBase 1.1.1),
+onto the class path, and that conflicts with the hbase-default.xml in
+Hudi bundles (HBase 2.4.9). The exception is thrown as follows:
+
+Caused by: java.lang.RuntimeException: hbase-default.xml file
+seems to be for an older version of HBase (1.1.1), this version is 2.4.9
+
+The logic that throws this exception can be found in
+HBaseConfiguration::addHbaseResources(). To get around the issue, and
+since HBase loads "hbase-site.xml" after "hbase-default.xml", we provide
+this hbase-site.xml in the bundle so that HBaseConfiguration picks it up
+and applies the right defaults.
+-->
+
+<!--
+OVERVIEW
+
+The important configs. are listed near the top. You should change
+at least the setting for hbase.tmp.dir. Other settings will change
+dependent on whether you are running hbase in standalone mode or
+distributed. See the hbase reference guide for requirements and
+guidance making configuration.
+
+This file does not contain all possible configurations. The file would be
+much larger if it carried everything. The absent configurations will only be
+found through source code reading. The idea is that such configurations are
+exotic and only those who would go to the trouble of reading a particular
+section in the code would be knowledgeable or invested enough in ever wanting
+to alter such configurations, so we do not list them here. Listing all
+possible configurations would overwhelm and obscure the important.
+-->
+
+<configuration>
+ <!--Configs you will likely change are listed here at the top of the file.
+ -->
+ <property>
+ <name>hbase.tmp.dir</name>
+ <value>${java.io.tmpdir}/hbase-${user.name}</value>
+ <description>Temporary directory on the local filesystem.
+ Change this setting to point to a location more permanent
+ than '/tmp', the usual resolve for java.io.tmpdir, as the
+ '/tmp' directory is cleared on machine restart.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rootdir</name>
+ <value>${hbase.tmp.dir}/hbase</value>
+ <description>The directory shared by region servers and into
+ which HBase persists. The URL should be 'fully-qualified'
+ to include the filesystem scheme. For example, to specify the
+ HDFS directory '/hbase' where the HDFS instance's namenode is
+ running at namenode.example.org on port 9000, set this value to:
+ hdfs://namenode.example.org:9000/hbase. By default, we write
+ to whatever ${hbase.tmp.dir} is set too -- usually /tmp --
+ so change this configuration or else all data will be lost on
+ machine restart.
+ </description>
+ </property>
+ <property>
+ <name>hbase.cluster.distributed</name>
+ <value>false</value>
+ <description>The mode the cluster will be in. Possible values are
+ false for standalone mode and true for distributed mode. If
+ false, startup will run all HBase and ZooKeeper daemons together
+ in the one JVM.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.quorum</name>
+ <!--Ideally we should be using a hostname here instead of IP address. Please refer to
+ https://issues.apache.org/jira/browse/HBASE-23764 for why we switched to IP address. Should be
+ changed once we fix the underlying ZK issue.-->
+ <value>127.0.0.1</value>
+ <description>Comma separated list of servers in the ZooKeeper ensemble
+ (This config. should have been named hbase.zookeeper.ensemble).
+ For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com".
+ By default this is set to localhost for local and pseudo-distributed modes
+ of operation. For a fully-distributed setup, this should be set to a full
+ list of ZooKeeper ensemble servers. If HBASE_MANAGES_ZK is set in hbase-env.sh
+ this is the list of servers which hbase will start/stop ZooKeeper on as
+ part of cluster start/stop. Client-side, we will take this list of
+ ensemble members and put it together with the hbase.zookeeper.property.clientPort
+ config. and pass it into zookeeper constructor as the connectString
+ parameter.
+ </description>
+ </property>
+ <!--The above are the important configurations for getting hbase up
+ and running -->
+
+ <property>
+ <name>zookeeper.recovery.retry.maxsleeptime</name>
+ <value>60000</value>
+ <description>Max sleep time before retry zookeeper operations in milliseconds,
+ a max time is needed here so that sleep time won't grow unboundedly
+ </description>
+ </property>
+ <property>
+ <name>hbase.local.dir</name>
+ <value>${hbase.tmp.dir}/local/</value>
+ <description>Directory on the local filesystem to be used
+ as a local storage.
+ </description>
+ </property>
+
+ <!--Master configurations-->
+ <property>
+ <name>hbase.master.port</name>
+ <value>16000</value>
+ <description>The port the HBase Master should bind to.</description>
+ </property>
+ <property>
+ <name>hbase.master.info.port</name>
+ <value>16010</value>
+ <description>The port for the HBase Master web UI.
+ Set to -1 if you do not want a UI instance run.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.info.bindAddress</name>
+ <value>0.0.0.0</value>
+ <description>The bind address for the HBase Master web UI
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.logcleaner.plugins</name>
+ <value>
+ org.apache.hadoop.hbase.master.cleaner.TimeToLiveLogCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveProcedureWALCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreWALCleaner
+ </value>
+ <description>A comma-separated list of BaseLogCleanerDelegate invoked by
+ the LogsCleaner service. These WAL cleaners are called in order,
+ so put the cleaner that prunes the most files in front. To
+ implement your own BaseLogCleanerDelegate, just put it in HBase's classpath
+ and add the fully qualified class name here. Always add the above
+ default log cleaners in the list.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.logcleaner.ttl</name>
+ <value>600000</value>
+ <description>How long a WAL remain in the archive ({hbase.rootdir}/oldWALs) directory,
+ after which it will be cleaned by a Master thread. The value is in milliseconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.hfilecleaner.plugins</name>
+ <value>
+ org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreHFileCleaner
+ </value>
+ <description>A comma-separated list of BaseHFileCleanerDelegate invoked by
+ the HFileCleaner service. These HFiles cleaners are called in order,
+ so put the cleaner that prunes the most files in front. To
+ implement your own BaseHFileCleanerDelegate, just put it in HBase's classpath
+ and add the fully qualified class name here. Always add the above
+ default hfile cleaners in the list as they will be overwritten in
+ hbase-site.xml.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.infoserver.redirect</name>
+ <value>true</value>
+ <description>Whether or not the Master listens to the Master web
+ UI port (hbase.master.info.port) and redirects requests to the web
+ UI server shared by the Master and RegionServer. Config. makes
+ sense when Master is serving Regions (not the default).
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.fileSplitTimeout</name>
+ <value>600000</value>
+ <description>When splitting a region, how long to wait on the file-splitting
+ step before aborting the attempt. Default: 600000. This setting used
+ to be known as hbase.regionserver.fileSplitTimeout in hbase-1.x.
+ Split is now run master-side, hence the rename. (If a
+ 'hbase.master.fileSplitTimeout' setting is found, it will be used to
+ prime the current 'hbase.master.fileSplitTimeout'
+ Configuration.)
+ </description>
+ </property>
+
+ <!--RegionServer configurations-->
+ <property>
+ <name>hbase.regionserver.port</name>
+ <value>16020</value>
+ <description>The port the HBase RegionServer binds to.</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.info.port</name>
+ <value>16030</value>
+ <description>The port for the HBase RegionServer web UI
+ Set to -1 if you do not want the RegionServer UI to run.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.info.bindAddress</name>
+ <value>0.0.0.0</value>
+ <description>The address for the HBase RegionServer web UI</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.info.port.auto</name>
+ <value>false</value>
+ <description>Whether or not the Master or RegionServer
+ UI should search for a port to bind to. Enables automatic port
+ search if hbase.regionserver.info.port is already in use.
+ Useful for testing, turned off by default.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.handler.count</name>
+ <value>30</value>
+ <description>Count of RPC Listener instances spun up on RegionServers.
+ Same property is used by the Master for count of master handlers.
+ Too many handlers can be counter-productive. Make it a multiple of
+ CPU count. If mostly read-only, handlers count close to cpu count
+ does well. Start with twice the CPU count and tune from there.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.server.callqueue.handler.factor</name>
+ <value>0.1</value>
+ <description>Factor to determine the number of call queues.
+ A value of 0 means a single queue shared between all the handlers.
+ A value of 1 means that each handler has its own queue.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.server.callqueue.read.ratio</name>
+ <value>0</value>
+ <description>Split the call queues into read and write queues.
+ The specified interval (which should be between 0.0 and 1.0)
+ will be multiplied by the number of call queues.
+ A value of 0 indicate to not split the call queues, meaning that both read and write
+ requests will be pushed to the same set of queues.
+ A value lower than 0.5 means that there will be less read queues than write queues.
+ A value of 0.5 means there will be the same number of read and write queues.
+ A value greater than 0.5 means that there will be more read queues than write queues.
+ A value of 1.0 means that all the queues except one are used to dispatch read requests.
+
+ Example: Given the total number of call queues being 10
+ a read.ratio of 0 means that: the 10 queues will contain both read/write requests.
+ a read.ratio of 0.3 means that: 3 queues will contain only read requests
+ and 7 queues will contain only write requests.
+ a read.ratio of 0.5 means that: 5 queues will contain only read requests
+ and 5 queues will contain only write requests.
+ a read.ratio of 0.8 means that: 8 queues will contain only read requests
+ and 2 queues will contain only write requests.
+ a read.ratio of 1 means that: 9 queues will contain only read requests
+ and 1 queues will contain only write requests.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.server.callqueue.scan.ratio</name>
+ <value>0</value>
+ <description>Given the number of read call queues, calculated from the total number
+ of call queues multiplied by the callqueue.read.ratio, the scan.ratio property
+ will split the read call queues into small-read and long-read queues.
+ A value lower than 0.5 means that there will be less long-read queues than short-read queues.
+ A value of 0.5 means that there will be the same number of short-read and long-read queues.
+ A value greater than 0.5 means that there will be more long-read queues than short-read queues
+ A value of 0 or 1 indicate to use the same set of queues for gets and scans.
+
+ Example: Given the total number of read call queues being 8
+ a scan.ratio of 0 or 1 means that: 8 queues will contain both long and short read requests.
+ a scan.ratio of 0.3 means that: 2 queues will contain only long-read requests
+ and 6 queues will contain only short-read requests.
+ a scan.ratio of 0.5 means that: 4 queues will contain only long-read requests
+ and 4 queues will contain only short-read requests.
+ a scan.ratio of 0.8 means that: 6 queues will contain only long-read requests
+ and 2 queues will contain only short-read requests.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.msginterval</name>
+ <value>3000</value>
+ <description>Interval between messages from the RegionServer to Master
+ in milliseconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.logroll.period</name>
+ <value>3600000</value>
+ <description>Period at which we will roll the commit log regardless
+ of how many edits it has.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.logroll.errors.tolerated</name>
+ <value>2</value>
+ <description>The number of consecutive WAL close errors we will allow
+ before triggering a server abort. A setting of 0 will cause the
+ region server to abort if closing the current WAL writer fails during
+ log rolling. Even a small value (2 or 3) will allow a region server
+ to ride over transient HDFS errors.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.hlog.reader.impl</name>
+ <value>org.apache.hadoop.hbase.regionserver.wal.ProtobufLogReader</value>
+ <description>The WAL file reader implementation.</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.hlog.writer.impl</name>
+ <value>org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter</value>
+ <description>The WAL file writer implementation.</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.global.memstore.size</name>
+ <value></value>
+ <description>Maximum size of all memstores in a region server before new
+ updates are blocked and flushes are forced. Defaults to 40% of heap (0.4).
+ Updates are blocked and flushes are forced until size of all memstores
+ in a region server hits hbase.regionserver.global.memstore.size.lower.limit.
+ The default value in this configuration has been intentionally left empty in order to
+ honor the old hbase.regionserver.global.memstore.upperLimit property if present.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.global.memstore.size.lower.limit</name>
+ <value></value>
+ <description>Maximum size of all memstores in a region server before flushes
+ are forced. Defaults to 95% of hbase.regionserver.global.memstore.size
+ (0.95). A 100% value for this value causes the minimum possible flushing
+ to occur when updates are blocked due to memstore limiting. The default
+ value in this configuration has been intentionally left empty in order to
+ honor the old hbase.regionserver.global.memstore.lowerLimit property if
+ present.
+ </description>
+ </property>
+ <property>
+ <name>hbase.systemtables.compacting.memstore.type</name>
+ <value>NONE</value>
+ <description>Determines the type of memstore to be used for system tables like
+ META, namespace tables etc. By default NONE is the type and hence we use the
+ default memstore for all the system tables. If we need to use compacting
+ memstore for system tables then set this property to BASIC/EAGER
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.optionalcacheflushinterval</name>
+ <value>3600000</value>
+ <description>
+ Maximum amount of time an edit lives in memory before being automatically flushed.
+ Default 1 hour. Set it to 0 to disable automatic flushing.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.dns.interface</name>
+ <value>default</value>
+ <description>The name of the Network Interface from which a region server
+ should report its IP address.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.dns.nameserver</name>
+ <value>default</value>
+ <description>The host name or IP address of the name server (DNS)
+ which a region server should use to determine the host name used by the
+ master for communication and display purposes.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.region.split.policy</name>
+ <value>org.apache.hadoop.hbase.regionserver.SteppingSplitPolicy</value>
+ <description>
+ A split policy determines when a region should be split. The various
+ other split policies that are available currently are BusyRegionSplitPolicy,
+ ConstantSizeRegionSplitPolicy, DisabledRegionSplitPolicy,
+ DelimitedKeyPrefixRegionSplitPolicy, KeyPrefixRegionSplitPolicy, and
+ SteppingSplitPolicy. DisabledRegionSplitPolicy blocks manual region splitting.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.regionSplitLimit</name>
+ <value>1000</value>
+ <description>
+ Limit for the number of regions after which no more region splitting
+ should take place. This is not hard limit for the number of regions
+ but acts as a guideline for the regionserver to stop splitting after
+ a certain limit. Default is set to 1000.
+ </description>
+ </property>
+
+ <!--ZooKeeper configuration-->
+ <property>
+ <name>zookeeper.session.timeout</name>
+ <value>90000</value>
+ <description>ZooKeeper session timeout in milliseconds. It is used in two different ways.
+ First, this value is used in the ZK client that HBase uses to connect to the ensemble.
+ It is also used by HBase when it starts a ZK server and it is passed as the 'maxSessionTimeout'.
+ See https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#ch_zkSessions.
+ For example, if an HBase region server connects to a ZK ensemble that's also managed
+ by HBase, then the session timeout will be the one specified by this configuration.
+ But, a region server that connects to an ensemble managed with a different configuration
+ will be subjected that ensemble's maxSessionTimeout. So, even though HBase might propose
+ using 90 seconds, the ensemble can have a max timeout lower than this and it will take
+ precedence. The current default maxSessionTimeout that ZK ships with is 40 seconds, which is lower than
+ HBase's.
+ </description>
+ </property>
+ <property>
+ <name>zookeeper.znode.parent</name>
+ <value>/hbase</value>
+ <description>Root ZNode for HBase in ZooKeeper. All of HBase's ZooKeeper
+ files that are configured with a relative path will go under this node.
+ By default, all of HBase's ZooKeeper file paths are configured with a
+ relative path, so they will all go under this directory unless changed.
+ </description>
+ </property>
+ <property>
+ <name>zookeeper.znode.acl.parent</name>
+ <value>acl</value>
+ <description>Root ZNode for access control lists.</description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.dns.interface</name>
+ <value>default</value>
+ <description>The name of the Network Interface from which a ZooKeeper server
+ should report its IP address.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.dns.nameserver</name>
+ <value>default</value>
+ <description>The host name or IP address of the name server (DNS)
+ which a ZooKeeper server should use to determine the host name used by the
+ master for communication and display purposes.
+ </description>
+ </property>
+ <!--
+ The following three properties are used together to create the list of
+ host:peer_port:leader_port quorum servers for ZooKeeper.
+ -->
+ <property>
+ <name>hbase.zookeeper.peerport</name>
+ <value>2888</value>
+ <description>Port used by ZooKeeper peers to talk to each other.
+ See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper
+ for more information.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.leaderport</name>
+ <value>3888</value>
+ <description>Port used by ZooKeeper for leader election.
+ See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper
+ for more information.
+ </description>
+ </property>
+ <!-- End of properties used to generate ZooKeeper host:port quorum list. -->
+
+ <!--
+ Beginning of properties that are directly mapped from ZooKeeper's zoo.cfg.
+ All properties with an "hbase.zookeeper.property." prefix are converted for
+ ZooKeeper's configuration. Hence, if you want to add an option from zoo.cfg,
+ e.g. "initLimit=10" you would append the following to your configuration:
+ <property>
+ <name>hbase.zookeeper.property.initLimit</name>
+ <value>10</value>
+ </property>
+ -->
+ <property>
+ <name>hbase.zookeeper.property.initLimit</name>
+ <value>10</value>
+ <description>Property from ZooKeeper's config zoo.cfg.
+ The number of ticks that the initial synchronization phase can take.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.property.syncLimit</name>
+ <value>5</value>
+ <description>Property from ZooKeeper's config zoo.cfg.
+ The number of ticks that can pass between sending a request and getting an
+ acknowledgment.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.property.dataDir</name>
+ <value>${hbase.tmp.dir}/zookeeper</value>
+ <description>Property from ZooKeeper's config zoo.cfg.
+ The directory where the snapshot is stored.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.property.clientPort</name>
+ <value>2181</value>
+ <description>Property from ZooKeeper's config zoo.cfg.
+ The port at which the clients will connect.
+ </description>
+ </property>
+ <property>
+ <name>hbase.zookeeper.property.maxClientCnxns</name>
+ <value>300</value>
+ <description>Property from ZooKeeper's config zoo.cfg.
+ Limit on number of concurrent connections (at the socket level) that a
+ single client, identified by IP address, may make to a single member of
+ the ZooKeeper ensemble. Set high to avoid zk connection issues running
+ standalone and pseudo-distributed.
+ </description>
+ </property>
+ <!-- End of properties that are directly mapped from ZooKeeper's zoo.cfg -->
+
+ <!--Client configurations-->
+ <property>
+ <name>hbase.client.write.buffer</name>
+ <value>2097152</value>
+ <description>Default size of the BufferedMutator write buffer in bytes.
+ A bigger buffer takes more memory -- on both the client and server
+ side since server instantiates the passed write buffer to process
+ it -- but a larger buffer size reduces the number of RPCs made.
+ For an estimate of server-side memory-used, evaluate
+ hbase.client.write.buffer * hbase.regionserver.handler.count
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.pause</name>
+ <value>100</value>
+ <description>General client pause value. Used mostly as value to wait
+ before running a retry of a failed get, region lookup, etc.
+ See hbase.client.retries.number for description of how we backoff from
+ this initial pause amount and how this pause works w/ retries.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.pause.cqtbe</name>
+ <value></value>
+ <description>Whether or not to use a special client pause for
+ CallQueueTooBigException (cqtbe). Set this property to a higher value
+ than hbase.client.pause if you observe frequent CQTBE from the same
+ RegionServer and the call queue there keeps full
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.retries.number</name>
+ <value>15</value>
+ <description>Maximum retries. Used as maximum for all retryable
+ operations such as the getting of a cell's value, starting a row update,
+ etc. Retry interval is a rough function based on hbase.client.pause. At
+ first we retry at this interval but then with backoff, we pretty quickly reach
+ retrying every ten seconds. See HConstants#RETRY_BACKOFF for how the backup
+ ramps up. Change this setting and hbase.client.pause to suit your workload.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.max.total.tasks</name>
+ <value>100</value>
+ <description>The maximum number of concurrent mutation tasks a single HTable instance will
+ send to the cluster.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.max.perserver.tasks</name>
+ <value>2</value>
+ <description>The maximum number of concurrent mutation tasks a single HTable instance will
+ send to a single region server.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.max.perregion.tasks</name>
+ <value>1</value>
+ <description>The maximum number of concurrent mutation tasks the client will
+ maintain to a single Region. That is, if there is already
+ hbase.client.max.perregion.tasks writes in progress for this region, new puts
+ won't be sent to this region until some writes finishes.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.perserver.requests.threshold</name>
+ <value>2147483647</value>
+ <description>The max number of concurrent pending requests for one server in all client threads
+ (process level). Requests exceeding this limit are rejected with ServerTooBusyException immediately,
+ to prevent the user's threads from being occupied and blocked by a single slow region server. If you
+ use a fixed number of threads to access HBase in a synchronous way, setting this to a suitable value
+ related to that number of threads will help you. See
+ https://issues.apache.org/jira/browse/HBASE-16388 for details.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.scanner.caching</name>
+ <value>2147483647</value>
+ <description>Number of rows that we try to fetch when calling next
+ on a scanner if it is not served from (local, client) memory. This configuration
+ works together with hbase.client.scanner.max.result.size to try and use the
+ network efficiently. The default value is Integer.MAX_VALUE by default so that
+ the network will fill the chunk size defined by hbase.client.scanner.max.result.size
+ rather than be limited by a particular number of rows since the size of rows varies
+ table to table. If you know ahead of time that you will not require more than a certain
+ number of rows from a scan, this configuration should be set to that row limit via
+ Scan#setCaching. Higher caching values will enable faster scanners but will eat up more
+ memory and some calls of next may take longer and longer times when the cache is empty.
+ Do not set this value such that the time between invocations is greater than the scanner
+ timeout; i.e. hbase.client.scanner.timeout.period
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.keyvalue.maxsize</name>
+ <value>10485760</value>
+ <description>Specifies the combined maximum allowed size of a KeyValue
+ instance. This is to set an upper boundary for a single entry saved in a
+ storage file. Since they cannot be split it helps avoiding that a region
+ cannot be split any further because the data is too large. It seems wise
+ to set this to a fraction of the maximum region size. Setting it to zero
+ or less disables the check.
+ </description>
+ </property>
+ <property>
+ <name>hbase.server.keyvalue.maxsize</name>
+ <value>10485760</value>
+ <description>Maximum allowed size of an individual cell, inclusive of value and all key
+ components. A value of 0 or less disables the check.
+ The default value is 10MB.
+ This is a safety setting to protect the server from OOM situations.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.scanner.timeout.period</name>
+ <value>60000</value>
+ <description>Client scanner lease period in milliseconds.</description>
+ </property>
+ <property>
+ <name>hbase.client.localityCheck.threadPoolSize</name>
+ <value>2</value>
+ </property>
+
+ <!--Miscellaneous configuration-->
+ <property>
+ <name>hbase.bulkload.retries.number</name>
+ <value>10</value>
+ <description>Maximum retries. This is the maximum number of iterations
+ that atomic bulk loads are attempted in the face of splitting operations.
+ 0 means never give up.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.balancer.maxRitPercent</name>
+ <value>1.0</value>
+ <description>The max percent of regions in transition when balancing.
+ The default value is 1.0, so there is no balancer throttling. If this config is set to 0.01,
+ it means that at most 1% of regions are in transition when balancing,
+ so the cluster's availability is at least 99% when balancing.
+ </description>
+ </property>
+ <property>
+ <name>hbase.balancer.period</name>
+ <value>300000</value>
+ <description>Period at which the region balancer runs in the Master, in
+ milliseconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regions.slop</name>
+ <value>0.001</value>
+ <description>Rebalance if any regionserver has average + (average * slop) regions.
+ The default value of this parameter is 0.001 in StochasticLoadBalancer (the default load
+ balancer), while the default is 0.2 in other load balancers (i.e.,
+ SimpleLoadBalancer).
+ </description>
+ </property>
+ <property>
+ <name>hbase.normalizer.period</name>
+ <value>300000</value>
+ <description>Period at which the region normalizer runs in the Master, in
+ milliseconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.normalizer.split.enabled</name>
+ <value>true</value>
+ <description>Whether to split a region as part of normalization.</description>
+ </property>
+ <property>
+ <name>hbase.normalizer.merge.enabled</name>
+ <value>true</value>
+ <description>Whether to merge a region as part of normalization.</description>
+ </property>
+ <property>
+ <name>hbase.normalizer.min.region.count</name>
+ <value>3</value>
+ <description>The minimum number of regions in a table to consider it for merge
+ normalization.
+ </description>
+ </property>
+ <property>
+ <name>hbase.normalizer.merge.min_region_age.days</name>
+ <value>3</value>
+ <description>The minimum age for a region to be considered for a merge, in days.</description>
+ </property>
+ <property>
+ <name>hbase.normalizer.merge.min_region_size.mb</name>
+ <value>1</value>
+ <description>The minimum size for a region to be considered for a merge, in whole
+ MBs.
+ </description>
+ </property>
+ <property>
+ <name>hbase.table.normalization.enabled</name>
+ <value>false</value>
+ <description>This config is used to set the default behaviour of the normalizer at table level.
+ To override this at table level, one can set NORMALIZATION_ENABLED at table descriptor level
+ and that property will be honored.
+ </description>
+ </property>
+ <property>
+ <name>hbase.server.thread.wakefrequency</name>
+ <value>10000</value>
+ <description>Time to sleep in between searches for work (in milliseconds).
+ Used as sleep interval by service threads such as log roller.
+ </description>
+ </property>
+ <property>
+ <name>hbase.server.versionfile.writeattempts</name>
+ <value>3</value>
+ <description>
+ How many times to retry attempting to write a version file
+ before just aborting. Each attempt is separated by the
+ hbase.server.thread.wakefrequency milliseconds.
+ </description>
+ </property>
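+ <!-- For illustration: with the defaults above, a failing version-file write is attempted 3 times,
+ each attempt separated by hbase.server.thread.wakefrequency (10000 ms), so the server gives up
+ roughly 20 seconds after the first attempt. -->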
+ <property>
+ <name>hbase.hregion.memstore.flush.size</name>
+ <value>134217728</value>
+ <description>
+ Memstore will be flushed to disk if the size of the memstore
+ exceeds this number of bytes. Value is checked by a thread that runs
+ every hbase.server.thread.wakefrequency.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.percolumnfamilyflush.size.lower.bound.min</name>
+ <value>16777216</value>
+ <description>
+ If FlushLargeStoresPolicy is used and there are multiple column families,
+ then every time that we hit the total memstore limit, we find out all the
+ column families whose memstores exceed a "lower bound" and only flush them
+ while retaining the others in memory. The "lower bound" will be
+ "hbase.hregion.memstore.flush.size / column_family_number" by default
+ unless value of this property is larger than that. If none of the families
+ have their memstore size more than lower bound, all the memstores will be
+ flushed (just as usual).
+ </description>
+ </property>
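+ <!-- For illustration (assuming the default flush size of 134217728 bytes and a hypothetical table
+ with 4 column families): the per-family "lower bound" would be 134217728 / 4 = 33554432 bytes (32 MB),
+ which is above this 16 MB floor, so 32 MB is used; with 16 column families the computed 8 MB bound
+ would be raised to this 16 MB minimum. -->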
+ <property>
+ <name>hbase.hregion.preclose.flush.size</name>
+ <value>5242880</value>
+ <description>
+ If the memstores in a region are this size or larger when we go
+ to close, run a "pre-flush" to clear out memstores before we put up
+ the region closed flag and take the region offline. On close,
+ a flush is run under the close flag to empty memory. During
+ this time the region is offline and we are not taking on any writes.
+ If the memstore content is large, this flush could take a long time to
+ complete. The preflush is meant to clean out the bulk of the memstore
+ before putting up the close flag and taking the region offline so the
+ flush that runs under the close flag has little to do.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.memstore.block.multiplier</name>
+ <value>4</value>
+ <description>
+ Block updates if memstore has hbase.hregion.memstore.block.multiplier
+ times hbase.hregion.memstore.flush.size bytes. Useful for preventing
+ runaway memstore during spikes in update traffic. Without an
+ upper-bound, memstore fills such that when it flushes the
+ resultant flush files take a long time to compact or split, or
+ worse, we OOME.
+ </description>
+ </property>
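+ <!-- For illustration: with the default hbase.hregion.memstore.flush.size of 134217728 bytes,
+ updates to a region are blocked once its memstore reaches
+ 4 * 134217728 = 536870912 bytes (512 MB). -->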
+ <property>
+ <name>hbase.hregion.memstore.mslab.enabled</name>
+ <value>true</value>
+ <description>
+ Enables the MemStore-Local Allocation Buffer,
+ a feature which works to prevent heap fragmentation under
+ heavy write loads. This can reduce the frequency of stop-the-world
+ GC pauses on large heaps.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.memstore.mslab.chunksize</name>
+ <value>2097152</value>
+ <description>The maximum byte size of a chunk in the MemStoreLAB. Unit: bytes</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.offheap.global.memstore.size</name>
+ <value>0</value>
+ <description>The amount of off-heap memory all MemStores in a RegionServer may use.
+ A value of 0 means that no off-heap memory will be used and all chunks in MSLAB
+ will be HeapByteBuffer; otherwise, a non-zero value specifies how many megabytes of
+ off-heap memory will be used for chunks in MSLAB, and all chunks in MSLAB will be
+ DirectByteBuffer. Unit: megabytes.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.memstore.mslab.max.allocation</name>
+ <value>262144</value>
+ <description>The maximal size of one allocation in the MemStoreLAB. If the desired byte
+ size exceeds this threshold, it will be allocated from the JVM heap rather than the MemStoreLAB.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.max.filesize</name>
+ <value>10737418240</value>
+ <description>
+ Maximum HFile size. If the sum of the sizes of a region's HFiles has grown to exceed this
+ value, the region is split in two.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.split.overallfiles</name>
+ <value>false</value>
+ <description>Whether to sum the overall size of a region's files when checking whether to split.</description>
+ </property>
+ <property>
+ <name>hbase.hregion.majorcompaction</name>
+ <value>604800000</value>
+ <description>Time between major compactions, expressed in milliseconds. Set to 0 to disable
+ time-based automatic major compactions. User-requested and size-based major compactions will
+ still run. This value is multiplied by hbase.hregion.majorcompaction.jitter to cause
+ compaction to start at a somewhat-random time during a given window of time. The default value
+ is 7 days, expressed in milliseconds. If major compactions are causing disruption in your
+ environment, you can configure them to run at off-peak times for your deployment, or disable
+ time-based major compactions by setting this parameter to 0, and run major compactions in a
+ cron job or by another external mechanism.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hregion.majorcompaction.jitter</name>
+ <value>0.50</value>
+ <description>A multiplier applied to hbase.hregion.majorcompaction to cause compaction to occur
+ a given amount of time either side of hbase.hregion.majorcompaction. The smaller the number,
+ the closer the compactions will happen to the hbase.hregion.majorcompaction
+ interval.
+ </description>
+ </property>
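+ <!-- For illustration: with hbase.hregion.majorcompaction of 604800000 ms (7 days) and a jitter
+ of 0.50, each time-based major compaction starts at a random point within 7 days +/- 3.5 days. -->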
+ <property>
+ <name>hbase.hstore.compactionThreshold</name>
+ <value>3</value>
+ <description>If more than this number of StoreFiles exist in any one Store
+ (one StoreFile is written per flush of MemStore), a compaction is run to rewrite all
+ StoreFiles into a single StoreFile. Larger values delay compaction, but when compaction does
+ occur, it takes longer to complete.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.compaction.enabled</name>
+ <value>true</value>
+ <description>Enable/disable compactions by setting this to true/false.
+ We can further switch compactions dynamically with the
+ compaction_switch shell command.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.flusher.count</name>
+ <value>2</value>
+ <description>The number of flush threads. With fewer threads, the MemStore flushes will be
+ queued. With more threads, the flushes will be executed in parallel, increasing the load on
+ HDFS, and potentially causing more compactions.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.blockingStoreFiles</name>
+ <value>16</value>
+ <description>If more than this number of StoreFiles exist in any one Store (one StoreFile
+ is written per flush of MemStore), updates are blocked for this region until a compaction is
+ completed, or until hbase.hstore.blockingWaitTime has been exceeded.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.blockingWaitTime</name>
+ <value>90000</value>
+ <description>The time for which a region will block updates after reaching the StoreFile limit
+ defined by hbase.hstore.blockingStoreFiles. After this time has elapsed, the region will stop
+ blocking updates even if a compaction has not been completed.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.min</name>
+ <value></value>
+ <description>The minimum number of StoreFiles which must be eligible for compaction before
+ compaction can run. The goal of tuning hbase.hstore.compaction.min is to avoid ending up with
+ too many tiny StoreFiles to compact. Setting this value to 2 would cause a minor compaction
+ each time you have two StoreFiles in a Store, and this is probably not appropriate. If you
+ set this value too high, all the other values will need to be adjusted accordingly. For most
+ cases, the default value is appropriate (empty value here, results in 3 by code logic). In
+ previous versions of HBase, the parameter hbase.hstore.compaction.min was named
+ hbase.hstore.compactionThreshold.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.max</name>
+ <value>10</value>
+ <description>The maximum number of StoreFiles which will be selected for a single minor
+ compaction, regardless of the number of eligible StoreFiles. Effectively, the value of
+ hbase.hstore.compaction.max controls the length of time it takes a single compaction to
+ complete. Setting it larger means that more StoreFiles are included in a compaction. For most
+ cases, the default value is appropriate.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.min.size</name>
+ <value>134217728</value>
+ <description>A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy)
+ smaller than this size will always be eligible for minor compaction.
+ HFiles this size or larger are evaluated by hbase.hstore.compaction.ratio to determine if
+ they are eligible. Because this limit represents the "automatic include" limit for all
+ StoreFiles smaller than this value, this value may need to be reduced in write-heavy
+ environments where many StoreFiles in the 1-2 MB range are being flushed, because every
+ StoreFile will be targeted for compaction and the resulting StoreFiles may still be under the
+ minimum size and require further compaction. If this parameter is lowered, the ratio check is
+ triggered more quickly. This addressed some issues seen in earlier versions of HBase but
+ changing this parameter is no longer necessary in most situations. Default: 128 MB expressed
+ in bytes.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.max.size</name>
+ <value>9223372036854775807</value>
+ <description>A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy)
+ larger than this size will be excluded from compaction. The effect of
+ raising hbase.hstore.compaction.max.size is fewer, larger StoreFiles that do not get
+ compacted often. If you feel that compaction is happening too often without much benefit, you
+ can try raising this value. Default: the value of LONG.MAX_VALUE, expressed in bytes.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.ratio</name>
+ <value>1.2F</value>
+ <description>For minor compaction, this ratio is used to determine whether a given StoreFile
+ which is larger than hbase.hstore.compaction.min.size is eligible for compaction. Its
+ effect is to limit compaction of large StoreFiles. The value of hbase.hstore.compaction.ratio
+ is expressed as a floating-point decimal. A large ratio, such as 10, will produce a single
+ giant StoreFile. Conversely, a low value, such as .25, will produce behavior similar to the
+ BigTable compaction algorithm, producing four StoreFiles. A moderate value of between 1.0 and
+ 1.4 is recommended. When tuning this value, you are balancing write costs with read costs.
+ Raising the value (to something like 1.4) will have more write costs, because you will
+ compact larger StoreFiles. However, during reads, HBase will need to seek through fewer
+ StoreFiles to accomplish the read. Consider this approach if you cannot take advantage of
+ Bloom filters. Otherwise, you can lower this value to something like 1.0 to reduce the
+ background cost of writes, and use Bloom filters to control the number of StoreFiles touched
+ during reads. For most cases, the default value is appropriate.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.ratio.offpeak</name>
+ <value>5.0F</value>
+ <description>Allows you to set a different (by default, more aggressive) ratio for determining
+ whether larger StoreFiles are included in compactions during off-peak hours. Works in the
+ same way as hbase.hstore.compaction.ratio. Only applies if hbase.offpeak.start.hour and
+ hbase.offpeak.end.hour are also enabled.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.time.to.purge.deletes</name>
+ <value>0</value>
+ <description>The amount of time to delay purging of delete markers with future timestamps. If
+ unset, or set to 0, all delete markers, including those with future timestamps, are purged
+ during the next major compaction. Otherwise, a delete marker is kept until the major compaction
+ which occurs after the marker's timestamp plus the value of this setting, in milliseconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.offpeak.start.hour</name>
+ <value>-1</value>
+ <description>The start of off-peak hours, expressed as an integer between 0 and 23, inclusive.
+ Set to -1 to disable off-peak.
+ </description>
+ </property>
+ <property>
+ <name>hbase.offpeak.end.hour</name>
+ <value>-1</value>
+ <description>The end of off-peak hours, expressed as an integer between 0 and 23, inclusive. Set
+ to -1 to disable off-peak.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.thread.compaction.throttle</name>
+ <value>2684354560</value>
+ <description>There are two different thread pools for compactions, one for large compactions and
+ the other for small compactions. This helps to keep compaction of lean tables (such as
+ hbase:meta) fast. If a compaction is larger than this threshold, it
+ goes into the large compaction pool. In most cases, the default value is appropriate. Default:
+ 2 x hbase.hstore.compaction.max x hbase.hregion.memstore.flush.size (which defaults to 128MB).
+ The value field assumes that the value of hbase.hregion.memstore.flush.size is unchanged from
+ the default.
+ </description>
+ </property>
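+ <!-- Arithmetic check of the default above: 2 x hbase.hstore.compaction.max (10) x
+ hbase.hregion.memstore.flush.size (134217728 bytes) = 2684354560 bytes (2.5 GB). -->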
+ <property>
+ <name>hbase.regionserver.majorcompaction.pagecache.drop</name>
+ <value>true</value>
+ <description>Specifies whether to drop pages read/written into the system page cache by
+ major compactions. Setting it to true helps prevent major compactions from
+ polluting the page cache, which is almost always required, especially for clusters
+ with low/moderate memory to storage ratio.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.minorcompaction.pagecache.drop</name>
+ <value>true</value>
+ <description>Specifies whether to drop pages read/written into the system page cache by
+ minor compactions. Setting it to true helps prevent minor compactions from
+ polluting the page cache, which is most beneficial on clusters with low
+ memory to storage ratio or very write heavy clusters. You may want to set it to
+ false under moderate to low write workload when bulk of the reads are
+ on the most recently written data.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.kv.max</name>
+ <value>10</value>
+ <description>The maximum number of KeyValues to read and then write in a batch when flushing or
+ compacting. Set this lower if you have big KeyValues and problems with Out Of Memory
+ Exceptions. Set this higher if you have wide, small rows.
+ </description>
+ </property>
+ <property>
+ <name>hbase.storescanner.parallel.seek.enable</name>
+ <value>false</value>
+ <description>
+ Enables StoreFileScanner parallel-seeking in StoreScanner,
+ a feature which can reduce response latency under special conditions.
+ </description>
+ </property>
+ <property>
+ <name>hbase.storescanner.parallel.seek.threads</name>
+ <value>10</value>
+ <description>
+ The default thread pool size if parallel-seeking feature enabled.
+ </description>
+ </property>
+ <property>
+ <name>hfile.block.cache.policy</name>
+ <value>LRU</value>
+ <description>The eviction policy for the L1 block cache (LRU or TinyLFU).</description>
+ </property>
+ <property>
+ <name>hfile.block.cache.size</name>
+ <value>0.4</value>
+ <description>Percentage of maximum heap (-Xmx setting) to allocate to block cache
+ used by a StoreFile. Default of 0.4 means allocate 40%.
+ Set to 0 to disable but it's not recommended; you need at least
+ enough cache to hold the storefile indices.
+ </description>
+ </property>
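+ <!-- For illustration (assuming a hypothetical 4 GB heap, i.e. -Xmx4g): the default of 0.4
+ reserves roughly 0.4 * 4 GB = 1.6 GB of heap for the block cache. -->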
+ <property>
+ <name>hfile.block.index.cacheonwrite</name>
+ <value>false</value>
+ <description>This allows non-root multi-level index blocks to be put into the block
+ cache at the time the index is being written.
+ </description>
+ </property>
+ <property>
+ <name>hfile.index.block.max.size</name>
+ <value>131072</value>
+ <description>When the size of a leaf-level, intermediate-level, or root-level
+ index block in a multi-level block index grows to this size, the
+ block is written out and a new block is started.
+ </description>
+ </property>
+ <property>
+ <name>hbase.bucketcache.ioengine</name>
+ <value></value>
+ <description>Where to store the contents of the bucketcache. One of: offheap,
+ file, files, mmap or pmem. If a file or files, set it to file(s):PATH_TO_FILE.
+ mmap means the content will be in an mmaped file. Use mmap:PATH_TO_FILE. 'pmem'
+ is bucket cache over a file on the persistent memory device.
+ Use pmem:PATH_TO_FILE.
+ See http://hbase.apache.org/book.html#offheap.blockcache for more information.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.throughput.lower.bound</name>
+ <value>52428800</value>
+ <description>The target lower bound on aggregate compaction throughput, in bytes/sec. Allows
+ you to tune the minimum available compaction throughput when the
+ PressureAwareCompactionThroughputController throughput controller is active. (It is active by
+ default.)
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.compaction.throughput.higher.bound</name>
+ <value>104857600</value>
+ <description>The target upper bound on aggregate compaction throughput, in bytes/sec. Allows
+ you to control aggregate compaction throughput demand when the
+ PressureAwareCompactionThroughputController throughput controller is active. (It is active by
+ default.) The maximum throughput will be tuned between the lower and upper bounds when
+ compaction pressure is within the range [0.0, 1.0]. If compaction pressure is 1.0 or greater
+ the higher bound will be ignored until pressure returns to the normal range.
+ </description>
+ </property>
+ <property>
+ <name>hbase.bucketcache.size</name>
+ <value></value>
+ <description>A float that EITHER represents a percentage of total heap memory
+ size to give to the cache (if < 1.0) OR, it is the total capacity in
+ megabytes of BucketCache. Default: 0.0
+ </description>
+ </property>
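+ <!-- For illustration (hypothetical example values): a value of 0.2 would give 20% of total heap
+ to the BucketCache, while a value of 8192 would allocate an 8192 MB (8 GB) BucketCache;
+ the default stays 0.0. -->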
+ <property>
+ <name>hbase.bucketcache.bucket.sizes</name>
+ <value></value>
+ <description>A comma-separated list of sizes for buckets for the bucketcache.
+ Can be multiple sizes. List block sizes in order from smallest to largest.
+ The sizes you use will depend on your data access patterns.
+ Must be a multiple of 256 else you will run into
+ 'java.io.IOException: Invalid HFile block magic' when you go to read from cache.
+ If you specify no values here, then you pick up the default bucketsizes set
+ in code (See BucketAllocator#DEFAULT_BUCKET_SIZES).
+ </description>
+ </property>
+ <property>
+ <name>hfile.format.version</name>
+ <value>3</value>
+ <description>The HFile format version to use for new files.
+ Version 3 adds support for tags in hfiles (See http://hbase.apache.org/book.html#hbase.tags).
+ Also see the configuration 'hbase.replication.rpc.codec'.
+ </description>
+ </property>
+ <property>
+ <name>hfile.block.bloom.cacheonwrite</name>
+ <value>false</value>
+ <description>Enables cache-on-write for inline blocks of a compound Bloom filter.</description>
+ </property>
+ <property>
+ <name>io.storefile.bloom.block.size</name>
+ <value>131072</value>
+ <description>The size in bytes of a single block ("chunk") of a compound Bloom
+ filter. This size is approximate, because Bloom blocks can only be
+ inserted at data block boundaries, and the number of keys per data
+ block varies.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rs.cacheblocksonwrite</name>
+ <value>false</value>
+ <description>Whether an HFile block should be added to the block cache when the
+ block is finished.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rpc.timeout</name>
+ <value>60000</value>
+ <description>This is for the RPC layer to define how long (in milliseconds) HBase client applications
+ wait for a remote call to time out. It uses pings to check connections
+ but will eventually throw a TimeoutException.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.operation.timeout</name>
+ <value>1200000</value>
+ <description>Operation timeout is a top-level restriction (millisecond) that makes sure a
+ blocking operation in Table will not be blocked for more than this. In each operation, if an rpc
+ request fails because of a timeout or another reason, it will retry until it succeeds or throws a
+ RetriesExhaustedException. But if the total time spent blocking reaches the operation timeout
+ before the retries are exhausted, it will break out early and throw a SocketTimeoutException.
+ </description>
+ </property>
+ <property>
+ <name>hbase.cells.scanned.per.heartbeat.check</name>
+ <value>10000</value>
+ <description>The number of cells scanned in between heartbeat checks. Heartbeat
+ checks occur during the processing of scans to determine whether or not the
+ server should stop scanning in order to send back a heartbeat message to the
+ client. Heartbeat messages are used to keep the client-server connection alive
+ during long running scans. Small values mean that the heartbeat checks will
+ occur more often and thus will provide a tighter bound on the execution time of
+ the scan. Larger values mean that the heartbeat checks occur less frequently.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rpc.shortoperation.timeout</name>
+ <value>10000</value>
+ <description>This is another version of "hbase.rpc.timeout". For RPC operations
+ within the cluster, we rely on this configuration to set a short timeout limit
+ for short operations. For example, a short rpc timeout for a region server trying
+ to report to the active master can lead to a quicker master failover process.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.client.tcpnodelay</name>
+ <value>true</value>
+ <description>Set no delay on rpc socket connections. See
+ http://docs.oracle.com/javase/1.5.0/docs/api/java/net/Socket.html#getTcpNoDelay()
+ </description>
+ </property>
+ <property>
+ <name>hbase.unsafe.regionserver.hostname</name>
+ <value></value>
+ <description>This config is for experts: don't set its value unless you really know what you are doing.
+ When set to a non-empty value, this represents the (external facing) hostname for the underlying server.
+ See https://issues.apache.org/jira/browse/HBASE-12954 for details.
+ </description>
+ </property>
+ <property>
+ <name>hbase.unsafe.regionserver.hostname.disable.master.reversedns</name>
+ <value>false</value>
+ <description>This config is for experts: don't set its value unless you really know what you are doing.
+ When set to true, regionserver will use the current node hostname for the servername and HMaster will
+ skip reverse DNS lookup and use the hostname sent by regionserver instead. Note that this config and
+ hbase.unsafe.regionserver.hostname are mutually exclusive. See https://issues.apache.org/jira/browse/HBASE-18226
+ for more details.
+ </description>
+ </property>
+ <!-- The following properties configure authentication information for
+ HBase processes when using Kerberos security. There are no default
+ values, included here for documentation purposes -->
+ <property>
+ <name>hbase.master.keytab.file</name>
+ <value></value>
+ <description>Full path to the kerberos keytab file to use for logging in
+ the configured HMaster server principal.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.kerberos.principal</name>
+ <value></value>
+ <description>Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name
+ that should be used to run the HMaster process. The principal name should
+ be in the form: user/hostname@DOMAIN. If "_HOST" is used as the hostname
+ portion, it will be replaced with the actual hostname of the running
+ instance.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.keytab.file</name>
+ <value></value>
+ <description>Full path to the kerberos keytab file to use for logging in
+ the configured HRegionServer server principal.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.kerberos.principal</name>
+ <value></value>
+ <description>Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name
+ that should be used to run the HRegionServer process. The principal name
+ should be in the form: user/hostname@DOMAIN. If "_HOST" is used as the
+ hostname portion, it will be replaced with the actual hostname of the
+ running instance. An entry for this principal must exist in the file
+ specified in hbase.regionserver.keytab.file
+ </description>
+ </property>
+ <!-- Additional configuration specific to HBase security -->
+ <property>
+ <name>hadoop.policy.file</name>
+ <value>hbase-policy.xml</value>
+ <description>The policy configuration file used by RPC servers to make
+ authorization decisions on client requests. Only used when HBase
+ security is enabled.
+ </description>
+ </property>
+ <property>
+ <name>hbase.superuser</name>
+ <value></value>
+ <description>List of users or groups (comma-separated), who are allowed
+ full privileges, regardless of stored ACLs, across the cluster.
+ Only used when HBase security is enabled.
+ </description>
+ </property>
+ <property>
+ <name>hbase.auth.key.update.interval</name>
+ <value>86400000</value>
+ <description>The update interval for master key for authentication tokens
+ in servers in milliseconds. Only used when HBase security is enabled.
+ </description>
+ </property>
+ <property>
+ <name>hbase.auth.token.max.lifetime</name>
+ <value>604800000</value>
+ <description>The maximum lifetime in milliseconds after which an
+ authentication token expires. Only used when HBase security is enabled.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.client.fallback-to-simple-auth-allowed</name>
+ <value>false</value>
+ <description>When a client is configured to attempt a secure connection, but attempts to
+ connect to an insecure server, that server may instruct the client to
+ switch to SASL SIMPLE (unsecure) authentication. This setting controls
+ whether or not the client will accept this instruction from the server.
+ When false (the default), the client will not allow the fallback to SIMPLE
+ authentication, and will abort the connection.
+ </description>
+ </property>
+ <property>
+ <name>hbase.ipc.server.fallback-to-simple-auth-allowed</name>
+ <value>false</value>
+ <description>When a server is configured to require secure connections, it will
+ reject connection attempts from clients using SASL SIMPLE (unsecure) authentication.
+ This setting allows secure servers to accept SASL SIMPLE connections from clients
+ when the client requests. When false (the default), the server will not allow the fallback
+ to SIMPLE authentication, and will reject the connection. WARNING: This setting should ONLY
+ be used as a temporary measure while converting clients over to secure authentication. It
+ MUST BE DISABLED for secure operation.
+ </description>
+ </property>
+ <property>
+ <name>hbase.display.keys</name>
+ <value>true</value>
+ <description>When this is set to true the webUI and such will display all start/end keys
+ as part of the table details, region names, etc. When this is set to false,
+ the keys are hidden.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coprocessor.enabled</name>
+ <value>true</value>
+ <description>Enables or disables coprocessor loading. If 'false'
+ (disabled), any other coprocessor related configuration will be ignored.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coprocessor.user.enabled</name>
+ <value>true</value>
+ <description>Enables or disables user (aka. table) coprocessor loading.
+ If 'false' (disabled), any table coprocessor attributes in table
+ descriptors will be ignored. If "hbase.coprocessor.enabled" is 'false'
+ this setting has no effect.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coprocessor.region.classes</name>
+ <value></value>
+ <description>A comma-separated list of Coprocessors that are loaded by
+ default on all tables. For any override coprocessor method, these classes
+ will be called in order. After implementing your own Coprocessor, just put
+ it in HBase's classpath and add the fully qualified class name here.
+ A coprocessor can also be loaded on demand by setting HTableDescriptor.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coprocessor.master.classes</name>
+ <value></value>
+ <description>A comma-separated list of
+ org.apache.hadoop.hbase.coprocessor.MasterObserver coprocessors that are
+ loaded by default on the active HMaster process. For any implemented
+ coprocessor methods, the listed classes will be called in order. After
+ implementing your own MasterObserver, just put it in HBase's classpath
+ and add the fully qualified class name here.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coprocessor.abortonerror</name>
+ <value>true</value>
+ <description>Set to true to cause the hosting server (master or regionserver)
+ to abort if a coprocessor fails to load, fails to initialize, or throws an
+ unexpected Throwable object. Setting this to false will allow the server to
+ continue execution but the system wide state of the coprocessor in question
+ will become inconsistent as it will be properly executing in only a subset
+ of servers, so this is most useful for debugging only.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.port</name>
+ <value>8080</value>
+ <description>The port for the HBase REST server.</description>
+ </property>
+ <property>
+ <name>hbase.rest.readonly</name>
+ <value>false</value>
+ <description>Defines the mode the REST server will be started in. Possible values are:
+ false: All HTTP methods are permitted - GET/PUT/POST/DELETE.
+ true: Only the GET method is permitted.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.threads.max</name>
+ <value>100</value>
+ <description>The maximum number of threads of the REST server thread pool.
+ Threads in the pool are reused to process REST requests. This
+ controls the maximum number of requests processed concurrently.
+ It may help to control the memory used by the REST server to
+ avoid OOM issues. If the thread pool is full, incoming requests
+ will be queued up and wait for some free threads.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.threads.min</name>
+ <value>2</value>
+ <description>The minimum number of threads of the REST server thread pool.
+ The thread pool always has at least this number of threads so
+ the REST server is ready to serve incoming requests.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.support.proxyuser</name>
+ <value>false</value>
+ <description>Enables running the REST server to support proxy-user mode.</description>
+ </property>
+ <property skipInDoc="true">
+ <name>hbase.defaults.for.version</name>
+ <value>2.4.9</value>
+ <description>This defaults file was compiled for version ${project.version}. This variable is used
+ to make sure that a user doesn't have an old version of hbase-default.xml on the
+ classpath.
+ </description>
+ </property>
+ <property>
+ <name>hbase.defaults.for.version.skip</name>
+ <value>false</value>
+ <description>Set to true to skip the 'hbase.defaults.for.version' check.
+ Setting this to true can be useful in contexts other than
+ the other side of a maven generation; i.e. running in an
+ IDE. You'll want to set this boolean to true to avoid
+ seeing the RuntimeException complaint: "hbase-default.xml file
+ seems to be for an old version of HBase (\${hbase.version}), this
+ version is X.X.X-SNAPSHOT"
+ </description>
+ </property>
+ <property>
+ <name>hbase.table.lock.enable</name>
+ <value>true</value>
+ <description>Set to true to enable locking the table in zookeeper for schema change operations.
+ Table locking from the master prevents concurrent schema modifications from corrupting the
+ table state.
+ </description>
+ </property>
+ <property>
+ <name>hbase.table.max.rowsize</name>
+ <value>1073741824</value>
+ <description>
+ Maximum size of a single row in bytes (default is 1 GB) for Get'ting
+ or Scan'ning without the in-row scan flag set. If the row size exceeds this limit,
+ a RowTooBigException is thrown to the client.
+ </description>
+ </property>
+ <property>
+ <name>hbase.thrift.minWorkerThreads</name>
+ <value>16</value>
+ <description>The "core size" of the thread pool. New threads are created on every
+ connection until this many threads are created.
+ </description>
+ </property>
+ <property>
+ <name>hbase.thrift.maxWorkerThreads</name>
+ <value>1000</value>
+ <description>The maximum size of the thread pool. When the pending request queue
+ overflows, new threads are created until their number reaches this number.
+ After that, the server starts dropping connections.
+ </description>
+ </property>
+ <property>
+ <name>hbase.thrift.maxQueuedRequests</name>
+ <value>1000</value>
+ <description>The maximum number of pending Thrift connections waiting in the queue. If
+ there are no idle threads in the pool, the server queues requests. Only
+ when the queue overflows, new threads are added, up to
+ hbase.thrift.maxQueuedRequests threads.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.thrift.framed</name>
+ <value>false</value>
+ <description>Use Thrift TFramedTransport on the server side.
+ This is the recommended transport for thrift servers and requires a similar setting
+ on the client side. Changing this to false will select the default transport,
+ vulnerable to DoS when malformed requests are issued due to THRIFT-601.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.thrift.framed.max_frame_size_in_mb</name>
+ <value>2</value>
+ <description>Default frame size when using framed transport, in MB</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.thrift.compact</name>
+ <value>false</value>
+ <description>Use Thrift TCompactProtocol binary serialization protocol.</description>
+ </property>
+ <property>
+ <name>hbase.rootdir.perms</name>
+ <value>700</value>
+ <description>FS Permissions for the root data subdirectory in a secure (kerberos) setup.
+ When the master starts, it creates the rootdir with these permissions or sets the permissions
+ if they do not match.
+ </description>
+ </property>
+ <property>
+ <name>hbase.wal.dir.perms</name>
+ <value>700</value>
+ <description>FS Permissions for the root WAL directory in a secure(kerberos) setup.
+ When the master starts, it creates the WAL dir with these permissions or sets the permissions
+ if they do not match.
+ </description>
+ </property>
+ <property>
+ <name>hbase.data.umask.enable</name>
+ <value>false</value>
+ <description>If true, file permissions should be assigned
+ to the files written by the regionserver.
+ </description>
+ </property>
+ <property>
+ <name>hbase.data.umask</name>
+ <value>000</value>
+ <description>File permissions that should be used to write data
+ files when hbase.data.umask.enable is true
+ </description>
+ </property>
+ <property>
+ <name>hbase.snapshot.enabled</name>
+ <value>true</value>
+ <description>Set to true to allow snapshots to be taken / restored / cloned.</description>
+ </property>
+ <property>
+ <name>hbase.snapshot.restore.take.failsafe.snapshot</name>
+ <value>true</value>
+ <description>Set to true to take a snapshot before the restore operation.
+ The snapshot taken will be used in case of failure, to restore the previous state.
+ At the end of the restore operation this snapshot will be deleted
+ </description>
+ </property>
+ <property>
+ <name>hbase.snapshot.restore.failsafe.name</name>
+ <value>hbase-failsafe-{snapshot.name}-{restore.timestamp}</value>
+ <description>Name of the failsafe snapshot taken by the restore operation.
+ You can use the {snapshot.name}, {table.name} and {restore.timestamp} variables
+ to create a name based on what you are restoring.
+ </description>
+ </property>
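+ <!-- For illustration (hypothetical snapshot name and timestamp): restoring a snapshot named
+ "my_snapshot" at restore timestamp 1648166400000 yields a failsafe snapshot named
+ hbase-failsafe-my_snapshot-1648166400000 with the default pattern above. -->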
+ <property>
+ <name>hbase.snapshot.working.dir</name>
+ <value></value>
+ <description>Location where the snapshotting process will occur. The location of the
+ completed snapshots will not change, but the temporary directory where the snapshot
+ process occurs will be set to this location. This can be a filesystem separate from
+ the root directory, for performance reasons. See HBASE-21098 for more
+ information.
+ </description>
+ </property>
+ <property>
+ <name>hbase.server.compactchecker.interval.multiplier</name>
+ <value>1000</value>
+ <description>The number that determines how often we scan to see if compaction is necessary.
+ Normally, compactions are done after some events (such as memstore flush), but if
+ a region did not receive a lot of writes for some time, or due to different compaction
+ policies, it may be necessary to check it periodically. The interval between checks is
+ hbase.server.compactchecker.interval.multiplier multiplied by
+ hbase.server.thread.wakefrequency.
+ </description>
+ </property>
+ <property>
+ <name>hbase.lease.recovery.timeout</name>
+ <value>900000</value>
+ <description>How long we wait on dfs lease recovery in total before giving up.</description>
+ </property>
+ <property>
+ <name>hbase.lease.recovery.dfs.timeout</name>
+ <value>64000</value>
+ <description>How long between dfs recover lease invocations. Should be larger than the sum of
+ the time it takes for the namenode to issue a block recovery command as part of
+ datanode; dfs.heartbeat.interval and the time it takes for the primary
+ datanode, performing block recovery to timeout on a dead datanode; usually
+ dfs.client.socket-timeout. See the end of HBASE-8389 for more.
+ </description>
+ </property>
+ <property>
+ <name>hbase.column.max.version</name>
+ <value>1</value>
+ <description>New column family descriptors will use this value as the default number of versions
+ to keep.
+ </description>
+ </property>
+ <property>
+ <name>dfs.client.read.shortcircuit</name>
+ <value></value>
+ <description>
+ If set to true, this configuration parameter enables short-circuit local
+ reads.
+ </description>
+ </property>
+ <property>
+ <name>dfs.domain.socket.path</name>
+ <value></value>
+ <description>
+ This is a path to a UNIX domain socket that will be used for
+ communication between the DataNode and local HDFS clients, if
+ dfs.client.read.shortcircuit is set to true. If the string "_PORT" is
+ present in this path, it will be replaced by the TCP port of the DataNode.
+ Be careful about permissions for the directory that hosts the shared
+ domain socket; dfsclient will complain if open to other users than the HBase user.
+ </description>
+ </property>
+ <property>
+ <name>hbase.dfs.client.read.shortcircuit.buffer.size</name>
+ <value>131072</value>
+ <description>If the DFSClient configuration
+ dfs.client.read.shortcircuit.buffer.size is unset, we will
+ use what is configured here as the short circuit read default
+ direct byte buffer size. DFSClient native default is 1MB; HBase
+ keeps its HDFS files open so number of file blocks * 1MB soon
+ starts to add up and threaten OOME because of a shortage of
+ direct memory. So, we set it down from the default. Make
+ it > the default hbase block size set in the HColumnDescriptor
+ which is usually 64k.
+ </description>
+ </property>
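+ <!-- Arithmetic check of the default above: 131072 bytes = 128 KB, i.e. twice the usual
+ 64 KB HBase block size, satisfying the "greater than the default block size" guidance. -->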
+ <property>
+ <name>hbase.regionserver.checksum.verify</name>
+ <value>true</value>
+ <description>
+ If set to true (the default), HBase verifies the checksums for hfile
+ blocks. HBase writes checksums inline with the data when it writes out
+ hfiles. HDFS (as of this writing) writes checksums to a separate file
+ than the data file necessitating extra seeks. Setting this flag saves
+ some on i/o. Checksum verification by HDFS will be internally disabled
+ on hfile streams when this flag is set. If the hbase-checksum verification
+ fails, we will switch back to using HDFS checksums (so do not disable HDFS
+ checksums! And besides this feature applies to hfiles only, not to WALs).
+ If this parameter is set to false, then hbase will not verify any checksums,
+ instead it will depend on checksum verification being done in the HDFS client.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.bytes.per.checksum</name>
+ <value>16384</value>
+ <description>
+ Number of bytes in a newly created checksum chunk for HBase-level
+ checksums in hfile blocks.
+ </description>
+ </property>
+ <property>
+ <name>hbase.hstore.checksum.algorithm</name>
+ <value>CRC32C</value>
+ <description>
+ Name of an algorithm that is used to compute checksums. Possible values
+ are NULL, CRC32, CRC32C.
+ </description>
+ </property>
+ <property>
+ <name>hbase.client.scanner.max.result.size</name>
+ <value>2097152</value>
+ <description>Maximum number of bytes returned when calling a scanner's next method.
+ Note that when a single row is larger than this limit the row is still returned completely.
+ The default value is 2MB, which is good for 1GbE networks.
+ With faster and/or high latency networks this value should be increased.
+ </description>
+ </property>
+ <property>
+ <name>hbase.server.scanner.max.result.size</name>
+ <value>104857600</value>
+ <description>Maximum number of bytes returned when calling a scanner's next method.
+ Note that when a single row is larger than this limit the row is still returned completely.
+ The default value is 100MB.
+ This is a safety setting to protect the server from OOM situations.
+ </description>
+ </property>
+ <property>
+ <name>hbase.status.published</name>
+ <value>false</value>
+ <description>
+ This setting activates the publication by the master of the status of the region server.
+ When a region server dies and its recovery starts, the master will push this information
+ to the client application, to let them cut the connection immediately instead of waiting
+ for a timeout.
+ </description>
+ </property>
+ <property>
+ <name>hbase.status.publisher.class</name>
+ <value>org.apache.hadoop.hbase.master.ClusterStatusPublisher$MulticastPublisher</value>
+ <description>
+ Implementation of the status publication with a multicast message.
+ </description>
+ </property>
+ <property>
+ <name>hbase.status.listener.class</name>
+ <value>org.apache.hadoop.hbase.client.ClusterStatusListener$MulticastListener</value>
+ <description>
+ Implementation of the status listener with a multicast message.
+ </description>
+ </property>
+ <property>
+ <name>hbase.status.multicast.address.ip</name>
+ <value>226.1.1.3</value>
+ <description>
+ Multicast address to use for the status publication by multicast.
+ </description>
+ </property>
+ <property>
+ <name>hbase.status.multicast.address.port</name>
+ <value>16100</value>
+ <description>
+ Multicast port to use for the status publication by multicast.
+ </description>
+ </property>
+ <property>
+ <name>hbase.dynamic.jars.dir</name>
+ <value>${hbase.rootdir}/lib</value>
+ <description>
+ The directory from which the custom filter JARs can be loaded
+ dynamically by the region server without the need to restart. However,
+ an already loaded filter/co-processor class would not be un-loaded. See
+ HBASE-1936 for more details.
+
+ Does not apply to coprocessors.
+ </description>
+ </property>
+ <property>
+ <name>hbase.security.authentication</name>
+ <value>simple</value>
+ <description>
+ Controls whether or not secure authentication is enabled for HBase.
+ Possible values are 'simple' (no authentication), and 'kerberos'.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.filter.classes</name>
+ <value>org.apache.hadoop.hbase.rest.filter.GzipFilter</value>
+ <description>
+ Servlet filters for REST service.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.loadbalancer.class</name>
+ <value>org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer</value>
+ <description>
+ Class used to execute the regions balancing when the period occurs.
+ See the class comment for more on how it works
+ http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.html
+ It replaces the DefaultLoadBalancer as the default (since renamed
+ as the SimpleLoadBalancer).
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.loadbalance.bytable</name>
+ <value>false</value>
+ <description>Whether to factor in the table name when the balancer runs.
+ Default: false.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.normalizer.class</name>
+ <value>org.apache.hadoop.hbase.master.normalizer.SimpleRegionNormalizer</value>
+ <description>
+ Class used to execute the region normalization when the period occurs.
+ See the class comment for more on how it works
+ http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/normalizer/SimpleRegionNormalizer.html
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest.csrf.enabled</name>
+ <value>false</value>
+ <description>
+ Set to true to enable protection against cross-site request forgery (CSRF)
+ </description>
+ </property>
+ <property>
+ <name>hbase.rest-csrf.browser-useragents-regex</name>
+ <value>^Mozilla.*,^Opera.*</value>
+ <description>
+ A comma-separated list of regular expressions used to match against an HTTP
+ request's User-Agent header when protection against cross-site request
+ forgery (CSRF) is enabled for REST server by setting
+ hbase.rest.csrf.enabled to true. If the incoming User-Agent matches
+ any of these regular expressions, then the request is considered to be sent
+ by a browser, and therefore CSRF prevention is enforced. If the request's
+ User-Agent does not match any of these regular expressions, then the request
+ is considered to be sent by something other than a browser, such as scripted
+ automation. In this case, CSRF is not a potential attack vector, so
+ the prevention is not enforced. This helps achieve backwards-compatibility
+ with existing automation that has not been updated to send the CSRF
+ prevention header.
+ </description>
+ </property>
+ <property>
+ <name>hbase.security.exec.permission.checks</name>
+ <value>false</value>
+ <description>
+ If this setting is enabled and ACL based access control is active (the
+ AccessController coprocessor is installed either as a system coprocessor
+ or on a table as a table coprocessor) then you must grant all relevant
+ users EXEC privilege if they require the ability to execute coprocessor
+ endpoint calls. EXEC privilege, like any other permission, can be
+ granted globally to a user, or to a user on a per table or per namespace
+ basis. For more information on coprocessor endpoints, see the coprocessor
+ section of the HBase online manual. For more information on granting or
+ revoking permissions using the AccessController, see the security
+ section of the HBase online manual.
+ </description>
+ </property>
+ <property>
+ <name>hbase.procedure.regionserver.classes</name>
+ <value></value>
+ <description>A comma-separated list of
+ org.apache.hadoop.hbase.procedure.RegionServerProcedureManager procedure managers that are
+ loaded by default on the active HRegionServer process. The lifecycle methods (init/start/stop)
+ will be called by the active HRegionServer process to perform the specific globally barriered
+ procedure. After implementing your own RegionServerProcedureManager, just put it in
+ HBase's classpath and add the fully qualified class name here.
+ </description>
+ </property>
+ <property>
+ <name>hbase.procedure.master.classes</name>
+ <value></value>
+ <description>A comma-separated list of
+ org.apache.hadoop.hbase.procedure.MasterProcedureManager procedure managers that are
+ loaded by default on the active HMaster process. A procedure is identified by its signature and
+ users can use the signature and an instant name to trigger an execution of a globally barriered
+ procedure. After implementing your own MasterProcedureManager, just put it in HBase's classpath
+ and add the fully qualified class name here.
+ </description>
+ </property>
+ <property>
+ <name>hbase.coordinated.state.manager.class</name>
+ <value>org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager</value>
+ <description>Fully qualified name of class implementing coordinated state manager.</description>
+ </property>
+ <property>
+ <name>hbase.regionserver.storefile.refresh.period</name>
+ <value>0</value>
+ <description>
+ The period (in milliseconds) for refreshing the store files for the secondary regions. 0
+ means this feature is disabled. Secondary regions see new files (from flushes and
+ compactions) from the primary once the secondary region refreshes the list of files in the
+ region (there is no notification mechanism). But too frequent refreshes might cause
+ extra Namenode pressure. If the files cannot be refreshed for longer than HFile TTL
+ (hbase.master.hfilecleaner.ttl) the requests are rejected. Configuring HFile TTL to a larger
+ value is also recommended with this setting.
+ </description>
+ </property>
+ <property>
+ <name>hbase.region.replica.replication.enabled</name>
+ <value>false</value>
+ <description>
+ Whether asynchronous WAL replication to the secondary region replicas is enabled or not.
+ If this is enabled, a replication peer named "region_replica_replication" will be created
+ which will tail the logs and replicate the mutations to region replicas for tables that
+ have region replication > 1. If this is enabled once, disabling this replication also
+ requires disabling the replication peer using shell or Admin java class.
+ Replication to secondary region replicas works over standard inter-cluster replication.
+ </description>
+ </property>
+ <property>
+ <name>hbase.http.filter.initializers</name>
+ <value>org.apache.hadoop.hbase.http.lib.StaticUserWebFilter</value>
+ <description>
+ A comma separated list of class names. Each class in the list must extend
+ org.apache.hadoop.hbase.http.FilterInitializer. The corresponding Filter will
+ be initialized. Then, the Filter will be applied to all user facing jsp
+ and servlet web pages.
+ The ordering of the list defines the ordering of the filters.
+ The default StaticUserWebFilter add a user principal as defined by the
+ hbase.http.staticuser.user property.
+ </description>
+ </property>
+ <property>
+ <name>hbase.security.visibility.mutations.checkauths</name>
+ <value>false</value>
+ <description>
+ If this property is enabled, it will check whether the labels in the visibility
+ expression are associated with the user issuing the mutation
+ </description>
+ </property>
+ <property>
+ <name>hbase.http.max.threads</name>
+ <value>16</value>
+ <description>
+ The maximum number of threads that the HTTP Server will create in its
+ ThreadPool.
+ </description>
+ </property>
+ <property>
+ <name>hbase.replication.rpc.codec</name>
+ <value>org.apache.hadoop.hbase.codec.KeyValueCodecWithTags</value>
+ <description>
+ The codec that is to be used when replication is enabled so that
+ the tags are also replicated. This is used along with HFileV3 which
+ supports tags in them. If tags are not used or if the hfile version used
+ is HFileV2 then KeyValueCodec can be used as the replication codec. Note that
+ using KeyValueCodecWithTags for replication when there are no tags causes no harm.
+ </description>
+ </property>
+ <property>
+ <name>hbase.replication.source.maxthreads</name>
+ <value>10</value>
+ <description>
+ The maximum number of threads any replication source will use for
+ shipping edits to the sinks in parallel. This also limits the number of
+ chunks each replication batch is broken into. Larger values can improve
+ the replication throughput between the master and slave clusters. The
+ default of 10 will rarely need to be changed.
+ </description>
+ </property>
+ <!-- Static Web User Filter properties. -->
+ <property>
+ <name>hbase.http.staticuser.user</name>
+ <value>dr.stack</value>
+ <description>
+ The user name to filter as, on static web filters
+ while rendering content. An example use is the HDFS
+ web UI (user to be used for browsing files).
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.handler.abort.on.error.percent</name>
+ <value>0.5</value>
+ <description>The percent of region server RPC handler threads that must fail for the RS to abort.
+ -1 Disable aborting; 0 Abort if even a single handler has died;
+ 0.x Abort only when this percent of handlers have died;
+ 1 Abort only when all of the handlers have died.
+ </description>
+ </property>
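+ <!-- For illustration (assuming a hypothetical 30 RPC handler threads): with the default
+ of 0.5, the regionserver aborts once 0.5 * 30 = 15 handlers have died. -->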
+ <!-- Mob properties. -->
+ <property>
+ <name>hbase.mob.file.cache.size</name>
+ <value>1000</value>
+ <description>
+ Number of opened file handlers to cache.
+ A larger value will benefit reads by providing more file handlers per mob
+ file cache and would reduce frequent file opening and closing.
+ However, if this is set too high, it could lead to a "too many opened file handlers" error.
+ The default value is 1000.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.cache.evict.period</name>
+ <value>3600</value>
+ <description>
+ The amount of time in seconds before the mob cache evicts cached mob files.
+ The default value is 3600 seconds.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.cache.evict.remain.ratio</name>
+ <value>0.5f</value>
+ <description>
+ The ratio (between 0.0 and 1.0) of files that remains cached after an eviction
+ is triggered when the number of cached mob files exceeds the hbase.mob.file.cache.size.
+ The default value is 0.5f.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.mob.ttl.cleaner.period</name>
+ <value>86400</value>
+ <description>
+ The period that ExpiredMobFileCleanerChore runs. The unit is second.
+ The default value is one day. The MOB file name uses only the date part of
+ the file creation time in it. We use this time for deciding TTL expiry of
+ the files. So the removal of TTL expired files might be delayed. The max
+ delay might be 24 hrs.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.compaction.mergeable.threshold</name>
+ <value>1342177280</value>
+ <description>
+ If the size of a mob file is less than this value, it's regarded as a small
+ file and needs to be merged in mob compaction. The default value is 1280MB.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.delfile.max.count</name>
+ <value>3</value>
+ <description>
+ The max number of del files that is allowed in the mob compaction.
+ In the mob compaction, when the number of existing del files is larger than
+ this value, they are merged until the number of del files is not larger than this value.
+ The default value is 3.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.compaction.batch.size</name>
+ <value>100</value>
+ <description>
+ The max number of the mob files that is allowed in a batch of the mob compaction.
+ The mob compaction merges the small mob files to bigger ones. If the number of the
+ small files is very large, it could lead to a "too many opened file handlers" in the merge.
+ And the merge has to be split into batches. This value limits the number of mob files
+ that are selected in a batch of the mob compaction. The default value is 100.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.compaction.chore.period</name>
+ <value>604800</value>
+ <description>
+ The period that MobCompactionChore runs. The unit is second.
+ The default value is one week.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.compactor.class</name>
+ <value>org.apache.hadoop.hbase.mob.compactions.PartitionedMobCompactor</value>
+ <description>
+ Implementation of mob compactor, the default one is PartitionedMobCompactor.
+ </description>
+ </property>
+ <property>
+ <name>hbase.mob.compaction.threads.max</name>
+ <value>1</value>
+ <description>
+ The max number of threads used in MobCompactor.
+ </description>
+ </property>
+ <property>
+ <name>hbase.snapshot.master.timeout.millis</name>
+ <value>300000</value>
+ <description>
+ Timeout for master for the snapshot procedure execution.
+ </description>
+ </property>
+ <property>
+ <name>hbase.snapshot.region.timeout</name>
+ <value>300000</value>
+ <description>
+ Timeout for regionservers to keep threads in snapshot request pool waiting.
+ </description>
+ </property>
+ <property>
+ <name>hbase.rpc.rows.warning.threshold</name>
+ <value>5000</value>
+ <description>
+ Number of rows in a batch operation above which a warning will be logged.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.wait.on.service.seconds</name>
+ <value>30</value>
+ <description>Default is 5 minutes. Make it 30 seconds for tests. See
+ HBASE-19794 for some context.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.cleaner.snapshot.interval</name>
+ <value>1800000</value>
+ <description>
+ Snapshot Cleanup chore interval in milliseconds.
+ The cleanup thread keeps running at this interval
+ to find all snapshots that are expired based on TTL
+ and delete them.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.snapshot.ttl</name>
+ <value>0</value>
+ <description>
+ Default snapshot TTL to be considered when the user does not specify a TTL while
+ creating the snapshot. The default value 0 indicates FOREVER - the snapshot should not be
+ automatically deleted until it is manually deleted.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.regions.recovery.check.interval</name>
+ <value>1200000</value>
+ <description>
+ Regions Recovery Chore interval in milliseconds.
+ This chore keeps running at this interval to
+ find all regions with the configurable max store file ref count
+ and reopen them.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regions.recovery.store.file.ref.count</name>
+ <value>-1</value>
+ <description>
+ A very large ref count on a compacted
+ store file indicates a ref leak
+ on that object (compacted store file).
+ Such files cannot be removed after
+ they are invalidated via compaction.
+ The only way to recover in such a scenario is to
+ reopen the region, which releases
+ all resources, like the refcount,
+ leases, etc. This config represents the store file ref
+ count threshold value considered for reopening
+ regions. Any region with compacted store files
+ ref count > this value would be eligible for
+ reopening by the master. Here, we take the max
+ refCount among all refCounts on all
+ compacted-away store files that belong to a
+ particular region. The default value -1 indicates
+ this feature is turned off. Only a positive
+ integer value should be provided to
+ enable this feature.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.slowlog.ringbuffer.size</name>
+ <value>256</value>
+ <description>
+ Default size of the ring buffer to be maintained by each RegionServer in order
+ to store online slowlog responses. This is an in-memory ring buffer of
+ requests that were judged to be too slow, in addition to the responseTooSlow
+ logging. The in-memory representation is complete.
+ For more details, please see the doc section:
+ "Get Slow Response Log from shell"
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.slowlog.buffer.enabled</name>
+ <value>false</value>
+ <description>
+ Indicates whether RegionServers have a ring buffer running for storing
+ online slow logs in FIFO manner with limited entries. The size of
+ the ring buffer is indicated by the config hbase.regionserver.slowlog.ringbuffer.size.
+ The default value is false; turn this on to get the latest slowlog
+ responses with complete data.
+ </description>
+ </property>
+ <property>
+ <name>hbase.regionserver.slowlog.systable.enabled</name>
+ <value>false</value>
+ <description>
+ Should be enabled only if hbase.regionserver.slowlog.buffer.enabled is enabled. If enabled
+ (true), all slow/large RPC logs would be persisted to system table hbase:slowlog (in addition
+ to in-memory ring buffer at each RegionServer). The records are stored in increasing
+ order of time. Operators can scan the table with various combinations of ColumnValueFilter.
+ More details are provided in the doc section:
+ "Get Slow/Large Response Logs from System table hbase:slowlog"
+ </description>
+ </property>
+ <property>
+ <name>hbase.rpc.rows.size.threshold.reject</name>
+ <value>false</value>
+ <description>
+ If the value is true, the RegionServer will abort Put/Delete batch requests with a number of rows
+ in a batch operation exceeding the threshold defined by the config
+ hbase.rpc.rows.warning.threshold. The default value is false and hence, by default, only a
+ warning will be logged. This config should be turned on to prevent the RegionServer from serving
+ very large batches of rows; this way we can improve CPU usage by discarding
+ too large batch requests.
+ </description>
+ </property>
+ <property>
+ <name>hbase.namedqueue.provider.classes</name>
+ <value>
+ org.apache.hadoop.hbase.namequeues.impl.SlowLogQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerDecisionQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerRejectionQueueService
+ </value>
+ <description>
+ Default values for NamedQueueService implementors. This comma-separated list of full class names
+ represents all implementors of NamedQueueService that we would like to be invoked by the
+ LogEvent handler service. One example of a NamedQueue service is SlowLogQueueService, which
+ is used to store slow/large RPC logs in a ring buffer at each RegionServer.
+ All implementors of NamedQueueService should be found under the package
+ "org.apache.hadoop.hbase.namequeues.impl"
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.balancer.decision.buffer.enabled</name>
+ <value>false</value>
+ <description>
+ Indicates whether the active HMaster has a ring buffer running for storing
+ balancer decisions in FIFO manner with limited entries. The size of
+ the ring buffer is indicated by the config hbase.master.balancer.decision.queue.size.
+ </description>
+ </property>
+ <property>
+ <name>hbase.master.balancer.rejection.buffer.enabled</name>
+ <value>false</value>
+ <description>
+ Indicates whether the active HMaster has a ring buffer running for storing
+ balancer rejections in FIFO manner with limited entries. The size of
+ the ring buffer is indicated by the config hbase.master.balancer.rejection.queue.size.
+ </description>
+ </property>
+</configuration>
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java
index cc59b46..190ad39 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java
@@ -19,12 +19,13 @@
package org.apache.hudi.common.fs.inline;
import org.apache.hudi.common.testutils.FileSystemTestUtils;
-import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
+import org.apache.hudi.io.storage.HoodieHFileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
@@ -39,10 +40,12 @@ import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
+import static org.apache.hadoop.hbase.CellComparatorImpl.COMPARATOR;
import static org.apache.hudi.common.testutils.FileSystemTestUtils.FILE_SCHEME;
import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
import static org.apache.hudi.common.testutils.FileSystemTestUtils.getPhantomFile;
@@ -56,11 +59,12 @@ import static org.junit.jupiter.api.Assertions.assertNotEquals;
*/
public class TestInLineFileSystemHFileInLining {
+ private static final String LOCAL_FORMATTER = "%010d";
+ private static final String VALUE_PREFIX = "value";
+ private static final int MIN_BLOCK_BYTES = 1024;
private final Configuration inMemoryConf;
private final Configuration inlineConf;
- private final int minBlockSize = 1024;
- private static final String LOCAL_FORMATTER = "%010d";
- private int maxRows = 100 + RANDOM.nextInt(1000);
+ private final int maxRows = 100 + RANDOM.nextInt(1000);
private Path generatedPath;
public TestInLineFileSystemHFileInLining() {
@@ -88,12 +92,11 @@ public class TestInLineFileSystemHFileInLining {
CacheConfig cacheConf = new CacheConfig(inMemoryConf);
FSDataOutputStream fout = createFSOutput(outerInMemFSPath, inMemoryConf);
HFileContext meta = new HFileContextBuilder()
- .withBlockSize(minBlockSize)
+ .withBlockSize(MIN_BLOCK_BYTES).withCellComparator(COMPARATOR)
.build();
HFile.Writer writer = HFile.getWriterFactory(inMemoryConf, cacheConf)
.withOutputStream(fout)
.withFileContext(meta)
- .withComparator(new HoodieHBaseKVComparator())
.create();
writeRecords(writer);
@@ -110,9 +113,8 @@ public class TestInLineFileSystemHFileInLining {
InLineFileSystem inlineFileSystem = (InLineFileSystem) inlinePath.getFileSystem(inlineConf);
FSDataInputStream fin = inlineFileSystem.open(inlinePath);
- HFile.Reader reader = HFile.createReader(inlineFileSystem, inlinePath, cacheConf, inlineConf);
- // Load up the index.
- reader.loadFileInfo();
+ HFile.Reader reader =
+ HoodieHFileUtils.createHFileReader(inlineFileSystem, inlinePath, cacheConf, inlineConf);
// Get a scanner that caches and that does not use pread.
HFileScanner scanner = reader.getScanner(true, false);
// Align scanner at start of the file.
@@ -121,21 +123,24 @@ public class TestInLineFileSystemHFileInLining {
Set<Integer> rowIdsToSearch = getRandomValidRowIds(10);
for (int rowId : rowIdsToSearch) {
- assertEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))),
+ KeyValue keyValue = new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId));
+ assertEquals(0, scanner.seekTo(keyValue),
"location lookup failed");
// read the key and see if it matches
- ByteBuffer readKey = scanner.getKey();
- assertArrayEquals(getSomeKey(rowId), Bytes.toBytes(readKey), "seeked key does not match");
- scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId)));
+ Cell cell = scanner.getCell();
+ byte[] key = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength());
+ byte[] expectedKey = Arrays.copyOfRange(keyValue.getRowArray(), keyValue.getRowOffset(), keyValue.getRowOffset() + keyValue.getRowLength());
+ assertArrayEquals(expectedKey, key, "seeked key does not match");
+ scanner.seekTo(keyValue);
ByteBuffer val1 = scanner.getValue();
- scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId)));
+ scanner.seekTo(keyValue);
ByteBuffer val2 = scanner.getValue();
assertArrayEquals(Bytes.toBytes(val1), Bytes.toBytes(val2));
}
int[] invalidRowIds = {-4, maxRows, maxRows + 1, maxRows + 120, maxRows + 160, maxRows + 1000};
for (int rowId : invalidRowIds) {
- assertNotEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))),
+ assertNotEquals(0, scanner.seekTo(new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId))),
"location lookup should have failed");
}
reader.close();
@@ -155,7 +160,7 @@ public class TestInLineFileSystemHFileInLining {
}
private byte[] getSomeKey(int rowId) {
- KeyValue kv = new KeyValue(String.format(LOCAL_FORMATTER, Integer.valueOf(rowId)).getBytes(),
+ KeyValue kv = new KeyValue(String.format(LOCAL_FORMATTER, rowId).getBytes(),
Bytes.toBytes("family"), Bytes.toBytes("qual"), HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put);
return kv.getKey();
}
@@ -169,17 +174,15 @@ public class TestInLineFileSystemHFileInLining {
writer.close();
}
- private int writeSomeRecords(HFile.Writer writer)
+ private void writeSomeRecords(HFile.Writer writer)
throws IOException {
- String value = "value";
KeyValue kv;
for (int i = 0; i < (maxRows); i++) {
- String key = String.format(LOCAL_FORMATTER, Integer.valueOf(i));
+ String key = String.format(LOCAL_FORMATTER, i);
kv = new KeyValue(Bytes.toBytes(key), Bytes.toBytes("family"), Bytes.toBytes("qual"),
- Bytes.toBytes(value + key));
+ Bytes.toBytes(VALUE_PREFIX + key));
writer.append(kv);
}
- return (maxRows);
}
private void readAllRecords(HFileScanner scanner) throws IOException {
@@ -187,30 +190,31 @@ public class TestInLineFileSystemHFileInLining {
}
// read the records and check
- private int readAndCheckbytes(HFileScanner scanner, int start, int n)
+ private void readAndCheckbytes(HFileScanner scanner, int start, int n)
throws IOException {
- String value = "value";
int i = start;
for (; i < (start + n); i++) {
- ByteBuffer key = scanner.getKey();
- ByteBuffer val = scanner.getValue();
- String keyStr = String.format(LOCAL_FORMATTER, Integer.valueOf(i));
- String valStr = value + keyStr;
+ Cell cell = scanner.getCell();
+ byte[] key = Arrays.copyOfRange(
+ cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength());
+ byte[] val = Arrays.copyOfRange(
+ cell.getValueArray(), cell.getValueOffset(), cell.getValueOffset() + cell.getValueLength());
+ String keyStr = String.format(LOCAL_FORMATTER, i);
+ String valStr = VALUE_PREFIX + keyStr;
KeyValue kv = new KeyValue(Bytes.toBytes(keyStr), Bytes.toBytes("family"),
Bytes.toBytes("qual"), Bytes.toBytes(valStr));
- byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(Bytes.toBytes(key), 0,
- Bytes.toBytes(key).length).getKey();
- assertArrayEquals(kv.getKey(), keyBytes,
- "bytes for keys do not match " + keyStr + " " + Bytes.toString(Bytes.toBytes(key)));
- byte[] valBytes = Bytes.toBytes(val);
- assertArrayEquals(Bytes.toBytes(valStr), valBytes,
- "bytes for vals do not match " + valStr + " " + Bytes.toString(valBytes));
+ byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(key, 0, key.length).getKey();
+ byte[] expectedKeyBytes = Arrays.copyOfRange(
+ kv.getRowArray(), kv.getRowOffset(), kv.getRowOffset() + kv.getRowLength());
+ assertArrayEquals(expectedKeyBytes, keyBytes,
+ "bytes for keys do not match " + keyStr + " " + Bytes.toString(key));
+ assertArrayEquals(Bytes.toBytes(valStr), val,
+ "bytes for vals do not match " + valStr + " " + Bytes.toString(val));
if (!scanner.next()) {
break;
}
}
assertEquals(i, start + n - 1);
- return (start + n);
}
private long generateOuterFile(Path outerPath, byte[] inlineBytes) throws IOException {
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java
index e9b06e6..0772dc6 100755
--- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java
@@ -1886,11 +1886,16 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List<IndexedRecord> records,
Map<HeaderMetadataType, String> header) {
+ return getDataBlock(dataBlockType, records, header, new Path("dummy_path"));
+ }
+
+ private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List<IndexedRecord> records,
+ Map<HeaderMetadataType, String> header, Path pathForReader) {
switch (dataBlockType) {
case AVRO_DATA_BLOCK:
return new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
case HFILE_DATA_BLOCK:
- return new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ);
+ return new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ, pathForReader);
case PARQUET_DATA_BLOCK:
return new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP);
default:
diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java
index 1185be6..7557938 100644
--- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java
+++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java
@@ -18,9 +18,6 @@
package org.apache.hudi.hadoop.testutils;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.RawLocalFileSystem;
-import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
@@ -44,7 +41,10 @@ import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RawLocalFileSystem;
+import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;
@@ -373,7 +373,8 @@ public class InputFormatTestUtil {
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString());
HoodieDataBlock dataBlock = null;
if (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) {
- dataBlock = new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ);
+ dataBlock = new HoodieHFileDataBlock(
+ records, header, Compression.Algorithm.GZ, writer.getLogFile().getPath());
} else if (logBlockType == HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK) {
dataBlock = new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP);
} else {
diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java
index 4c0265c..7ec2ba5 100644
--- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java
+++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java
@@ -223,7 +223,7 @@ public abstract class ITTestBase {
boolean completed =
dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false).exec(callback)
- .awaitCompletion(540, SECONDS);
+ .awaitCompletion(540, SECONDS);
if (!completed) {
callback.getStderr().flush();
callback.getStdout().flush();
diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml
index 30ee37a..4d0b18c 100644
--- a/packaging/hudi-flink-bundle/pom.xml
+++ b/packaging/hudi-flink-bundle/pom.xml
@@ -70,6 +70,7 @@
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -138,7 +139,7 @@
<include>org.apache.hive:hive-service</include>
<include>org.apache.hive:hive-service-rpc</include>
<include>org.apache.hive:hive-exec</include>
- <include>org.apache.hive:hive-standalone-metastore</include>
+ <include>org.apache.hive:hive-standalone-metastore</include>
<include>org.apache.hive:hive-metastore</include>
<include>org.apache.hive:hive-jdbc</include>
<include>org.datanucleus:datanucleus-core</include>
@@ -148,10 +149,18 @@
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-client</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
<include>org.apache.hbase:hbase-server</include>
- <include>org.apache.hbase:hbase-protocol</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>commons-codec:commons-codec</include>
+ <include>commons-io:commons-io</include>
</includes>
</artifactSet>
<relocations>
@@ -164,6 +173,25 @@
<shadedPattern>${flink.bundle.shade.prefix}org.apache.avro.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>com.yammer.metrics.</pattern>
<shadedPattern>${flink.bundle.shade.prefix}com.yammer.metrics.</shadedPattern>
</relocation>
@@ -192,6 +220,74 @@
<pattern>com.fasterxml.jackson.</pattern>
<shadedPattern>${flink.bundle.shade.prefix}com.fasterxml.jackson.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<filters>
<filter>
@@ -201,6 +297,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -273,11 +371,23 @@
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<version>${project.version}</version>
+ <exclusions>
+ <exclusion>
+ <artifactId>guava</artifactId>
+ <groupId>com.google.guava</groupId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
+ <exclusions>
+ <exclusion>
+ <artifactId>guava</artifactId>
+ <groupId>com.google.guava</groupId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
@@ -288,6 +398,10 @@
<artifactId>rocksdbjni</artifactId>
<groupId>org.rocksdb</groupId>
</exclusion>
+ <exclusion>
+ <artifactId>guava</artifactId>
+ <groupId>com.google.guava</groupId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
@@ -551,66 +665,6 @@
<artifactId>jackson-annotations</artifactId>
<scope>compile</scope>
</dependency>
-
- <!-- Hbase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- <exclusions>
- <exclusion>
- <artifactId>guava</artifactId>
- <groupId>com.google.guava</groupId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <artifactId>guava</artifactId>
- <groupId>com.google.guava</groupId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-protocol</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.htrace</groupId>
- <artifactId>htrace-core</artifactId>
- <version>${htrace.version}</version>
- </dependency>
</dependencies>
<profiles>
diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml
index f6215b1..337afc1 100644
--- a/packaging/hudi-hadoop-mr-bundle/pom.xml
+++ b/packaging/hudi-hadoop-mr-bundle/pom.xml
@@ -55,12 +55,13 @@
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
- <addHeader>true</addHeader>
+ <addHeader>true</addHeader>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -74,11 +75,19 @@
<include>com.esotericsoftware:minlog</include>
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-client</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>com.yammer.metrics:metrics-core</include>
<include>com.google.guava:guava</include>
+ <include>commons-io:commons-io</include>
</includes>
</artifactSet>
<relocations>
@@ -103,6 +112,25 @@
<shadedPattern>org.apache.hudi.org.apache.avro.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.apache.parquet.avro.</pattern>
<shadedPattern>org.apache.hudi.org.apache.parquet.avro.</shadedPattern>
</relocation>
@@ -110,6 +138,74 @@
<pattern>com.google.common.</pattern>
<shadedPattern>org.apache.hudi.com.google.common.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<createDependencyReducedPom>false</createDependencyReducedPom>
<filters>
@@ -120,6 +216,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -167,48 +265,5 @@
<version>${avro.version}</version>
<scope>compile</scope>
</dependency>
-
- <dependency>
- <groupId>org.apache.htrace</groupId>
- <artifactId>htrace-core</artifactId>
- <version>${htrace.version}</version>
- <scope>compile</scope>
- </dependency>
-
- <!-- HBase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
</dependencies>
</project>
diff --git a/packaging/hudi-integ-test-bundle/pom.xml b/packaging/hudi-integ-test-bundle/pom.xml
index b53e02a..78e76a3 100644
--- a/packaging/hudi-integ-test-bundle/pom.xml
+++ b/packaging/hudi-integ-test-bundle/pom.xml
@@ -62,6 +62,7 @@
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/services/org.apache.spark.sql.sources.DataSourceRegister</resource>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -85,6 +86,20 @@
<include>org.apache.hudi:hudi-aws</include>
<include>org.apache.hudi:hudi-integ-test</include>
+ <include>org.apache.hbase:hbase-common</include>
+ <include>org.apache.hbase:hbase-client</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
+ <include>org.apache.hbase:hbase-server</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
+ <include>commons-io:commons-io</include>
+
<include>org.jetbrains.kotlin:kotlin-stdlib-jdk8</include>
<include>org.jetbrains.kotlin:kotlin-stdlib</include>
<include>org.jetbrains.kotlin:kotlin-stdlib-common</include>
@@ -133,7 +148,6 @@
<include>org.apache.hive:hive-common</include>
<include>org.apache.hive:hive-service</include>
- <include>org.apache.hive:hive-metastore</include>
<include>org.apache.hive:hive-jdbc</include>
<include>org.apache.hive:hive-exec</include>
@@ -156,7 +170,6 @@
<include>com.fasterxml.jackson.core:jackson-databind</include>
<include>com.fasterxml.jackson.dataformat:jackson-dataformat-yaml</include>
- <include>org.apache.htrace:htrace-core</include>
<include>org.apache.curator:curator-framework</include>
<include>org.apache.curator:curator-client</include>
<include>org.apache.curator:curator-recipes</include>
@@ -180,6 +193,25 @@
<shadedPattern>org.apache.hudi.org.apache.commons.pool.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.apache.hive.jdbc.</pattern>
<shadedPattern>org.apache.hudi.org.apache.hive.jdbc.</shadedPattern>
</relocation>
@@ -259,6 +291,74 @@
<pattern>org.apache.parquet.avro.</pattern>
<shadedPattern>org.apache.hudi.org.apache.parquet.avro.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<filters>
<filter>
@@ -270,6 +370,8 @@
<!-- Use this jar's NOTICE and license file -->
<exclude>META-INF/NOTICE*</exclude>
<exclude>META-INF/LICENSE*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -398,6 +500,12 @@
<artifactId>hive-metastore</artifactId>
<version>${hive.version}</version>
<scope>provided</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml
index f66bc7f..d17b99b 100644
--- a/packaging/hudi-kafka-connect-bundle/pom.xml
+++ b/packaging/hudi-kafka-connect-bundle/pom.xml
@@ -58,14 +58,16 @@
implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
</transformer>
<transformer
- implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
+ implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
<addHeader>true</addHeader>
</transformer>
<transformer
- implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+ implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
+ <transformer
+ implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -115,13 +117,21 @@
<include>org.objenesis:objenesis</include>
<include>com.esotericsoftware:kryo-shaded</include>
<include>com.esotericsoftware:minlog</include>
-
+
<include>org.apache.hbase:hbase-client</include>
<include>org.apache.hbase:hbase-common</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>org.scala-lang:*</include>
+ <include>commons-io:commons-io</include>
</includes>
</artifactSet>
<relocations>
@@ -131,15 +141,107 @@
</relocation>
<relocation>
<pattern>com.yammer.metrics.</pattern>
- <shadedPattern>${kafka.connect.bundle.shade.prefix}com.yammer.metrics.</shadedPattern>
+ <shadedPattern>${kafka.connect.bundle.shade.prefix}com.yammer.metrics.
+ </shadedPattern>
</relocation>
<relocation>
<pattern>com.beust.jcommander.</pattern>
- <shadedPattern>${kafka.connect.bundle.shade.prefix}com.beust.jcommander.</shadedPattern>
+ <shadedPattern>${kafka.connect.bundle.shade.prefix}com.beust.jcommander.
+ </shadedPattern>
</relocation>
<relocation>
<pattern>org.eclipse.jetty.</pattern>
- <shadedPattern>${kafka.connect.bundle.shade.prefix}org.eclipse.jetty.</shadedPattern>
+ <shadedPattern>${kafka.connect.bundle.shade.prefix}org.eclipse.jetty.
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>
+ org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
</relocation>
</relocations>
<filters>
@@ -150,6 +252,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -322,13 +426,6 @@
<scope>${utilities.bundle.hive.scope}</scope>
</dependency>
- <dependency>
- <groupId>org.apache.htrace</groupId>
- <artifactId>htrace-core</artifactId>
- <version>${htrace.version}</version>
- <scope>compile</scope>
- </dependency>
-
</dependencies>
</project>
diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml
index 90c1087..ad6d2ec 100644
--- a/packaging/hudi-presto-bundle/pom.xml
+++ b/packaging/hudi-presto-bundle/pom.xml
@@ -61,6 +61,7 @@
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -75,21 +76,52 @@
<include>com.esotericsoftware:minlog</include>
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-client</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
<include>org.apache.hbase:hbase-protocol</include>
- <include>org.apache.hbase:hbase-server</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>com.yammer.metrics:metrics-core</include>
<include>com.google.guava:guava</include>
+ <include>commons-io:commons-io</include>
<include>commons-lang:commons-lang</include>
<include>com.google.protobuf:protobuf-java</include>
</includes>
</artifactSet>
<relocations>
<relocation>
+ <pattern>org.apache.parquet.avro.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.parquet.avro.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.apache.avro.</pattern>
<shadedPattern>org.apache.hudi.org.apache.avro.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.codehaus.jackson.</pattern>
<shadedPattern>org.apache.hudi.org.codehaus.jackson.</shadedPattern>
</relocation>
@@ -122,13 +154,77 @@
<shadedPattern>${presto.bundle.bootstrap.shade.prefix}com.google.protobuf.</shadedPattern>
</relocation>
<relocation>
- <pattern>org.apache.htrace.</pattern>
- <shadedPattern>${presto.bundle.bootstrap.shade.prefix}org.apache.htrace.</shadedPattern>
- </relocation>
- <relocation>
<pattern>org.apache.parquet.avro.</pattern>
<shadedPattern>${presto.bundle.bootstrap.shade.prefix}org.apache.parquet.avro.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<createDependencyReducedPom>false</createDependencyReducedPom>
<filters>
@@ -139,7 +235,9 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
<exclude>com/esotericsoftware/reflectasm/**</exclude>
+ <exclude>hbase-webapps/**</exclude>
<exclude>stringBehavior.avsc</exclude>
</excludes>
</filter>
@@ -171,20 +269,6 @@
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr-bundle</artifactId>
<version>${project.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- </exclusion>
- </exclusions>
</dependency>
<!-- Parquet -->
@@ -201,42 +285,6 @@
<scope>compile</scope>
</dependency>
- <!-- HBase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
<!--Guava needs to be shaded because HBase 1.2.3 depends on an earlier guava version i.e 12.0.1 and hits runtime
issues with the guava version present in Presto runtime-->
<dependency>
diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml
index a877d10..fd79a34 100644
--- a/packaging/hudi-spark-bundle/pom.xml
+++ b/packaging/hudi-spark-bundle/pom.xml
@@ -63,6 +63,7 @@
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/services/org.apache.spark.sql.sources.DataSourceRegister</resource>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -116,13 +117,21 @@
<include>org.apache.hbase:hbase-client</include>
<include>org.apache.hbase:hbase-common</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>org.apache.curator:curator-framework</include>
<include>org.apache.curator:curator-client</include>
<include>org.apache.curator:curator-recipes</include>
<include>commons-codec:commons-codec</include>
+ <include>commons-io:commons-io</include>
</includes>
</artifactSet>
<relocations>
@@ -135,6 +144,25 @@
<shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.apache.spark.sql.avro.</pattern>
<shadedPattern>${spark.bundle.spark.shade.prefix}org.apache.spark.sql.avro.</shadedPattern>
</relocation>
@@ -183,6 +211,74 @@
<shadedPattern>${spark.bundle.spark.shade.prefix}com.google.common.</shadedPattern>
</relocation>
<!-- TODO: Revisit GH ISSUE #533 & PR#633-->
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<filters>
<filter>
@@ -192,6 +288,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -314,58 +412,6 @@
<scope>${spark.bundle.hive.scope}</scope>
</dependency>
- <dependency>
- <groupId>org.apache.htrace</groupId>
- <artifactId>htrace-core</artifactId>
- <version>${htrace.version}</version>
- <scope>compile</scope>
- </dependency>
-
- <!-- Hbase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-protocol</artifactId>
- <version>${hbase.version}</version>
- </dependency>
-
<!-- zookeeper -->
<dependency>
<groupId>org.apache.curator</groupId>
diff --git a/packaging/hudi-timeline-server-bundle/pom.xml b/packaging/hudi-timeline-server-bundle/pom.xml
index 18f7c96..e1e5d98 100644
--- a/packaging/hudi-timeline-server-bundle/pom.xml
+++ b/packaging/hudi-timeline-server-bundle/pom.xml
@@ -155,6 +155,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -198,17 +200,114 @@
<include>com.fasterxml.jackson.core:jackson-annotations</include>
<include>com.fasterxml.jackson.core:jackson-core</include>
<include>com.fasterxml.jackson.core:jackson-databind</include>
- <include>org.apache.htrace:htrace-core</include>
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-client</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>com.esotericsoftware:kryo-shaded</include>
<include>com.esotericsoftware:minlog</include>
+ <include>commons-io:commons-io</include>
<include>log4j:log4j</include>
<include>org.objenesis:objenesis</include>
</includes>
</artifactSet>
+ <relocations>
+ <relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
+ </relocations>
</configuration>
<executions>
<execution>
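
Note (illustrative, not part of this commit): the per-class relocations for org.apache.hadoop.metrics2.* above exist because that package is split across jars: most of it ships with hadoop-common and must keep its original names, while a handful of classes (MetricHistogram, DynamicMetricsRegistry, the Mutable*Histogram family, and so on) ship with hbase-hadoop-compat/hbase-hadoop2-compat and are the ones pulled into the bundle. A rough way to check which jar supplies a given class, assuming an unshaded development classpath containing both hadoop-common and the HBase compat modules, is:

import java.security.CodeSource;

public class Metrics2Origin {
  static void printOrigin(String className) {
    try {
      CodeSource src = Class.forName(className).getProtectionDomain().getCodeSource();
      System.out.println(className + " -> " + (src != null ? src.getLocation() : "bootstrap/JDK"));
    } catch (ClassNotFoundException e) {
      System.out.println(className + " -> not found");
    }
  }

  public static void main(String[] args) {
    printOrigin("org.apache.hadoop.metrics2.MetricsSystem");              // from hadoop-common: must keep its name
    printOrigin("org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry"); // from the HBase compat modules: relocated above
  }
}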
diff --git a/packaging/hudi-trino-bundle/pom.xml b/packaging/hudi-trino-bundle/pom.xml
index adf73f1..7cbd502 100644
--- a/packaging/hudi-trino-bundle/pom.xml
+++ b/packaging/hudi-trino-bundle/pom.xml
@@ -62,6 +62,7 @@
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -76,23 +77,53 @@
<include>com.esotericsoftware:minlog</include>
<include>org.apache.hbase:hbase-common</include>
<include>org.apache.hbase:hbase-client</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
<include>org.apache.hbase:hbase-annotations</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>com.yammer.metrics:metrics-core</include>
<include>com.google.guava:guava</include>
<include>commons-lang:commons-lang</include>
+ <include>commons-io:commons-io</include>
<include>com.google.protobuf:protobuf-java</include>
</includes>
</artifactSet>
<relocations>
-
+ <relocation>
+ <pattern>org.apache.parquet.avro.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.parquet.avro.</shadedPattern>
+ </relocation>
<relocation>
<pattern>org.apache.avro.</pattern>
<shadedPattern>org.apache.hudi.org.apache.avro.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.codehaus.jackson.</pattern>
<shadedPattern>org.apache.hudi.org.codehaus.jackson.</shadedPattern>
</relocation>
@@ -124,6 +155,74 @@
<pattern>com.google.protobuf.</pattern>
<shadedPattern>${trino.bundle.bootstrap.shade.prefix}com.google.protobuf.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<createDependencyReducedPom>false</createDependencyReducedPom>
<filters>
@@ -134,6 +233,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -157,69 +258,8 @@
<!-- Hoodie -->
<dependency>
<groupId>org.apache.hudi</groupId>
- <artifactId>hudi-common</artifactId>
- <version>${project.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr-bundle</artifactId>
<version>${project.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <!-- HBase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
</dependency>
<!-- Parquet -->
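
Note (illustrative, not part of this commit): the new filter entries exclude raw .proto descriptors and the hbase-webapps/ UI resources that come with hbase-server, while the added ServicesResourceTransformer merges META-INF/services files and rewrites the listed implementation class names to their relocated forms. A quick post-build sanity check, with an assumed bundle jar path, could scan the jar for those entries:

import java.io.IOException;
import java.util.jar.JarFile;

public class BundleContentCheck {
  public static void main(String[] args) throws IOException {
    // Assumed path; pass the actual bundle jar as the first argument.
    String path = args.length > 0 ? args[0]
        : "packaging/hudi-utilities-bundle/target/hudi-utilities-bundle.jar";
    try (JarFile jar = new JarFile(path)) {
      jar.stream().forEach(entry -> {
        String name = entry.getName();
        if (name.endsWith(".proto") || name.startsWith("hbase-webapps/")) {
          System.out.println("unexpected (should be excluded): " + name);
        } else if (name.startsWith("META-INF/services/")) {
          System.out.println("service file kept: " + name);
        }
      });
    }
  }
}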
diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml
index 0685bae..c46a6d7 100644
--- a/packaging/hudi-utilities-bundle/pom.xml
+++ b/packaging/hudi-utilities-bundle/pom.xml
@@ -86,6 +86,7 @@
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/services/org.apache.spark.sql.sources.DataSourceRegister</resource>
</transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<artifactSet>
<includes>
@@ -149,13 +150,21 @@
<include>org.apache.hbase:hbase-client</include>
<include>org.apache.hbase:hbase-common</include>
- <include>org.apache.hbase:hbase-protocol</include>
+ <include>org.apache.hbase:hbase-hadoop-compat</include>
+ <include>org.apache.hbase:hbase-hadoop2-compat</include>
+ <include>org.apache.hbase:hbase-metrics</include>
+ <include>org.apache.hbase:hbase-metrics-api</include>
+ <include>org.apache.hbase:hbase-protocol-shaded</include>
<include>org.apache.hbase:hbase-server</include>
- <include>org.apache.htrace:htrace-core</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-miscellaneous</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-netty</include>
+ <include>org.apache.hbase.thirdparty:hbase-shaded-protobuf</include>
+ <include>org.apache.htrace:htrace-core4</include>
<include>org.apache.curator:curator-framework</include>
<include>org.apache.curator:curator-client</include>
<include>org.apache.curator:curator-recipes</include>
<include>commons-codec:commons-codec</include>
+ <include>commons-io:commons-io</include>
</includes>
</artifactSet>
<relocations>
@@ -172,6 +181,25 @@
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
</relocation>
<relocation>
+ <pattern>org.apache.commons.io.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.hbase.</shadedPattern>
+ <excludes>
+ <exclude>org.apache.hadoop.hbase.KeyValue$KeyComparator</exclude>
+ </excludes>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hbase.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hbase.</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.htrace.</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.htrace.</shadedPattern>
+ </relocation>
+ <relocation>
<pattern>org.apache.hadoop.hive.metastore.</pattern>
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
</relocation>
@@ -207,6 +235,74 @@
<pattern>org.eclipse.jetty.</pattern>
<shadedPattern>org.apache.hudi.org.eclipse.jetty.</shadedPattern>
</relocation>
+ <!-- The classes below in org.apache.hadoop.metrics2 package come from
+ hbase-hadoop-compat and hbase-hadoop2-compat, which have to be shaded one by one,
+ instead of shading all classes under org.apache.hadoop.metrics2 including ones
+ from hadoop. -->
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.MetricsExecutor</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.impl.JmxCacheBuster</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MetricsExecutorImpl</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableFastCounter</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableRangeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableSizeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.lib.MutableTimeHistogram</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricQuantile</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile
+ </shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.metrics2.util.MetricSampleQuantiles</pattern>
+ <shadedPattern>org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles
+ </shadedPattern>
+ </relocation>
</relocations>
<filters>
<filter>
@@ -216,6 +312,8 @@
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
+ <exclude>**/*.proto</exclude>
+ <exclude>hbase-webapps/**</exclude>
</excludes>
</filter>
</filters>
@@ -339,51 +437,6 @@
<scope>compile</scope>
</dependency>
- <!-- Hbase -->
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-server</artifactId>
- <version>${hbase.version}</version>
- <scope>compile</scope>
- <exclusions>
- <exclusion>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-common</artifactId>
- </exclusion>
- <exclusion>
- <groupId>javax.servlet</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- <exclusion>
- <groupId>tomcat</groupId>
- <artifactId>*</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- <version>${hbase.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-protocol</artifactId>
- <version>${hbase.version}</version>
- </dependency>
-
<!-- zookeeper -->
<dependency>
<groupId>org.apache.curator</groupId>
diff --git a/pom.xml b/pom.xml
index c61d5ef..0c67085 100644
--- a/pom.xml
+++ b/pom.xml
@@ -103,7 +103,7 @@
<log4j.test.version>2.17.0</log4j.test.version>
<slf4j.version>1.7.30</slf4j.version>
<joda.version>2.9.9</joda.version>
- <hadoop.version>2.7.3</hadoop.version>
+ <hadoop.version>2.10.1</hadoop.version>
<hive.groupid>org.apache.hive</hive.groupid>
<hive.version>2.3.1</hive.version>
<hive.exec.classifier>core</hive.exec.classifier>
@@ -138,7 +138,8 @@
<thrift.version>0.12.0</thrift.version>
<jetty.version>9.4.15.v20190215</jetty.version>
<htrace.version>3.1.0-incubating</htrace.version>
- <hbase.version>1.2.3</hbase.version>
+ <hbase.version>2.4.9</hbase.version>
+ <hbase-thirdparty.version>3.5.1</hbase-thirdparty.version>
<codehaus-jackson.version>1.9.13</codehaus-jackson.version>
<h2.version>1.4.199</h2.version>
<awaitility.version>3.1.2</awaitility.version>
@@ -170,6 +171,7 @@
<proto.version>3.17.3</proto.version>
<protoc.version>3.11.4</protoc.version>
<dynamodb.lockclient.version>1.1.0</dynamodb.lockclient.version>
+ <zookeeper.version>3.5.7</zookeeper.version>
<dynamodb-local.port>8000</dynamodb-local.port>
<dynamodb-local.endpoint>http://localhost:${dynamodb-local.port}</dynamodb-local.endpoint>
</properties>
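
Note (illustrative, not part of this commit): with hbase.version bumped to 2.4.9 (plus hbase-thirdparty 3.5.1, Hadoop 2.10.1 and ZooKeeper 3.5.7 pinned above), a simple runtime check that the intended HBase client is actually on the classpath is the sketch below; inside a shaded Hudi bundle the same class would resolve under the org.apache.hudi. prefix instead, per the relocation rules earlier in this commit.

public class HBaseVersionCheck {
  public static void main(String[] args) {
    // VersionInfo is provided by hbase-common; expects "2.4.9" after this upgrade.
    System.out.println("HBase version: " + org.apache.hadoop.hbase.util.VersionInfo.getVersion());
  }
}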
@@ -1529,7 +1531,7 @@
<link>https://docs.spring.io/spring-shell/docs/1.2.0.RELEASE</link>
<link>https://fasterxml.github.io/jackson-databind/javadoc/2.6</link>
<link>https://hadoop.apache.org/docs/r${hadoop.version}/api</link>
- <link>https://hbase.apache.org/1.2/apidocs</link>
+ <link>https://hbase.apache.org/2.4/apidocs</link>
<link>https://hive.apache.org/javadocs/r2.3.6/api</link>
<link>https://javadoc.io/static/io.javalin/javalin/2.3.0</link>
<link>https://javadoc.io/doc/org.apache.parquet/parquet-avro/${parquet.version}</link>