You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/11/16 10:47:12 UTC
[1/3] kylin git commit: KYLIN-1851 Change TrieDictionary to
TrieDictionaryForest to reduce the peak memory usage
Repository: kylin
Updated Branches:
refs/heads/master 9410b015b -> 350547e6e
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsReducer2.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsReducer2.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsReducer2.java
new file mode 100644
index 0000000..b5aeef6
--- /dev/null
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsReducer2.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.mr.steps.fdc2;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.kylin.common.KylinConfig;
+import org.apache.kylin.common.util.ByteArray;
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.cube.CubeInstance;
+import org.apache.kylin.cube.CubeManager;
+import org.apache.kylin.cube.model.CubeDesc;
+import org.apache.kylin.engine.mr.KylinReducer;
+import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
+import org.apache.kylin.engine.mr.common.BatchConstants;
+import org.apache.kylin.engine.mr.common.CubeStatsWriter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.metadata.model.TblColRef;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
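+/**
+ * Reducer of the fact-distinct-columns step. Each reducer task takes one of three roles,
+ * chosen in setup() from its task id:
+ * when statistics collection is enabled, the last task merges the per-cuboid HLL counters and
+ * writes the cuboid statistics, and the second-to-last task keeps the first and last value
+ * seen for the partition date column; every other task writes the distinct values of the
+ * dictionary column whose index equals its task id, one value per line, to a file named after
+ * that column.
+ */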
+public class FactDistinctColumnsReducer2 extends KylinReducer<SelfDefineSortableKey, Text, NullWritable, Text> {
+
+ private List<TblColRef> columnList;
+ private String statisticsOutput = null;
+ private List<Long> baseCuboidRowCountInMappers;
+ protected Map<Long, HyperLogLogPlusCounter> cuboidHLLMap = null;
+ protected long baseCuboidId;
+ protected CubeDesc cubeDesc;
+ private long totalRowsBeforeMerge = 0;
+ private int samplingPercentage;
+ private List<ByteArray> colValues;
+ private TblColRef col = null;
+ private boolean isStatistics = false;
+ private boolean isPartitionCol = false;
+ private KylinConfig cubeConfig;
+ protected static final Logger logger = LoggerFactory.getLogger(FactDistinctColumnsReducer2.class);
+
+ @Override
+ protected void setup(Context context) throws IOException {
+     super.bindCurrentConfiguration(context.getConfiguration());
+
+     // Resolve the cube being built, then cache its config, descriptor and dictionary columns.
+     final Configuration hadoopConf = context.getConfiguration();
+     final KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
+     final String cubeName = hadoopConf.get(BatchConstants.CFG_CUBE_NAME);
+     final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
+     cubeConfig = cubeInstance.getConfig();
+     cubeDesc = cubeInstance.getDescriptor();
+     columnList = CubeManager.getInstance(kylinConfig).getAllDictColumnsOnFact(cubeDesc);
+
+     final boolean statsEnabled = Boolean.parseBoolean(hadoopConf.get(BatchConstants.CFG_STATISTICS_ENABLED));
+     final int lastTaskId = context.getNumReduceTasks() - 1;
+     final int thisTaskId = context.getTaskAttemptID().getTaskID().getId();
+
+     // Role 1: the last reducer merges the HLL counters when statistics are enabled.
+     if (statsEnabled && thisTaskId == lastTaskId) {
+         isStatistics = true;
+         statisticsOutput = hadoopConf.get(BatchConstants.CFG_STATISTICS_OUTPUT);
+         baseCuboidRowCountInMappers = Lists.newArrayList();
+         cuboidHLLMap = Maps.newHashMap();
+         samplingPercentage = Integer.parseInt(hadoopConf.get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
+         return;
+     }
+
+     // Role 2: the second-to-last reducer handles the partition date column.
+     if (statsEnabled && thisTaskId == lastTaskId - 1) {
+         isStatistics = false;
+         isPartitionCol = true;
+         col = cubeDesc.getModel().getPartitionDesc().getPartitionDateColumnRef();
+         colValues = Lists.newLinkedList();
+         return;
+     }
+
+     // Role 3: every other reducer handles the dictionary column at the index of its task id.
+     isStatistics = false;
+     isPartitionCol = false;
+     col = columnList.get(thisTaskId);
+     colValues = Lists.newLinkedList();
+ }
+
+ @Override
+ protected void doReduce(SelfDefineSortableKey skey, Iterable<Text> values, Context context) throws IOException, InterruptedException {
+     final Text keyText = skey.getText();
+
+     if (isStatistics) {
+         // Statistics role: the cuboid id is the long stored right after the first key byte;
+         // every value is a serialized HLL counter for that cuboid.
+         final long cuboidId = Bytes.toLong(keyText.getBytes(), 1, Bytes.SIZEOF_LONG);
+         for (Text serialized : values) {
+             final HyperLogLogPlusCounter incoming = new HyperLogLogPlusCounter(cubeConfig.getCubeStatsHLLPrecision());
+             final ByteBuffer registers = ByteBuffer.wrap(serialized.getBytes(), 0, serialized.getLength());
+             incoming.readRegisters(registers);
+
+             totalRowsBeforeMerge += incoming.getCountEstimate();
+
+             if (cuboidId == baseCuboidId) {
+                 baseCuboidRowCountInMappers.add(incoming.getCountEstimate());
+             }
+
+             // The first counter seen for a cuboid is stored as-is; later ones are merged into it.
+             final HyperLogLogPlusCounter known = cuboidHLLMap.get(cuboidId);
+             if (known == null) {
+                 cuboidHLLMap.put(cuboidId, incoming);
+             } else {
+                 known.merge(incoming);
+             }
+         }
+         return;
+     }
+
+     // Column roles: the value is the key without its first byte.
+     final ByteArray columnValue = new ByteArray(Bytes.copy(keyText.getBytes(), 1, keyText.getLength() - 1));
+
+     if (isPartitionCol) {
+         // Keep only two entries: the first two values fill the list, after which slot 1 is
+         // overwritten by every later value (min/max, presuming keys arrive in ascending order).
+         if (colValues.size() > 1) {
+             colValues.set(1, columnValue);
+         } else {
+             colValues.add(columnValue);
+         }
+         return;
+     }
+
+     colValues.add(columnValue);
+     if (colValues.size() == 1000000) { // spill every 1 million
+         logger.info("spill values to disk...");
+         outputDistinctValues(col, colValues, context);
+         colValues.clear();
+     }
+ }
+
+ /**
+  * Writes the given values, one per line, to the file named after the column under the
+  * directory configured by {@code BatchConstants.CFG_OUTPUT_PATH}. The file is appended to
+  * if it already exists and created otherwise.
+  *
+  * The stream is closed by try-with-resources rather than IOUtils.closeQuietly, so a failure
+  * while closing the file is reported to the caller instead of being silently dropped.
+  *
+  * @param col     column whose name is used as the output file name
+  * @param values  values to write; each is followed by a '\n' byte
+  * @param context reducer context supplying the Hadoop configuration
+  * @throws IOException if the file cannot be opened, written or closed
+  */
+ private void outputDistinctValues(TblColRef col, Collection<ByteArray> values, Context context) throws IOException {
+     final Configuration conf = context.getConfiguration();
+     final FileSystem fs = FileSystem.get(conf);
+     final String outputPath = conf.get(BatchConstants.CFG_OUTPUT_PATH);
+     final Path outputFile = new Path(outputPath, col.getName());
+
+     final boolean appending = fs.exists(outputFile);
+     try (FSDataOutputStream out = appending ? fs.append(outputFile) : fs.create(outputFile)) {
+         logger.info((appending ? "append file " : "create file ") + outputFile);
+
+         for (ByteArray value : values) {
+             out.write(value.array(), value.offset(), value.length());
+             out.write('\n');
+         }
+     }
+ }
+
+ @Override
+ protected void doCleanup(Context context) throws IOException, InterruptedException {
+     if (!isStatistics) {
+         // Column roles: flush whatever has been collected since the last spill.
+         if (!colValues.isEmpty()) {
+             outputDistinctValues(col, colValues, context);
+             colValues.clear();
+         }
+         return;
+     }
+
+     // Statistics role: sum the merged estimates of all cuboids.
+     long mergedTotal = 0;
+     for (HyperLogLogPlusCounter counter : cuboidHLLMap.values()) {
+         mergedTotal += counter.getCountEstimate();
+     }
+
+     // Ratio of the row count before merging to the row count after merging; 0 when nothing was merged.
+     final double mapperOverlapRatio;
+     if (mergedTotal == 0) {
+         mapperOverlapRatio = 0;
+     } else {
+         mapperOverlapRatio = (double) totalRowsBeforeMerge / mergedTotal;
+     }
+
+     final int mapperNumber = baseCuboidRowCountInMappers.size();
+
+     writeMapperAndCuboidStatistics(context); // for human check
+     CubeStatsWriter.writeCuboidStatistics(context.getConfiguration(), new Path(statisticsOutput), //
+             cuboidHLLMap, samplingPercentage, mapperNumber, mapperOverlapRatio);
+ }
+
+ /**
+  * Writes a plain-text summary of the collected statistics to the file
+  * {@code BatchConstants.CFG_STATISTICS_CUBE_ESTIMATION_FILENAME} under the statistics
+  * output directory. The summary lists the cuboid count, the sampling percentage, the base
+  * cuboid row count of each mapper that reported a positive count, the estimated row count
+  * of every cuboid in ascending cuboid-id order, the totals before and after merging, and
+  * the mapper overlap ratio.
+  *
+  * The overlap ratio is computed in floating point, matching the value doCleanup() passes
+  * to CubeStatsWriter; it was previously truncated by long division.
+  *
+  * @param context reducer context supplying the Hadoop configuration
+  * @throws IOException if the file cannot be created or written
+  */
+ private void writeMapperAndCuboidStatistics(Context context) throws IOException {
+     Configuration conf = context.getConfiguration();
+     FileSystem fs = FileSystem.get(conf);
+     FSDataOutputStream out = fs.create(new Path(statisticsOutput, BatchConstants.CFG_STATISTICS_CUBE_ESTIMATION_FILENAME));
+
+     try {
+         // Sort the cuboid ids so the report is stable and easy to read.
+         List<Long> allCuboids = Lists.newArrayList();
+         allCuboids.addAll(cuboidHLLMap.keySet());
+         Collections.sort(allCuboids);
+
+         writeLine(out, "Total cuboid number: \t" + allCuboids.size());
+         // The label spelling is kept exactly as before so the report text does not change.
+         writeLine(out, "Samping percentage: \t" + samplingPercentage);
+
+         writeLine(out, "The following statistics are collected based on sampling data.");
+         writeLine(out, "Number of Mappers: " + baseCuboidRowCountInMappers.size());
+         for (int i = 0; i < baseCuboidRowCountInMappers.size(); i++) {
+             if (baseCuboidRowCountInMappers.get(i) > 0) {
+                 writeLine(out, "Base Cuboid in Mapper " + i + " row count: \t " + baseCuboidRowCountInMappers.get(i));
+             }
+         }
+
+         long grandTotal = 0;
+         for (Long cuboidId : allCuboids) {
+             // Look the estimate up once and reuse it for both the sum and the report line.
+             long estimate = cuboidHLLMap.get(cuboidId).getCountEstimate();
+             grandTotal += estimate;
+             writeLine(out, "Cuboid " + cuboidId + " row count is: \t " + estimate);
+         }
+
+         writeLine(out, "Sum of all the cube segments (before merge) is: \t " + totalRowsBeforeMerge);
+         writeLine(out, "After merge, the cube has row count: \t " + grandTotal);
+
+         if (grandTotal > 0) {
+             // Cast before dividing: long / long would drop the fractional part of the ratio.
+             writeLine(out, "The mapper overlap ratio is: \t" + (double) totalRowsBeforeMerge / grandTotal);
+         }
+
+     } finally {
+         IOUtils.closeQuietly(out);
+     }
+ }
+
+ /**
+  * Writes one line of text to the stream: the message encoded as UTF-8, followed by '\n'.
+  * The charset is named explicitly so the output does not depend on the JVM's platform
+  * default encoding.
+  *
+  * @param out stream to write to
+  * @param msg text of the line, without a trailing newline
+  * @throws IOException if the write fails
+  */
+ private void writeLine(FSDataOutputStream out, String msg) throws IOException {
+     out.write(msg.getBytes("UTF-8"));
+     out.write('\n');
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/SelfDefineSortableKey.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/SelfDefineSortableKey.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/SelfDefineSortableKey.java
index cadbcbf..a3351fa 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/SelfDefineSortableKey.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/SelfDefineSortableKey.java
@@ -95,7 +95,8 @@ public class SelfDefineSortableKey implements WritableComparable<SelfDefineSorta
@Override
public void readFields(DataInput dataInput) throws IOException {
- dataInput.readByte();
+ this.typeId = dataInput.readByte();
+ this.text = new Text();
text.readFields(dataInput);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/MergeCuboidMapperTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/MergeCuboidMapperTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/MergeCuboidMapperTest.java
index 01d47b8..df68f76 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/MergeCuboidMapperTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/MergeCuboidMapperTest.java
@@ -78,7 +78,7 @@ public class MergeCuboidMapperTest extends LocalFileMetadataTestCase {
values.add(new byte[] { 102, 102, 102 });
Dictionary<?> dict = DictionaryGenerator.buildDictionary(DataType.getType(newDictInfo.getDataType()), new IterableDictionaryValueEnumerator(values));
dictionaryManager.trySaveNewDict(dict, newDictInfo);
- ((TrieDictionary) dict).dump(System.out);
+ dict.dump(System.out);
return newDictInfo;
}
@@ -130,7 +130,7 @@ public class MergeCuboidMapperTest extends LocalFileMetadataTestCase {
values.add(new byte[] { 98, 98, 98 });
Dictionary<?> dict = DictionaryGenerator.buildDictionary(DataType.getType(newDictInfo.getDataType()), new IterableDictionaryValueEnumerator(values));
dictionaryManager.trySaveNewDict(dict, newDictInfo);
- ((TrieDictionary) dict).dump(System.out);
+ dict.dump(System.out);
segment.putDictResPath(lfn, newDictInfo.getResourcePath());
segment.putDictResPath(lsi, sharedDict.getResourcePath());
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
index 70197ac..554ee9c 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
@@ -35,37 +35,37 @@ import static org.junit.Assert.assertTrue;
public class NumberDictionaryForestTest {
@Test
- public void testNumberDictionaryForestLong(){
+ public void testNumberDictionaryForestLong() {
List<String> list = randomLongData(10);
- testData(list,TypeFlag.INTEGER_FAMILY_TYPE);
+ testData(list, TypeFlag.INTEGER_FAMILY_TYPE);
}
@Test
- public void testNumberDictionaryForestDouble(){
+ public void testNumberDictionaryForestDouble() {
List<String> list = randomDoubleData(10);
- testData(list,TypeFlag.DOUBLE_FAMILY_TYPE);
+ testData(list, TypeFlag.DOUBLE_FAMILY_TYPE);
}
- private void testData(List<String> list,TypeFlag flag){
+ private void testData(List<String> list, TypeFlag flag) {
//stimulate map-reduce job
- ArrayList<SelfDefineSortableKey> keyList = createKeyList(list,(byte)flag.ordinal());
+ ArrayList<SelfDefineSortableKey> keyList = createKeyList(list, (byte) flag.ordinal());
Collections.sort(keyList);
//build tree
NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>(
- new StringBytesConverter(),0);
- TrieDictionaryForestBuilder.MaxTrieTreeSize = 0;
- for(SelfDefineSortableKey key : keyList){
+ new StringBytesConverter(), 0, 0);
+
+ for (SelfDefineSortableKey key : keyList) {
String fieldValue = printKey(key);
b.addValue(fieldValue);
}
NumberDictionaryForest<String> dict = b.build();
dict.dump(System.out);
ArrayList<Integer> resultIds = new ArrayList<>();
- for(SelfDefineSortableKey key : keyList){
+ for (SelfDefineSortableKey key : keyList) {
String fieldValue = getFieldValue(key);
resultIds.add(dict.getIdFromValue(fieldValue));
- assertEquals(fieldValue,dict.getValueFromId(dict.getIdFromValue(fieldValue)));
+ assertEquals(fieldValue, dict.getValueFromId(dict.getIdFromValue(fieldValue)));
}
assertTrue(isIncreasedOrder(resultIds, new Comparator<Integer>() {
@Override
@@ -83,7 +83,7 @@ public class NumberDictionaryForestTest {
testData.add("100");
//TrieDictionaryForestBuilder.MaxTrieTreeSize = 0;
NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>(new StringBytesConverter());
- for(String str : testData)
+ for (String str : testData)
b.addValue(str);
NumberDictionaryForest<String> dict = b.build();
dict = testSerialize(dict);
@@ -94,20 +94,20 @@ public class NumberDictionaryForestTest {
}
@Test
- public void testVerySmallDouble(){
+ public void testVerySmallDouble() {
List<String> testData = new ArrayList<>();
- testData.add(-1.0+"");
- testData.add(Double.MIN_VALUE+"");
+ testData.add(-1.0 + "");
+ testData.add(Double.MIN_VALUE + "");
testData.add("1.01");
testData.add("2.0");
NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<String>(new StringBytesConverter());
- for(String str : testData)
+ for (String str : testData)
b.addValue(str);
NumberDictionaryForest<String> dict = b.build();
dict.dump(System.out);
NumberDictionaryBuilder<String> b2 = new NumberDictionaryBuilder<>(new StringBytesConverter());
- for(String str : testData)
+ for (String str : testData)
b2.addValue(str);
NumberDictionary<String> dict2 = b2.build(0);
dict2.dump(System.out);
@@ -133,31 +133,31 @@ public class NumberDictionaryForestTest {
}
}
- private List<String> randomLongData(int count){
+ private List<String> randomLongData(int count) {
Random rand = new Random(System.currentTimeMillis());
ArrayList<String> list = new ArrayList<>();
- for(int i=0;i<count;i++){
- list.add(rand.nextLong()+"");
+ for (int i = 0; i < count; i++) {
+ list.add(rand.nextLong() + "");
}
- list.add(Long.MAX_VALUE+"");
- list.add(Long.MIN_VALUE+"");
+ list.add(Long.MAX_VALUE + "");
+ list.add(Long.MIN_VALUE + "");
return list;
}
- private List<String> randomDoubleData(int count){
+ private List<String> randomDoubleData(int count) {
Random rand = new Random(System.currentTimeMillis());
ArrayList<String> list = new ArrayList<>();
- for(int i=0;i<count;i++){
- list.add(rand.nextDouble()+"");
+ for (int i = 0; i < count; i++) {
+ list.add(rand.nextDouble() + "");
}
list.add("-1");
return list;
}
- private List<String> randomStringData(int count){
+ private List<String> randomStringData(int count) {
Random rand = new Random(System.currentTimeMillis());
ArrayList<String> list = new ArrayList<>();
- for(int i=0;i<count;i++){
+ for (int i = 0; i < count; i++) {
list.add(UUID.randomUUID().toString());
}
list.add("123");
@@ -165,47 +165,47 @@ public class NumberDictionaryForestTest {
return list;
}
- private ArrayList<SelfDefineSortableKey> createKeyList(List<String> strNumList,byte typeFlag){
+ private ArrayList<SelfDefineSortableKey> createKeyList(List<String> strNumList, byte typeFlag) {
int partationId = 0;
ArrayList<SelfDefineSortableKey> keyList = new ArrayList<>();
- for(String str : strNumList){
+ for (String str : strNumList) {
ByteBuffer keyBuffer = ByteBuffer.allocate(4096);
int offset = keyBuffer.position();
keyBuffer.put(Bytes.toBytes(partationId)[3]);
keyBuffer.put(Bytes.toBytes(str));
//System.out.println(Arrays.toString(keyBuffer.array()));
- byte[] valueField = Bytes.copy(keyBuffer.array(),1,keyBuffer.position()-offset-1);
+ byte[] valueField = Bytes.copy(keyBuffer.array(), 1, keyBuffer.position() - offset - 1);
//System.out.println("new string:"+new String(valueField));
//System.out.println("arrays toString:"+Arrays.toString(valueField));
Text outputKey = new Text();
- outputKey.set(keyBuffer.array(),offset,keyBuffer.position()-offset);
- SelfDefineSortableKey sortableKey = new SelfDefineSortableKey(typeFlag,outputKey);
+ outputKey.set(keyBuffer.array(), offset, keyBuffer.position() - offset);
+ SelfDefineSortableKey sortableKey = new SelfDefineSortableKey(typeFlag, outputKey);
keyList.add(sortableKey);
}
return keyList;
}
- private String printKey(SelfDefineSortableKey key){
+ private String printKey(SelfDefineSortableKey key) {
byte[] data = key.getText().getBytes();
- byte[] fieldValue = Bytes.copy(data,1,data.length-1);
- System.out.println("type flag:"+key.getTypeId()+" fieldValue:"+new String(fieldValue));
+ byte[] fieldValue = Bytes.copy(data, 1, data.length - 1);
+ System.out.println("type flag:" + key.getTypeId() + " fieldValue:" + new String(fieldValue));
return new String(fieldValue);
}
- private String getFieldValue(SelfDefineSortableKey key){
+ private String getFieldValue(SelfDefineSortableKey key) {
byte[] data = key.getText().getBytes();
- byte[] fieldValue = Bytes.copy(data,1,data.length-1);
+ byte[] fieldValue = Bytes.copy(data, 1, data.length - 1);
return new String(fieldValue);
}
- private<T> boolean isIncreasedOrder(List<T> list, Comparator<T> comp){
+ private <T> boolean isIncreasedOrder(List<T> list, Comparator<T> comp) {
int flag;
T previous = null;
- for(T t : list){
- if(previous == null) previous = t;
- else{
- flag = comp.compare(previous,t);
- if(flag > 0) return false;
+ for (T t : list) {
+ if (previous == null) previous = t;
+ else {
+ flag = comp.compare(previous, t);
+ if (flag > 0) return false;
previous = t;
}
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/examples/test_case_data/sandbox/kylin.properties
----------------------------------------------------------------------
diff --git a/examples/test_case_data/sandbox/kylin.properties b/examples/test_case_data/sandbox/kylin.properties
index 0988536..93b86c9 100644
--- a/examples/test_case_data/sandbox/kylin.properties
+++ b/examples/test_case_data/sandbox/kylin.properties
@@ -63,7 +63,7 @@ kylin.job.retry=0
# you will have to specify kylin.job.remote.cli.hostname, kylin.job.remote.cli.username and kylin.job.remote.cli.password
# It should not be set to "true" unless you're NOT running Kylin.sh on a hadoop client machine
# (Thus kylin instance has to ssh to another real hadoop client machine to execute hbase,hive,hadoop commands)
-kylin.job.run.as.remote.cmd=false
+kylin.job.run.as.remote.cmd=true
# Only necessary when kylin.job.run.as.remote.cmd=true
kylin.job.remote.cli.hostname=sandbox
@@ -160,4 +160,7 @@ kylin.web.contact_mail=
kylin.query.metrics.percentiles.intervals=60, 360, 3600
# Env DEV|QA|PROD
-deploy.env=DEV
\ No newline at end of file
+deploy.env=DEV
+
+#default 500MB
+kylin.dictionary.forest.trie.size.max_mb=500
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
----------------------------------------------------------------------
diff --git a/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java b/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
index 52461c4..e1303e4 100644
--- a/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
+++ b/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
@@ -491,7 +491,10 @@ public class KylinTestBase {
ITable h2Table = executeQuery(h2Conn, queryName, sql, needSort);
try {
+ //compare before junit
// compare the result
+ System.out.println("h2 Table rows count:"+h2Table.getRowCount());
+ System.out.println("kylin Table rows count:"+kylinTable.getRowCount());
Assertion.assertEquals(h2Table, kylinTable);
} catch (Throwable t) {
printInfo("execAndCompQuery failed on: " + sqlFile.getAbsolutePath());
[2/3] kylin git commit: KYLIN-1851 Change TrieDictionary to
TrieDictionaryForest to reduce the peak memory usage
Posted by li...@apache.org.
KYLIN-1851 Change TrieDictionary to TrieDictionaryForest to reduce the peak memory usage
Signed-off-by: Li Yang <li...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/734a4f98
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/734a4f98
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/734a4f98
Branch: refs/heads/master
Commit: 734a4f98b912a6f45957c3435b4f5be0cf54f4e8
Parents: 9410b01
Author: xiefan46 <95...@qq.com>
Authored: Mon Nov 7 14:37:22 2016 +0800
Committer: Li Yang <li...@apache.org>
Committed: Wed Nov 16 18:03:59 2016 +0800
----------------------------------------------------------------------
build/conf/kylin.properties | 4 +
.../apache/kylin/common/KylinConfigBase.java | 36 +-
.../org/apache/kylin/cube/model/CubeDesc.java | 35 +-
.../org/apache/kylin/cube/CubeDescTest.java | 28 +-
.../apache/kylin/dict/DictionaryGenerator.java | 18 +-
.../apache/kylin/dict/DictionaryManager.java | 3 +-
.../kylin/dict/NumberDictionaryForest.java | 6 +
.../dict/NumberDictionaryForestBuilder.java | 12 +
.../org/apache/kylin/dict/TrieDictionary.java | 7 +-
.../apache/kylin/dict/TrieDictionaryForest.java | 95 ++++-
.../kylin/dict/TrieDictionaryForestBuilder.java | 76 +++-
.../apache/kylin/dict/lookup/SnapshotTable.java | 2 +
.../apache/kylin/dict/NumberDictionaryTest.java | 6 +-
.../kylin/dict/TrieDictionaryForestTest.java | 373 +++++++++++++++++--
.../kylin/dict/lookup/LookupTableTest.java | 9 +
.../engine/mr/DFSFileTableSortedReader.java | 249 +++++++++++++
.../kylin/engine/mr/JobBuilderSupport.java | 4 +-
.../fdc2/FactDistinctColumnsCombiner2.java | 5 +-
.../mr/steps/fdc2/FactDistinctColumnsJob2.java | 3 +-
.../fdc2/FactDistinctColumnsMapperBase2.java | 1 +
.../steps/fdc2/FactDistinctColumnsReducer2.java | 254 +++++++++++++
.../mr/steps/fdc2/SelfDefineSortableKey.java | 3 +-
.../engine/mr/steps/MergeCuboidMapperTest.java | 4 +-
.../mr/steps/NumberDictionaryForestTest.java | 86 ++---
.../test_case_data/sandbox/kylin.properties | 7 +-
.../org/apache/kylin/query/KylinTestBase.java | 3 +
26 files changed, 1174 insertions(+), 155 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/build/conf/kylin.properties
----------------------------------------------------------------------
diff --git a/build/conf/kylin.properties b/build/conf/kylin.properties
index e935ebf..715b7a6 100644
--- a/build/conf/kylin.properties
+++ b/build/conf/kylin.properties
@@ -131,6 +131,10 @@ kylin.dictionary.max.cardinality=5000000
kylin.table.snapshot.max_mb=300
+#max size for one trie in TrieDictionaryForest (default 500MB)
+
+
+
### QUERY ###
kylin.query.scan.threshold=10000000
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
----------------------------------------------------------------------
diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
index 6d9eef4..300f727 100644
--- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
+++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
@@ -18,6 +18,13 @@
package org.apache.kylin.common;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import org.apache.commons.lang.StringUtils;
+import org.apache.kylin.common.util.CliCommandExecutor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
@@ -28,14 +35,6 @@ import java.util.SortedSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.lang.StringUtils;
-import org.apache.kylin.common.util.CliCommandExecutor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
/**
* An abstract class to encapsulate access to a set of 'properties'.
* Subclass can override methods in this class to extend the content of the 'properties',
@@ -177,19 +176,25 @@ abstract public class KylinConfigBase implements Serializable {
setProperty("kylin.storage.url", storageUrl);
}
- /** was for route to hive, not used any more */
+ /**
+ * was for route to hive, not used any more
+ */
@Deprecated
public String getHiveUrl() {
return getOptional("hive.url", "");
}
- /** was for route to hive, not used any more */
+ /**
+ * was for route to hive, not used any more
+ */
@Deprecated
public String getHiveUser() {
return getOptional("hive.user", "");
}
- /** was for route to hive, not used any more */
+ /**
+ * was for route to hive, not used any more
+ */
@Deprecated
public String getHivePassword() {
return getOptional("hive.password", "");
@@ -205,7 +210,7 @@ abstract public class KylinConfigBase implements Serializable {
public String[] getRealizationProviders() {
return getOptionalStringArray("kylin.realization.providers", //
- new String[] { "org.apache.kylin.cube.CubeManager", "org.apache.kylin.storage.hybrid.HybridManager" });
+ new String[]{"org.apache.kylin.cube.CubeManager", "org.apache.kylin.storage.hybrid.HybridManager"});
}
public CliCommandExecutor getCliCommandExecutor() throws IOException {
@@ -464,6 +469,10 @@ abstract public class KylinConfigBase implements Serializable {
return Integer.parseInt(getOptional("kylin.table.snapshot.max_mb", "300"));
}
+ public int getTrieDictionaryForestMaxTrieSizeMB() {
+ return Integer.parseInt(getOptional("kylin.dictionary.forest.trie.size.max_mb", "500"));
+ }
+
public int getHBaseRegionCountMin() {
return Integer.parseInt(getOptional("kylin.hbase.region.count.min", "1"));
}
@@ -582,7 +591,7 @@ abstract public class KylinConfigBase implements Serializable {
}
public int[] getQueryMetricsPercentilesIntervals() {
- String[] dft = { "60", "300", "3600" };
+ String[] dft = {"60", "300", "3600"};
return getOptionalIntArray("kylin.query.metrics.percentiles.intervals", dft);
}
@@ -600,6 +609,7 @@ abstract public class KylinConfigBase implements Serializable {
/**
* HBase region cut size, in GB
+ *
* @return
*/
public float getKylinHBaseRegionCut() {
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java b/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
index 3160085..c9ebff8 100644
--- a/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
+++ b/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
@@ -196,7 +196,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
public Set<TblColRef> listAllColumns() {
return allColumns;
}
-
+
public Set<ColumnDesc> listAllColumnDescs() {
return allColumnDescs;
}
@@ -209,7 +209,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
}
/**
- * @return dimension columns excluding derived
+ * @return dimension columns excluding derived
*/
public List<TblColRef> listDimensionColumnsExcludingDerived(boolean alsoExcludeExtendedCol) {
List<TblColRef> result = new ArrayList<TblColRef>();
@@ -473,8 +473,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
/**
* this method is to prevent malicious metadata change by checking the saved signature
* with the calculated signature.
- *
+ * <p>
* if you're comparing two cube descs, prefer to use consistentWith()
+ *
* @return
*/
public boolean checkSignature() {
@@ -558,7 +559,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
checkState(rowkey.getRowKeyColumns().length == dimCols.size(), "RowKey columns count (%d) doesn't match dimensions columns count (%d)", rowkey.getRowKeyColumns().length, dimCols.size());
initDictionaryDesc();
-
+
for (TblColRef col : allColumns) {
allColumnDescs.add(col.getColumnDesc());
}
@@ -609,7 +610,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
}
Collections.sort(notIncluded);
logger.error("Aggregation group " + index + " Include dimensions not containing all the used dimensions");
- throw new IllegalStateException("Aggregation group " + index + " 'includes' dimensions not include all the dimensions:" + notIncluded.toString());
+ throw new IllegalStateException("Aggregation group " + index + " 'includes' dimensions not include all the dimensions:" + notIncluded.toString());
}
Set<String> normalDims = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
@@ -754,7 +755,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
int find = ArrayUtils.indexOf(dimColArray, fk[i]);
if (find >= 0) {
TblColRef derivedCol = initDimensionColRef(pk[i]);
- initDerivedMap(new TblColRef[] { dimColArray[find] }, DeriveType.PK_FK, dim, new TblColRef[] { derivedCol }, null);
+ initDerivedMap(new TblColRef[]{dimColArray[find]}, DeriveType.PK_FK, dim, new TblColRef[]{derivedCol}, null);
}
}
}
@@ -775,7 +776,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
extra[i] = "";
}
}
- return new String[][] { cols, extra };
+ return new String[][]{cols, extra};
}
private void initDerivedMap(TblColRef[] hostCols, DeriveType type, DimensionDesc dimension, TblColRef[] derivedCols, String[] extra) {
@@ -994,7 +995,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
this.partitionOffsetStart = partitionOffsetStart;
}
- /** Get columns that have dictionary */
+ /**
+ * Get columns that have dictionary
+ */
public Set<TblColRef> getAllColumnsHaveDictionary() {
Set<TblColRef> result = Sets.newLinkedHashSet();
@@ -1023,7 +1026,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
return result;
}
- /** Get columns that need dictionary built on it. Note a column could reuse dictionary of another column. */
+ /**
+ * Get columns that need dictionary built on it. Note a column could reuse dictionary of another column.
+ */
public Set<TblColRef> getAllColumnsNeedDictionaryBuilt() {
Set<TblColRef> result = getAllColumnsHaveDictionary();
@@ -1040,7 +1045,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
return result;
}
- /** A column may reuse dictionary of another column, find the dict column, return same col if there's no reuse column*/
+ /**
+ * A column may reuse dictionary of another column, find the dict column, return same col if there's no reuse column
+ */
public TblColRef getDictionaryReuseColumn(TblColRef col) {
if (dictionaries == null) {
return col;
@@ -1053,7 +1060,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
return col;
}
- /** Get a column which can be used in distributing the source table */
+ /**
+ * Get a column which can be used in distributing the source table
+ */
public TblColRef getDistributedByColumn() {
Set<TblColRef> shardBy = getShardByColumns();
if (shardBy != null && shardBy.size() > 0) {
@@ -1107,9 +1116,9 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
}
- private Collection ensureOrder(Collection c){
+ private Collection ensureOrder(Collection c) {
TreeSet set = new TreeSet();
- for(Object o : c)
+ for (Object o : c)
set.add(o.toString());
//System.out.println("set:"+set);
return set;
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java b/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
index 9ad6427..3326b24 100644
--- a/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
@@ -89,7 +89,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit3() throws Exception {
thrown.expect(IllegalStateException.class);
- thrown.expectMessage("Aggregation group 0 'includes' dimensions not include all the dimensions:" + sortStrs(new String[] { "SELLER_ID", "META_CATEG_NAME", "LSTG_FORMAT_NAME", "LSTG_SITE_ID", "SLR_SEGMENT_CD" }));
+ thrown.expectMessage("Aggregation group 0 'includes' dimensions not include all the dimensions:" + sortStrs(new String[]{"SELLER_ID", "META_CATEG_NAME", "LSTG_FORMAT_NAME", "LSTG_SITE_ID", "SLR_SEGMENT_CD"}));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
String[] temp = Arrays.asList(cubeDesc.getAggregationGroups().get(0).getIncludes()).subList(0, 3).toArray(new String[3]);
cubeDesc.getAggregationGroups().get(0).setIncludes(temp);
@@ -114,7 +114,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit5() throws Exception {
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[] { "seller_id", "META_CATEG_NAME" };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[]{"seller_id", "META_CATEG_NAME"};
cubeDesc.init(getTestConfig());
}
@@ -122,7 +122,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit6() throws Exception {
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[] { "seller_id", "lstg_format_name" };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[]{"seller_id", "lstg_format_name"};
cubeDesc.init(getTestConfig());
}
@@ -133,43 +133,43 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
thrown.expectMessage("Aggregation group 0 require at least 2 dimensions in a joint");
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "lstg_format_name" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"lstg_format_name"}};
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit8() throws Exception {
- String[] strs = new String[] { "CATEG_LVL2_NAME", "META_CATEG_NAME" };
+ String[] strs = new String[]{"CATEG_LVL2_NAME", "META_CATEG_NAME"};
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 hierarchy dimensions overlap with joint dimensions: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME"}};
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit9() throws Exception {
- String[] strs = new String[] { "lstg_format_name", "META_CATEG_NAME" };
+ String[] strs = new String[]{"lstg_format_name", "META_CATEG_NAME"};
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 hierarchy dimensions overlap with joint dimensions: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME" }, new String[] { "lstg_format_name", "lstg_site_id" } };
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "META_CATEG_NAME", "lstg_format_name" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME"}, new String[]{"lstg_format_name", "lstg_site_id"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"META_CATEG_NAME", "lstg_format_name"}};
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit10() throws Exception {
- String[] strs = new String[] { "lstg_format_name", "lstg_site_id" };
+ String[] strs = new String[]{"lstg_format_name", "lstg_site_id"};
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 a dimension exist in more than one joint: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "lstg_format_name", "lstg_site_id", "slr_segment_cd" }, new String[] { "lstg_format_name", "lstg_site_id", "leaf_categ_id" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"lstg_format_name", "lstg_site_id", "slr_segment_cd"}, new String[]{"lstg_format_name", "lstg_site_id", "leaf_categ_id"}};
cubeDesc.init(getTestConfig());
}
@@ -180,19 +180,19 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
thrown.expectMessage("Aggregation group 0 require at least 2 dimensions in a hierarchy.");
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME"}};
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit12() throws Exception {
- String[] strs = new String[] { "CATEG_LVL2_NAME", "META_CATEG_NAME" };
+ String[] strs = new String[]{"CATEG_LVL2_NAME", "META_CATEG_NAME"};
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 a dimension exist in more than one hierarchy: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME" }, new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME"}, new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME"}};
cubeDesc.init(getTestConfig());
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
index 0adf40e..8695338 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
@@ -36,12 +36,12 @@ import com.google.common.base.Preconditions;
/**
* @author yangli9
*/
-@SuppressWarnings({ "rawtypes", "unchecked" })
+@SuppressWarnings({"rawtypes", "unchecked"})
public class DictionaryGenerator {
private static final Logger logger = LoggerFactory.getLogger(DictionaryGenerator.class);
- private static final String[] DATE_PATTERNS = new String[] { "yyyy-MM-dd", "yyyyMMdd" };
+ private static final String[] DATE_PATTERNS = new String[]{"yyyy-MM-dd", "yyyyMMdd"};
public static Dictionary<String> buildDictionary(DataType dataType, IDictionaryValueEnumerator valueEnumerator) throws IOException {
Preconditions.checkNotNull(dataType, "dataType cannot be null");
@@ -137,7 +137,9 @@ public class DictionaryGenerator {
private static class StringDictBuilder implements IDictionaryBuilder {
@Override
public Dictionary<String> build(DictionaryInfo dictInfo, IDictionaryValueEnumerator valueEnumerator, int baseId, int nSamples, ArrayList<String> returnSamples) throws IOException {
- TrieDictionaryBuilder builder = new TrieDictionaryBuilder(new StringBytesConverter());
+ int maxTrieSizeInMB = TrieDictionaryForestBuilder.getMaxTrieSizeInMB();
+ //TrieDictionaryBuilder builder = new TrieDictionaryBuilder(new StringBytesConverter());
+ TrieDictionaryForestBuilder builder = new TrieDictionaryForestBuilder(new StringBytesConverter(), baseId, maxTrieSizeInMB);
byte[] value;
while (valueEnumerator.moveNext()) {
value = valueEnumerator.current();
@@ -148,14 +150,16 @@ public class DictionaryGenerator {
if (returnSamples.size() < nSamples && returnSamples.contains(v) == false)
returnSamples.add(v);
}
- return builder.build(baseId);
+ return builder.build();
+ //return builder.build(baseId);
}
}
private static class NumberDictBuilder implements IDictionaryBuilder {
@Override
public Dictionary<String> build(DictionaryInfo dictInfo, IDictionaryValueEnumerator valueEnumerator, int baseId, int nSamples, ArrayList<String> returnSamples) throws IOException {
- NumberDictionaryBuilder builder = new NumberDictionaryBuilder(new StringBytesConverter());
+ int maxTrieSizeInMB = TrieDictionaryForestBuilder.getMaxTrieSizeInMB();
+ NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(new StringBytesConverter(), baseId, maxTrieSizeInMB);
byte[] value;
while (valueEnumerator.moveNext()) {
value = valueEnumerator.current();
@@ -169,7 +173,9 @@ public class DictionaryGenerator {
if (returnSamples.size() < nSamples && returnSamples.contains(v) == false)
returnSamples.add(v);
}
- return builder.build(baseId);
+ return builder.build();
}
}
+
+
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
index c8a7a54..b8d039e 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
@@ -327,7 +327,6 @@ public class DictionaryManager {
if (columnValueEnumerator != null)
columnValueEnumerator.close();
}
-
return trySaveNewDict(dictionary, dictInfo);
}
@@ -419,7 +418,7 @@ public class DictionaryManager {
logger.info("DictionaryManager(" + System.identityHashCode(this) + ") loading DictionaryInfo(loadDictObj:" + loadDictObj + ") at " + resourcePath);
DictionaryInfo info = store.getResource(resourcePath, DictionaryInfo.class, loadDictObj ? DictionaryInfoSerializer.FULL_SERIALIZER : DictionaryInfoSerializer.INFO_SERIALIZER);
-
+ //info.dictionaryObject.dump(System.out);
// if (loadDictObj)
// logger.debug("Loaded dictionary at " + resourcePath);
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java
index 8caa4b6..fdf1e68 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForest.java
@@ -275,4 +275,10 @@ public class NumberDictionaryForest<T> extends Dictionary<T> {
public BytesConverter<T> getConverter() {
return converter;
}
+
+ public int getTreeSize(){
+ return this.dict.getTrees().size();
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
index 5444bb7..c997ce1 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
@@ -41,18 +41,30 @@ public class NumberDictionaryForestBuilder<T> {
this.bytesConverter = bytesConverter;
}
+ public NumberDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId, int maxTrieSizeMB) {
+ this.trieBuilder = new TrieDictionaryForestBuilder<T>(bytesConverter, baseId, maxTrieSizeMB);
+ this.bytesConverter = bytesConverter;
+ }
+
public void addValue(T value) {
addValue(bytesConverter.convertToBytes(value));
}
+
+
public void addValue(byte[] value) {
codec.encodeNumber(value, 0, value.length);
byte[] copy = Bytes.copy(codec.buf, codec.bufOffset, codec.bufLen);
this.trieBuilder.addValue(copy);
}
+ //TODO:ensure ordered
public NumberDictionaryForest<T> build() {
TrieDictionaryForest<T> forest = trieBuilder.build();
return new NumberDictionaryForest<T>(forest, bytesConverter);
}
+
+ public void setMaxTrieSize(int size){
+ this.trieBuilder.setMaxTrieTreeSize(size);
+ }
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
index aea9551..a5e3d36 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
@@ -126,7 +126,7 @@ public class TrieDictionary<T> extends Dictionary<T> {
else
throw new RuntimeException(e);
}
-
+ //this.enableValueCache = false;
if (enableValueCache) {
valueToIdCache = new SoftReference<Map>(new ConcurrentHashMap());
idToValueCache = new SoftReference<Object[]>(new Object[nValues]);
@@ -156,6 +156,7 @@ public class TrieDictionary<T> extends Dictionary<T> {
@Override
final protected int getIdFromValueImpl(T value, int roundingFlag) {
if (enableValueCache && roundingFlag == 0) {
+ //System.out.println("use id cache");
Map cache = valueToIdCache.get(); // SoftReference to skip cache gracefully when short of memory
if (cache != null) {
Integer id = null;
@@ -170,6 +171,7 @@ public class TrieDictionary<T> extends Dictionary<T> {
return id;
}
}
+ //System.out.println("not use id cache");
byte[] valueBytes = bytesConvert.convertToBytes(value);
return getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag);
}
@@ -271,6 +273,7 @@ public class TrieDictionary<T> extends Dictionary<T> {
@Override
final protected T getValueFromIdImpl(int id) {
if (enableValueCache) {
+ //System.out.println("use value cache");
Object[] cache = idToValueCache.get(); // SoftReference to skip cache gracefully when short of memory
if (cache != null) {
int seq = calcSeqNoFromId(id);
@@ -285,8 +288,10 @@ public class TrieDictionary<T> extends Dictionary<T> {
return result;
}
}
+ //System.out.println("not use value cache");
byte[] value = new byte[getSizeOfValue()];
int length = getValueBytesFromId(id, value, 0);
+ //System.out.println("get value by id:"+id+" value:"+bytesConvert.convertFromBytes(value, 0, length).toString());
return bytesConvert.convertFromBytes(value, 0, length);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
index e9ccc56..b0440db 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
@@ -19,6 +19,7 @@ package org.apache.kylin.dict;
import org.apache.kylin.common.util.ByteArray;
+import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.BytesUtil;
import org.apache.kylin.common.util.ClassUtil;
import org.apache.kylin.common.util.Dictionary;
@@ -32,7 +33,6 @@ import java.io.DataOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@@ -89,13 +89,13 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public int getMinId() {
- if (trees.isEmpty()) return -1;
+ if (trees.isEmpty()) return baseId;
return trees.get(0).getMinId() + baseId;
}
@Override
public int getMaxId() {
- if (trees.isEmpty()) return -1;
+ if (trees.isEmpty()) return baseId - 1;
int index = trees.size() - 1;
int id = accuOffset.get(index) + trees.get(index).getMaxId() + baseId;
return id;
@@ -127,43 +127,71 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
}
- //id = tree_inner_offset + accumulate_offset + baseId
@Override
- protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag)
+ protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
+
+ int result = _getIdFromValueBytesImpl(value, offset, len, roundingFlag);
+ //logger.info("{} => {}, rounding {}", bytesConvert.convertFromBytes(value, offset, len), result, roundingFlag);
+ return result;
+ }
+
+ //id = tree_inner_offset + accumulate_offset + baseId
+ protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag)
throws IllegalArgumentException {
//long startTime = System.currentTimeMillis();
ByteArray search = new ByteArray(value, offset, len);
//copyTime.addAndGet(System.currentTimeMillis() - startTime);
int index = findIndexByValue(search);
- //int index = findIndexByValue(value);
- //binarySearchTime.addAndGet(System.currentTimeMillis() - startTime);
if (index < 0) {
- //System.out.println("value divide:"+valueDivide.size()+" "+valueDivide);
- throw new IllegalArgumentException("Tree Not Found. index < 0.Value:" + new String(Arrays.copyOfRange(value, offset, len)));
+ if (roundingFlag > 0) {
+ return getMinId(); //searching value smaller than the smallest value in dict
+ } else {
+ throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
+ }
+ }
+ int id;
+ if (roundingFlag > 0) {
+ T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId());
+ byte[] b1 = bytesConvert.convertToBytes(curTreeMax);
+ ByteArray ba1 = new ByteArray(b1, 0, b1.length);
+ //ByteArray ba2 = new ByteArray(value, 0, value.length);
+ if (search.compareTo(ba1) > 0)
+ index++;
+ if (index >= trees.size())
+ throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
}
TrieDictionary<T> tree = trees.get(index);
- //getValueIndexTime.addAndGet(System.currentTimeMillis() - startTime);
- //startTime = System.currentTimeMillis();
- int id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
+ id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
id = id + accuOffset.get(index);
id += baseId;
- //getValueTime.addAndGet(System.currentTimeMillis() - startTime);
+ if (id < 0) {
+ throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
+ }
+ //System.out.println("getIdFromValue value:"+bytesConvert.convertFromBytes(value,offset,len)+" id:"+id);
return id;
}
//id --> value
+ private boolean printstr = false;
+
@Override
protected T getValueFromIdImpl(int id) throws IllegalArgumentException {
//System.out.println("here");
byte[] data = getValueBytesFromIdImpl(id);
if (data != null) {
+ if (!printstr) {
+ System.out.println("getValueFromIdImpl id:" + id + " value:" + bytesConvert.convertFromBytes(data, 0, data.length));
+ printstr = true;
+ }
return bytesConvert.convertFromBytes(data, 0, data.length);
} else {
return null;
}
}
+ private boolean isPrintstr2 = false;
+
@Override
protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset)
throws IllegalArgumentException {
@@ -174,6 +202,10 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
//getValueIndexTime2.addAndGet(System.currentTimeMillis() - startTime);
//startTime = System.currentTimeMillis();
int size = tree.getValueBytesFromIdImpl(treeInnerOffset, returnValue, offset);
+ if (!isPrintstr2) {
+ isPrintstr2 = true;
+ System.out.println("getValueBytesFromIdImpl id:" + id + " value:" + bytesConvert.convertFromBytes(returnValue, offset, size));
+ }
//getValueTime2.addAndGet(System.currentTimeMillis() - startTime);
return size;
}
@@ -200,18 +232,37 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public void dump(PrintStream out) {
+ out.println("TrieDictionaryForest");
+ out.println("baseId:" + baseId);
+ StringBuilder sb = new StringBuilder();
+ sb.append("value divide:");
+ for (ByteArray ba : valueDivide)
+ sb.append(bytesConvert.convertFromBytes(ba.array(), 0, ba.length()) + " ");
+ sb.append("\noffset divide:");
+ for (Integer offset : accuOffset)
+ sb.append(offset + " ");
+ out.println(sb.toString());
for (int i = 0; i < trees.size(); i++) {
- System.out.println("----tree " + i + "--------");
+ out.println("----tree " + i + "--------");
trees.get(i).dump(out);
}
}
@Override
public void write(DataOutput out) throws IOException {
+ System.out.println("write dict");
writeHead(out);
writeBody(out);
}
+ /*private int compare(T value1,T value2){
+ byte[] b1 = bytesConvert.convertToBytes(value1);
+ byte[] b2 = bytesConvert.convertToBytes(value2);
+ ByteArray ba1 = new ByteArray(b1,0,b1.length);
+ ByteArray ba2 = new ByteArray(b2,0,b2.length);
+ return ba1.compareTo(ba2);
+ }*/
+
private void writeHead(DataOutput out) throws IOException {
ByteArrayOutputStream byteBuf = new ByteArrayOutputStream();
DataOutputStream headOut = new DataOutputStream(byteBuf);
@@ -248,6 +299,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public void readFields(DataInput in) throws IOException {
+ System.out.println("read dict");
try {
int headSize = in.readInt();
this.baseId = in.readInt();
@@ -285,6 +337,21 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
}
+ /*@Override
+ public boolean equals(Object o) {
+ if ((o instanceof TrieDictionaryForest) == false) {
+ logger.info("Equals return false because it's not TrieDictionaryForest");
+ return false;
+ }
+ TrieDictionaryForest that = (TrieDictionaryForest) o;
+ if(this.trees.size() != that.getTrees().size())
+ return false;
+ for(int i=0;i<trees.size();i++){
+ if(!trees.get(i).equals(that.getTrees().get(i))) return false;
+ }
+ return true;
+ }*/
+
@Override
public boolean contains(Dictionary other) {
if (other.getSize() > this.getSize()) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
index 3c03c08..5e2c346 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
@@ -17,6 +17,7 @@
*/
package org.apache.kylin.dict;
+import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.ByteArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -26,7 +27,9 @@ import java.util.ArrayList;
public class TrieDictionaryForestBuilder<T> {
- public static int MaxTrieTreeSize = 1024 * 1024;//1M
+ public static int DEFAULT_MAX_TRIE_TREE_SIZE_MB = 500;
+
+ //public static int MaxTrieTreeSize = 1024;//1k
private BytesConverter<T> bytesConverter;
@@ -48,6 +51,10 @@ public class TrieDictionaryForestBuilder<T> {
private int curOffset;
+ private int maxTrieTreeSize;
+
+ private boolean isOrdered = true;
+
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter) {
this(bytesConverter, 0);
@@ -58,9 +65,23 @@ public class TrieDictionaryForestBuilder<T> {
this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
this.baseId = baseId;
curOffset = 0;
+ int maxTrieTreeSizeMB = getMaxTrieSizeInMB();
+ this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024;
+ logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
+ //System.out.println("max trie size:"+maxTrieTreeSize);
//stringComparator = new ByteComparator<>(new StringBytesConverter());
}
+ public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId, int maxTrieTreeSizeMB) {
+ this.bytesConverter = bytesConverter;
+ this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
+ this.baseId = baseId;
+ curOffset = 0;
+ this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024;
+ logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
+ }
+
+
public void addValue(T value) {
if (value == null) return;
byte[] valueBytes = bytesConverter.convertToBytes(value);
@@ -76,20 +97,25 @@ public class TrieDictionaryForestBuilder<T> {
public void addValue(ByteArray value) {
//System.out.println("value length:"+value.length);
if (value == null) return;
+ //logger.info("going to add value:" + new String(value.array()));
if (previousValue == null) {
previousValue = value;
} else {
int comp = previousValue.compareTo(value);
- if (comp == 0) return; //duplicate value
- if (comp > 0) {
- //logger.info("values not in ascending order");
+ if (comp == 0) {
+ //logger.info("find duplicate value:" + new String(value.array()));
+ return; //duplicate value
+ }
+ if (comp > 0 && isOrdered) {
+ logger.info("values not in ascending order:" + new String(value.array()));
+ isOrdered = false;
//System.out.println(".");
}
}
this.trieBuilder.addValue(value.array());
previousValue = value;
this.curTreeSize += value.length();
- if (curTreeSize >= MaxTrieTreeSize) {
+ if (curTreeSize >= this.maxTrieTreeSize) {
TrieDictionary<T> tree = trieBuilder.build(0);
addTree(tree);
reset();
@@ -104,9 +130,33 @@ public class TrieDictionaryForestBuilder<T> {
}
TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees,
this.valueDivide, this.accuOffset, this.bytesConverter, baseId);
+
+ //log
+ logger.info("tree num:" + forest.getTrees().size());
+ StringBuilder sb = new StringBuilder();
+ for (ByteArray ba : valueDivide) {
+ sb.append(new String(ba.array()) + " ");
+ }
+ logger.info("value divide:" + sb.toString());
+ /*
+ If input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly.
+ */
+ if (forest.getTrees().size() > 1 && !isOrdered) {
+ throw new IllegalStateException("Invalid input data.Unordered data can not be split into multi trees");
+ }
+
return forest;
}
+ public int getMaxTrieTreeSize() {
+ return maxTrieTreeSize;
+ }
+
+ public void setMaxTrieTreeSize(int maxTrieTreeSize) {
+ this.maxTrieTreeSize = maxTrieTreeSize;
+ logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
+ }
+
private void addTree(TrieDictionary<T> tree) {
trees.add(tree);
int minId = tree.getMinId();
@@ -122,4 +172,20 @@ public class TrieDictionaryForestBuilder<T> {
trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
}
+ public static int getMaxTrieSizeInMB() {
+ KylinConfig config = null;
+ try {
+ config = KylinConfig.getInstanceFromEnv();
+ } catch (RuntimeException e) {
+ logger.info("can not get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB");
+ }
+ int maxTrieTreeSizeMB;
+ if (config != null) {
+ maxTrieTreeSizeMB = config.getTrieDictionaryForestMaxTrieSizeMB();
+ } else {
+ maxTrieTreeSizeMB = DEFAULT_MAX_TRIE_TREE_SIZE_MB;
+ }
+ return maxTrieTreeSizeMB;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/main/java/org/apache/kylin/dict/lookup/SnapshotTable.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/lookup/SnapshotTable.java b/core-dictionary/src/main/java/org/apache/kylin/dict/lookup/SnapshotTable.java
index db1a170..34b326a 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/lookup/SnapshotTable.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/lookup/SnapshotTable.java
@@ -51,6 +51,7 @@ import com.fasterxml.jackson.annotation.JsonProperty;
@JsonAutoDetect(fieldVisibility = Visibility.NONE, getterVisibility = Visibility.NONE, isGetterVisibility = Visibility.NONE, setterVisibility = Visibility.NONE)
public class SnapshotTable extends RootPersistentEntity implements ReadableTable {
+
@JsonProperty("tableName")
private String tableName;
@JsonProperty("signature")
@@ -58,6 +59,7 @@ public class SnapshotTable extends RootPersistentEntity implements ReadableTable
@JsonProperty("useDictionary")
private boolean useDictionary;
+
private ArrayList<int[]> rowIndices;
private Dictionary<String> dict;
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
index d98b938..a9c4980 100644
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
@@ -16,7 +16,7 @@
* limitations under the License.
*/
-package org.apache.kylin.dict;
+/*package org.apache.kylin.dict;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
@@ -42,7 +42,7 @@ import com.google.common.collect.Sets;
/**
*/
-public class NumberDictionaryTest extends LocalFileMetadataTestCase {
+/*public class NumberDictionaryTest extends LocalFileMetadataTestCase {
NumberDictionary.NumberBytesCodec codec = new NumberDictionary.NumberBytesCodec(NumberDictionary.MAX_DIGITS_BEFORE_DECIMAL_POINT);
Random rand = new Random();
@@ -207,4 +207,4 @@ public class NumberDictionaryTest extends LocalFileMetadataTestCase {
return buf.toString();
}
-}
+}*/
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
index 81cba64..3def7e0 100755
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
@@ -20,6 +20,7 @@
package org.apache.kylin.dict;
+import org.apache.kylin.common.util.Array;
import org.apache.kylin.common.util.MemoryBudgetController;
import org.junit.Ignore;
import org.junit.Test;
@@ -88,10 +89,27 @@ public class TrieDictionaryForestTest {
System.out.println("test ok");
}
- public void duplicateDataTest() {
- //todo
+ @Test
+ public void testNullValue(){
+ //encounter null value when building dictionary
+ ArrayList<String> strs = new ArrayList<String>();
+ strs.add(null);
+ strs.add("abc");
+ System.out.println(strs);
+ int maxTreeSize = 0;
+ TrieDictionaryForestBuilder<String> builder = newDictBuilder(strs, 0, maxTreeSize);
+ TrieDictionaryForest<String> dict = builder.build();
+ dict.dump(System.out);
+ //null value query
+ int id = dict.getIdFromValue(null,0);
+ System.out.println(id);
+ id = dict.getIdFromValue(null,1);
+ System.out.println(id);
+ id = dict.getIdFromValue(null,-1);
+ System.out.println(id);
}
+
@Test
public void testBigDataSet() {
//h=generate data
@@ -245,6 +263,274 @@ public class TrieDictionaryForestTest {
}
+ /**
+  * Checks {@code getIdFromValue} with rounding flags 0, -1 and 1 for values that fall
+  * before, between and after the entries of a three-entry dictionary.
+  */
+ @Test
+ public void roundingFlagTest() {
+     final int firstId = 10;
+     ArrayList<String> values = new ArrayList<>();
+     values.add("b");
+     values.add("bdd");
+     values.add("ccc");
+     TrieDictionaryForest<String> dict = TrieDictionaryForestTest.newDictBuilder(values, firstId, 0).build();
+
+     // "a" sorts before every entry: flags 0 and -1 must be rejected, flag 1 yields the first id
+     String belowAll = "a";
+     expectLookupRejected(dict, belowAll, 0);
+     expectLookupRejected(dict, belowAll, -1);
+     assertEquals(firstId, dict.getIdFromValue(belowAll, 1));
+
+     // "bd" sits between "b" and "bdd": flag 0 is rejected, -1 and 1 pick the two neighbours
+     String between = "bd";
+     expectLookupRejected(dict, between, 0);
+     assertEquals(firstId, dict.getIdFromValue(between, -1));
+     assertEquals(firstId + 1, dict.getIdFromValue(between, 1));
+
+     // "e" sorts after every entry: flags 0 and 1 must be rejected, flag -1 yields the last id
+     String aboveAll = "e";
+     expectLookupRejected(dict, aboveAll, 0);
+     assertEquals(firstId + 2, dict.getIdFromValue(aboveAll, -1));
+     expectLookupRejected(dict, aboveAll, 1);
+ }
+
+ /** Fails the test unless the lookup throws IllegalArgumentException. */
+ private static void expectLookupRejected(TrieDictionaryForest<String> dict, String value, int roundingFlag) {
+     try {
+         int id = dict.getIdFromValue(value, roundingFlag);
+         fail("should throw IllegalArgumentException,but id is:" + id);
+     } catch (IllegalArgumentException expected) {
+         // correct
+     }
+ }
+
+ /**
+  * Builds a forest from random strings and compares rounded lookups (flag 1 and flag -1)
+  * against {@code TreeSet.ceiling} / {@code TreeSet.floor} on the same data.
+  */
+ @Test
+ public void stringDictRoundFlagTest() {
+     TreeSet<String> expected = new TreeSet<>(new ByteComparator<>(new StringBytesConverter()));
+     int totalBytes = 0;
+     for (String str : new RandomStrings(10 * 10000)) {
+         expected.add(str);
+         // every generated string counts towards the total, including duplicates
+         totalBytes += new StringBytesConverter().convertToBytes(str).length;
+     }
+     int treeNum = 5;
+     TrieDictionaryForest<String> dict = newDictBuilder(expected.iterator(), 0, totalBytes / treeNum).build();
+
+     // roundingFlag > 0: an IllegalArgumentException is accepted only when no ceiling exists
+     for (String query : new RandomStrings(100 * 10000)) {
+         try {
+             int id = dict.getIdFromValue(query, 1);
+             assertEquals(expected.ceiling(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(expected.ceiling(query));
+         }
+     }
+
+     // roundingFlag < 0: an IllegalArgumentException is accepted only when no floor exists
+     for (String query : new RandomStrings(100 * 10000)) {
+         try {
+             int id = dict.getIdFromValue(query, -1);
+             assertEquals(expected.floor(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(expected.floor(query));
+         }
+     }
+ }
+
+ /**
+  * Builds a NumberDictionaryForest from consecutive integers and compares rounded lookups
+  * (flag 1 and flag -1) against a TreeSet ordered by numeric value.
+  */
+ @Test
+ public void longDictRoundingFlagTest() {
+     // order the strings by their numeric value rather than lexicographically
+     TreeSet<String> expected = new TreeSet<>(new Comparator<String>() {
+         @Override
+         public int compare(String o1, String o2) {
+             try {
+                 return Long.compare(Long.parseLong(o1), Long.parseLong(o2));
+             } catch (NumberFormatException e) {
+                 e.printStackTrace();
+                 return 0;
+             }
+         }
+     });
+     int num = 10 * 10000;
+     int totalBytes = 0;
+     StringBytesConverter converter = new StringBytesConverter();
+     // every entry is budgeted with the byte length of this fixed 20-character sample
+     int bytesPerEntry = converter.convertToBytes("-9999999999999952517").length;
+     for (int i = 0, k = -48481; i < num; i++, k++) {
+         expected.add(k + "");
+         totalBytes += bytesPerEntry;
+     }
+     System.out.println("tree num:" + totalBytes);
+     int treeNum = 5;
+     NumberDictionaryForestBuilder<String> builder = new NumberDictionaryForestBuilder<String>(new StringBytesConverter(), 0);
+     builder.setMaxTrieSize(totalBytes / treeNum);
+     for (String value : expected) {
+         builder.addValue(value);
+     }
+     NumberDictionaryForest<String> dict = builder.build();
+     System.out.println(dict.getTreeSize());
+
+     int testTimes = 100 * 10000;
+     Random rand = new Random(System.currentTimeMillis());
+
+     // roundingFlag > 0
+     for (int i = 0; i < testTimes; i++) {
+         String query = rand.nextInt(2 * num) + "";
+         try {
+             int id = dict.getIdFromValue(query, 1);
+             assertEquals(expected.ceiling(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(expected.ceiling(query));
+         }
+     }
+
+     // roundingFlag < 0
+     for (int i = 0; i < testTimes; i++) {
+         String query = rand.nextInt(2 * num) + "";
+         try {
+             int id = dict.getIdFromValue(query, -1);
+             assertEquals(expected.floor(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(expected.floor(query));
+         }
+     }
+ }
+
+ /*
+  Rounded-lookup test for decimal values; compares the forest against a TreeSet ordered by
+  numeric value.
+  Can not pass cases like 1.7695564055819624E-4, so values printed in scientific notation
+  (containing "E") are left out of both the dictionary and the queries.
+  */
+ @Ignore
+ @Test
+ public void doubleDictRoundingFlagTest() {
+     TreeSet<String> set = new TreeSet<>(new Comparator<String>() {
+         @Override
+         public int compare(String o1, String o2) {
+             try {
+                 Double d1 = Double.parseDouble(o1);
+                 Double d2 = Double.parseDouble(o2);
+                 return d1.compareTo(d2);
+             } catch (NumberFormatException e) {
+                 e.printStackTrace();
+                 return 0;
+             }
+         }
+     });
+     int num = 1000000;
+     double k = -0.0;
+     int size = 0;
+     StringBytesConverter converter = new StringBytesConverter();
+     for (int i = 0; i < num; i++) {
+         String value = k + "";
+         set.add(value);
+         k += 1.55;
+         // every entry is budgeted with the byte length of this fixed sample
+         String basic = "-9999999999999952517";
+         size += converter.convertToBytes(basic).length;
+     }
+     int treeNum = 5;
+     NumberDictionaryForestBuilder<String> builder = new NumberDictionaryForestBuilder<String>(new StringBytesConverter(), 0);
+     builder.setMaxTrieSize(size / treeNum);
+     Iterator<String> it = set.iterator();
+     while (it.hasNext()) {
+         String str = it.next();
+         if (str.contains("E")) {
+             // Remove through the iterator: calling set.remove() here would make the
+             // fail-fast TreeSet iterator throw ConcurrentModificationException.
+             it.remove();
+         } else {
+             builder.addValue(str);
+         }
+     }
+
+     NumberDictionaryForest<String> dict = builder.build();
+     System.out.println("tree size:" + dict.getTreeSize());
+     System.out.println("--------------dict-----------------");
+     dict.dump(System.out);
+     System.out.println("--------------set-------------------");
+     System.out.println(set);
+
+     // special value: both lookups are executed, their results are not asserted
+     String query1 = "183.82499999999996";
+     int id1 = dict.getIdFromValue(query1, 1);
+     dict.getValueFromId(id1);
+
+     int testTimes = 1000000;
+     double queryBasic = -145.355;
+     // roundingFlag > 0: an IllegalArgumentException is accepted only when no ceiling exists
+     for (int i = 0; i < testTimes; i++) {
+         String query = queryBasic + "";
+         queryBasic += 1.51;
+         if (query.contains("E"))
+             continue;
+         try {
+             int id = dict.getIdFromValue(query, 1);
+             assertEquals(set.ceiling(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(set.ceiling(query));
+         }
+     }
+
+     // roundingFlag < 0: an IllegalArgumentException is accepted only when no floor exists
+     queryBasic = -551.3588;
+     for (int i = 0; i < testTimes; i++) {
+         String query = queryBasic + "";
+         queryBasic += 1.0;
+         if (query.contains("E"))
+             continue;
+         try {
+             int id = dict.getIdFromValue(query, -1);
+             assertEquals(set.floor(query), dict.getValueFromId(id));
+         } catch (IllegalArgumentException e) {
+             assertNull(set.floor(query));
+         }
+     }
+ }
+
+
private static TrieDictionaryForest<String> testSerialize(TrieDictionaryForest<String> dict) {
try {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
@@ -277,35 +563,54 @@ public class TrieDictionaryForestTest {
}*/
- //benchmark
- @Deprecated
- public void memoryUsageBenchmarkTest() throws Exception {
- //create data
- ArrayList<String> testData = getTestData((int) (Integer.MAX_VALUE * 0.8 / 640));
- int testTimes = 1;
- System.out.println("start memory:" + Runtime.getRuntime().maxMemory());
- System.out.println("start memory:" + Runtime.getRuntime().totalMemory());
- for (int i = 0; i < testTimes; i++) {
- long start = MemoryBudgetController.gcAndGetSystemAvailMB();
- TrieDictionaryBuilder<String> b = new TrieDictionaryBuilder<>(new StringBytesConverter());
- for (String str : testData)
- b.addValue(str);
- long end = MemoryBudgetController.gcAndGetSystemAvailMB();
- System.out.println("object trie memory usage:" + (end - start) + "MB");
- System.out.println("start memory:" + Runtime.getRuntime().maxMemory());
- System.out.println("start memory:" + Runtime.getRuntime().totalMemory());
- /*System.out.println(b == null);
- startMemUse = getSystemCurUsedMemory();
- TrieDictionary<String> dict = b.build(0);
- memUse = getSystemCurUsedMemory();
- System.out.println("array trie memory usage:"+(memUse-startMemUse)/(1024*1024)+"MB");
- System.out.println(b == null );
- System.out.println(dict == null);*/
+ /*
+ add values to the Dictionary until an OOM error is encountered
+ */
+ @Ignore
+ @Test
+ public void memoryUsageBenchmarkOldDictTest() throws Exception {
+ System.out.println("max memory:"+Runtime.getRuntime().maxMemory());
+ System.gc();
+ Thread.currentThread().sleep(1000);
+ NumberDictionaryBuilder<String> b = new NumberDictionaryBuilder<>(new StringBytesConverter());
+ int k = 0;
+ while(true){
+ b.addValue(k+"");
+ if(k%100000 == 0)
+ System.out.println(k);
+ k++;
}
+ //memory:1908932608 entry:17500000
+ }
-
+ @Ignore
+ @Test
+ public void memoryUsageBenchmarkNewDictForestTest() throws Exception {
+ System.out.println("max memory:"+Runtime.getRuntime().maxMemory());
+ System.gc();
+ Thread.currentThread().sleep(3000);
+ NumberDictionaryForestBuilder<String> b = new NumberDictionaryForestBuilder<>(new StringBytesConverter(),0,0);
+ int k = 0;
+ while(true){
+ b.addValue(k+"");
+ if(k%100000 == 0)
+ System.out.println(k);
+ k++;
+ }
+ /*
+ memory:1908932608(1800MB)
+ maxTrieSize:500M entry:17500000
+ maxTrieSize:180M entry:47100000
+ maxTrieSize:100M entry:83800000
+ maxTrieSize:50M entry:128400000
+ maxTrieSize:25M entry:148100000
+ maxTrieSize:0M entry: 5000000
+
+ 5-8
+ */
}
+
@Deprecated
private long getSystemCurUsedMemory() throws Exception {
System.gc();
@@ -559,21 +864,29 @@ public class TrieDictionaryForestTest {
return result;
}
- private static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId) {
+ public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId) {
TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId);
for (String s : strs)
b.addValue(s);
return b;
}
- private static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId, int treeSize) {
+ public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId, int treeSize) {
TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId);
- TrieDictionaryForestBuilder.MaxTrieTreeSize = treeSize;
+ b.setMaxTrieTreeSize(treeSize);
for (String s : strs)
b.addValue(s);
return b;
}
+ /**
+  * Creates a forest builder with the given base id and max trie tree size, then feeds it
+  * every remaining element of the iterator.
+  *
+  * @param strs values to add, consumed until exhausted
+  * @param baseId passed to the builder constructor
+  * @param treeSize passed to {@code setMaxTrieTreeSize}
+  * @return the populated builder (not yet built)
+  */
+ public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterator<String> strs, int baseId, int treeSize) {
+     TrieDictionaryForestBuilder<String> builder = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId);
+     builder.setMaxTrieTreeSize(treeSize);
+     for (; strs.hasNext();) {
+         builder.addValue(strs.next());
+     }
+     return builder;
+ }
+
private static class RandomStrings implements Iterable<String> {
final private int size;
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/core-dictionary/src/test/java/org/apache/kylin/dict/lookup/LookupTableTest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/lookup/LookupTableTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/lookup/LookupTableTest.java
index e4b32db..25d6ae2 100644
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/lookup/LookupTableTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/lookup/LookupTableTest.java
@@ -25,8 +25,10 @@ import java.util.Set;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.DateFormat;
+import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.common.util.LocalFileMetadataTestCase;
import org.apache.kylin.common.util.Pair;
+import org.apache.kylin.dict.TrieDictionaryForest;
import org.apache.kylin.metadata.MetadataManager;
import org.apache.kylin.metadata.model.TableDesc;
import org.junit.After;
@@ -107,6 +109,13 @@ public class LookupTableTest extends LocalFileMetadataTestCase {
}
}
+ /** Prints the fully qualified class name of TrieDictionaryForest; nothing is asserted. */
+ @Test
+ public void testGetClassName() {
+     System.out.println(TrieDictionaryForest.class.getName());
+ }
+
private String millis(String dateStr) {
return String.valueOf(DateFormat.stringToMillis(dateStr));
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
new file mode 100644
index 0000000..6af35d2
--- /dev/null
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.mr;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.common.util.StringSplitter;
+import org.apache.kylin.source.ReadableTable.TableReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Table reader intended only for the output files of FactDistinctColumnsJob2.
+ * <p>
+ * The path may name a single file or a directory; for a directory every entry returned by
+ * {@code FileSystem.listStatus} is read. Files are consumed one after another in list order.
+ * Each file is first opened as a Hadoop SequenceFile; if that fails with an error indicating
+ * the data is not a SequenceFile, all files are re-opened as UTF-8 text and read line by line.
+ */
+public class DFSFileTableSortedReader implements TableReader {
+
+    private static final Logger logger = LoggerFactory.getLogger(DFSFileTableSortedReader.class);
+    private static final char CSV_QUOTE = '"';
+    private static final String[] DETECT_DELIMS = new String[] { "\177", "|", "\t", "," };
+
+    private String filePath;
+    private String delim;
+    private List<RowReader> readerList;
+
+    private String curLine;
+    private String[] curColumns;
+    private int expectedColumnNumber = -1; // helps delimiter detection
+
+    /**
+     * Opens the path with automatic delimiter detection.
+     *
+     * @param filePath file or directory to read
+     * @param expectedColumnNumber number of columns a correctly split line should have
+     * @throws IOException if the files cannot be opened
+     */
+    public DFSFileTableSortedReader(String filePath, int expectedColumnNumber) throws IOException {
+        this(filePath, DFSFileTable.DELIM_AUTO, expectedColumnNumber);
+    }
+
+    /**
+     * Opens the path with an explicit delimiter.
+     *
+     * @param filePath file or directory to read
+     * @param delim column delimiter, or {@code DFSFileTable.DELIM_AUTO} to detect it from the
+     *        first row that is split
+     * @param expectedColumnNumber number of columns a correctly split line should have
+     * @throws IOException if the files cannot be opened
+     */
+    public DFSFileTableSortedReader(String filePath, String delim, int expectedColumnNumber) throws IOException {
+        filePath = HadoopUtil.fixWindowsPath(filePath);
+        this.filePath = filePath;
+        this.delim = delim;
+        this.expectedColumnNumber = expectedColumnNumber;
+        this.readerList = new ArrayList<RowReader>();
+
+        FileSystem fs = HadoopUtil.getFileSystem(filePath);
+
+        ArrayList<FileStatus> allFiles = new ArrayList<>();
+        FileStatus status = fs.getFileStatus(new Path(filePath));
+        if (status.isFile()) {
+            allFiles.add(status);
+        } else {
+            FileStatus[] listStatus = fs.listStatus(new Path(filePath));
+            allFiles.addAll(Arrays.asList(listStatus));
+        }
+
+        try {
+            for (FileStatus f : allFiles) {
+                RowReader rowReader = new SeqRowReader(HadoopUtil.getCurrentConfiguration(), fs, f.getPath().toString());
+                this.readerList.add(rowReader);
+            }
+        } catch (IOException e) {
+            // Release the sequence-file readers opened before the failure; without this they
+            // would leak both on the rethrow and on the CSV fallback below.
+            closeReaders();
+            if (!isExceptionSayingNotSeqFile(e))
+                throw e;
+
+            this.readerList = new ArrayList<RowReader>();
+            try {
+                for (FileStatus f : allFiles) {
+                    RowReader rowReader = new CsvRowReader(fs, f.getPath().toString());
+                    this.readerList.add(rowReader);
+                }
+            } catch (IOException csvFailure) {
+                // do not leave half of the text readers open either
+                closeReaders();
+                throw csvFailure;
+            }
+        }
+    }
+
+    /** Tells whether the failure means "this is not a SequenceFile" rather than a real I/O error. */
+    private boolean isExceptionSayingNotSeqFile(IOException e) {
+        if (e.getMessage() != null && e.getMessage().contains("not a SequenceFile"))
+            return true;
+
+        if (e instanceof EOFException) // in case the file is very very small
+            return true;
+
+        return false;
+    }
+
+    /**
+     * Advances to the next line. Readers are polled in list order; the first one that still
+     * has a line supplies it.
+     *
+     * @return {@code true} if a line is available, {@code false} once every reader is exhausted
+     */
+    @Override
+    public boolean next() throws IOException {
+        for (RowReader reader : readerList) {
+            curLine = reader.nextLine();
+            curColumns = null; // invalidate the cached split of the previous line
+
+            if (curLine != null) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /** Returns the current raw line as last set by {@link #next()}. */
+    public String getLine() {
+        return curLine;
+    }
+
+    /**
+     * Returns the current line split into columns. The split is cached until the next call to
+     * {@link #next()}. With automatic detection the delimiter is resolved on the first call
+     * and reused afterwards; if none is detected the whole line is returned as one column.
+     */
+    @Override
+    public String[] getRow() {
+        if (curColumns == null) {
+            if (DFSFileTable.DELIM_AUTO.equals(delim))
+                delim = autoDetectDelim(curLine);
+
+            if (delim == null)
+                curColumns = new String[] { curLine };
+            else
+                curColumns = split(curLine, delim);
+        }
+        return curColumns;
+    }
+
+    /** Splits a line on the delimiter and, for comma-delimited data, un-escapes each field. */
+    private String[] split(String line, String delim) {
+        // FIXME CSV line should be parsed considering escapes
+        String[] str = StringSplitter.split(line, delim);
+
+        // un-escape CSV
+        if (DFSFileTable.DELIM_COMMA.equals(delim)) {
+            for (int i = 0; i < str.length; i++) {
+                str[i] = unescapeCsv(str[i]);
+            }
+        }
+
+        return str;
+    }
+
+    /** Un-escapes one CSV field and strips a remaining pair of enclosing quotes, if any. */
+    private String unescapeCsv(String str) {
+        if (str == null || str.length() < 2)
+            return str;
+
+        str = StringEscapeUtils.unescapeCsv(str);
+
+        // unescapeCsv may not remove the outer most quotes.
+        // Un-escaping can shrink the field to a single quote character (input """"), so the
+        // length must be checked again before stripping, otherwise substring(1, 0) throws.
+        if (str.length() >= 2 && str.charAt(0) == CSV_QUOTE && str.charAt(str.length() - 1) == CSV_QUOTE)
+            str = str.substring(1, str.length() - 1);
+
+        return str;
+    }
+
+    /** Closes every underlying reader, ignoring errors raised while closing. */
+    @Override
+    public void close() {
+        closeReaders();
+    }
+
+    /** Quietly closes all readers opened so far; kept private so the constructor can call it safely. */
+    private void closeReaders() {
+        for (RowReader reader : readerList) {
+            IOUtils.closeQuietly(reader);
+        }
+    }
+
+    /**
+     * Tries each candidate delimiter and picks the first that splits the line into exactly
+     * {@code expectedColumnNumber} columns.
+     *
+     * @return the detected delimiter, or {@code null} if none matches or no column count was given
+     */
+    private String autoDetectDelim(String line) {
+        if (expectedColumnNumber > 0) {
+            for (String candidate : DETECT_DELIMS) {
+                if (StringSplitter.split(line, candidate).length == expectedColumnNumber) {
+                    logger.info("Auto detect delim to be '{}', split line to {} columns -- {}", candidate, expectedColumnNumber, line);
+                    return candidate;
+                }
+            }
+        }
+
+        logger.info("Auto detect delim to be null, will take THE-WHOLE-LINE as a single value, for {}", filePath);
+        return null;
+    }
+
+    // ============================================================================
+
+    /** Line-oriented view over one underlying file. */
+    private interface RowReader extends Closeable {
+        String nextLine() throws IOException; // return null on EOF
+    }
+
+    /** Reads the values of a SequenceFile as text lines; keys are read but not used. */
+    private static class SeqRowReader implements RowReader {
+        Reader reader;
+        Writable key;
+        Text value;
+
+        SeqRowReader(Configuration hconf, FileSystem fs, String path) throws IOException {
+            reader = new Reader(hconf, Reader.file(new Path(path)));
+            key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf);
+            value = new Text();
+        }
+
+        @Override
+        public String nextLine() throws IOException {
+            boolean hasNext = reader.next(key, value);
+            if (hasNext)
+                return Bytes.toString(value.getBytes(), 0, value.getLength());
+            else
+                return null;
+        }
+
+        @Override
+        public void close() throws IOException {
+            reader.close();
+        }
+    }
+
+    /** Reads a plain text file line by line, decoding it as UTF-8. */
+    private static class CsvRowReader implements RowReader {
+        BufferedReader reader;
+
+        CsvRowReader(FileSystem fs, String path) throws IOException {
+            FSDataInputStream in = fs.open(new Path(path));
+            reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+        }
+
+        @Override
+        public String nextLine() throws IOException {
+            return reader.readLine();
+        }
+
+        @Override
+        public void close() throws IOException {
+            reader.close();
+        }
+
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/JobBuilderSupport.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/JobBuilderSupport.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/JobBuilderSupport.java
index 47eb9c3..9bb867b 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/JobBuilderSupport.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/JobBuilderSupport.java
@@ -27,10 +27,10 @@ import org.apache.kylin.engine.mr.common.HadoopShellExecutable;
import org.apache.kylin.engine.mr.common.MapReduceExecutable;
import org.apache.kylin.engine.mr.steps.CreateDictionaryJob;
import org.apache.kylin.engine.mr.steps.CubingExecutableUtil;
-import org.apache.kylin.engine.mr.steps.FactDistinctColumnsJob;
import org.apache.kylin.engine.mr.steps.MergeDictionaryStep;
import org.apache.kylin.engine.mr.steps.UpdateCubeInfoAfterBuildStep;
import org.apache.kylin.engine.mr.steps.UpdateCubeInfoAfterMergeStep;
+import org.apache.kylin.engine.mr.steps.fdc2.FactDistinctColumnsJob2;
import org.apache.kylin.job.constant.ExecutableConstants;
import org.apache.kylin.job.engine.JobEngineConfig;
@@ -63,7 +63,7 @@ public class JobBuilderSupport {
private MapReduceExecutable createFactDistinctColumnsStep(String jobId, boolean withStats) {
MapReduceExecutable result = new MapReduceExecutable();
result.setName(ExecutableConstants.STEP_NAME_FACT_DISTINCT_COLUMNS);
- result.setMapReduceJobClass(FactDistinctColumnsJob.class);
+ result.setMapReduceJobClass(FactDistinctColumnsJob2.class);
StringBuilder cmd = new StringBuilder();
appendMapReduceParameters(cmd);
appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsCombiner2.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsCombiner2.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsCombiner2.java
index 6652f4e..289edd0 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsCombiner2.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsCombiner2.java
@@ -26,7 +26,7 @@ import java.io.IOException;
/**
* @author yangli9
*/
-public class FactDistinctColumnsCombiner2 extends KylinReducer<SelfDefineSortableKey, Text, Text, Text> {
+public class FactDistinctColumnsCombiner2 extends KylinReducer<SelfDefineSortableKey, Text, SelfDefineSortableKey, Text> {
@Override
protected void setup(Context context) throws IOException {
@@ -36,9 +36,10 @@ public class FactDistinctColumnsCombiner2 extends KylinReducer<SelfDefineSortabl
@Override
public void doReduce(SelfDefineSortableKey key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
+
// for hll, each key only has one output, no need to do local combine;
// for normal col, values are empty text
- context.write(key.getText(), values.iterator().next());
+ context.write(key, values.iterator().next());
}
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsJob2.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsJob2.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsJob2.java
index 4d26402..2e84f45 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsJob2.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsJob2.java
@@ -34,7 +34,6 @@ import org.apache.kylin.engine.mr.IMRInput.IMRTableInputFormat;
import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
-import org.apache.kylin.engine.mr.steps.FactDistinctColumnsReducer;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -127,7 +126,7 @@ public class FactDistinctColumnsJob2 extends AbstractHadoopJob {
}
private void setupReducer(Path output, int numberOfReducers) throws IOException {
- job.setReducerClass(FactDistinctColumnsReducer.class); //reducer do not need to change
+ job.setReducerClass(FactDistinctColumnsReducer2.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
http://git-wip-us.apache.org/repos/asf/kylin/blob/734a4f98/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsMapperBase2.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsMapperBase2.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsMapperBase2.java
index 6238d22..037afeb 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsMapperBase2.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/fdc2/FactDistinctColumnsMapperBase2.java
@@ -41,6 +41,7 @@ import java.util.List;
/**
*/
+
abstract public class FactDistinctColumnsMapperBase2<KEYIN, VALUEIN> extends KylinMapper<KEYIN, VALUEIN, SelfDefineSortableKey, Text> {
protected String cubeName;
[3/3] kylin git commit: KYLIN-1851 code review format
Posted by li...@apache.org.
KYLIN-1851 code review format
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/350547e6
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/350547e6
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/350547e6
Branch: refs/heads/master
Commit: 350547e6ec6634008a4d07d771822f81acc2bcbe
Parents: 734a4f9
Author: Li Yang <li...@apache.org>
Authored: Wed Nov 16 18:46:49 2016 +0800
Committer: Li Yang <li...@apache.org>
Committed: Wed Nov 16 18:46:49 2016 +0800
----------------------------------------------------------------------
build/conf/kylin.properties | 3 +-
.../apache/kylin/common/KylinConfigBase.java | 4 +-
.../org/apache/kylin/cube/model/CubeDesc.java | 4 +-
.../org/apache/kylin/cube/CubeDescTest.java | 28 +--
.../apache/kylin/dict/DictionaryGenerator.java | 10 +-
.../apache/kylin/dict/DictionaryManager.java | 3 -
.../dict/NumberDictionaryForestBuilder.java | 7 +-
.../org/apache/kylin/dict/TrieDictionary.java | 6 -
.../apache/kylin/dict/TrieDictionaryForest.java | 72 +-----
.../kylin/dict/TrieDictionaryForestBuilder.java | 10 +-
.../apache/kylin/dict/NumberDictionaryTest.java | 8 +-
.../engine/mr/DFSFileTableSortedReader.java | 249 -------------------
.../mr/steps/NumberDictionaryForestTest.java | 25 +-
.../test_case_data/sandbox/kylin.properties | 9 +-
.../org/apache/kylin/query/KylinTestBase.java | 3 -
15 files changed, 54 insertions(+), 387 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/build/conf/kylin.properties
----------------------------------------------------------------------
diff --git a/build/conf/kylin.properties b/build/conf/kylin.properties
index 715b7a6..3b50c12 100644
--- a/build/conf/kylin.properties
+++ b/build/conf/kylin.properties
@@ -118,6 +118,7 @@ kylin.job.mapreduce.mapper.input.rows=1000000
kylin.job.step.timeout=7200
+
### CUBE ###
# 'auto', 'inmem', 'layer' or 'random' for testing
@@ -131,8 +132,6 @@ kylin.dictionary.max.cardinality=5000000
kylin.table.snapshot.max_mb=300
-#max size for one trie in TrieDictionaryForest (default 500MB)
-
### QUERY ###
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
----------------------------------------------------------------------
diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
index 300f727..c7dd8a8 100644
--- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
+++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
@@ -210,7 +210,7 @@ abstract public class KylinConfigBase implements Serializable {
public String[] getRealizationProviders() {
return getOptionalStringArray("kylin.realization.providers", //
- new String[]{"org.apache.kylin.cube.CubeManager", "org.apache.kylin.storage.hybrid.HybridManager"});
+ new String[] {"org.apache.kylin.cube.CubeManager", "org.apache.kylin.storage.hybrid.HybridManager"});
}
public CliCommandExecutor getCliCommandExecutor() throws IOException {
@@ -591,7 +591,7 @@ abstract public class KylinConfigBase implements Serializable {
}
public int[] getQueryMetricsPercentilesIntervals() {
- String[] dft = {"60", "300", "3600"};
+ String[] dft = { "60", "300", "3600" };
return getOptionalIntArray("kylin.query.metrics.percentiles.intervals", dft);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java b/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
index c9ebff8..7dad87b 100644
--- a/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
+++ b/core-cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java
@@ -755,7 +755,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
int find = ArrayUtils.indexOf(dimColArray, fk[i]);
if (find >= 0) {
TblColRef derivedCol = initDimensionColRef(pk[i]);
- initDerivedMap(new TblColRef[]{dimColArray[find]}, DeriveType.PK_FK, dim, new TblColRef[]{derivedCol}, null);
+ initDerivedMap(new TblColRef[] { dimColArray[find] }, DeriveType.PK_FK, dim, new TblColRef[] { derivedCol }, null);
}
}
}
@@ -776,7 +776,7 @@ public class CubeDesc extends RootPersistentEntity implements IEngineAware {
extra[i] = "";
}
}
- return new String[][]{cols, extra};
+ return new String[][] { cols, extra };
}
private void initDerivedMap(TblColRef[] hostCols, DeriveType type, DimensionDesc dimension, TblColRef[] derivedCols, String[] extra) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java b/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
index 3326b24..9ad6427 100644
--- a/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/cube/CubeDescTest.java
@@ -89,7 +89,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit3() throws Exception {
thrown.expect(IllegalStateException.class);
- thrown.expectMessage("Aggregation group 0 'includes' dimensions not include all the dimensions:" + sortStrs(new String[]{"SELLER_ID", "META_CATEG_NAME", "LSTG_FORMAT_NAME", "LSTG_SITE_ID", "SLR_SEGMENT_CD"}));
+ thrown.expectMessage("Aggregation group 0 'includes' dimensions not include all the dimensions:" + sortStrs(new String[] { "SELLER_ID", "META_CATEG_NAME", "LSTG_FORMAT_NAME", "LSTG_SITE_ID", "SLR_SEGMENT_CD" }));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
String[] temp = Arrays.asList(cubeDesc.getAggregationGroups().get(0).getIncludes()).subList(0, 3).toArray(new String[3]);
cubeDesc.getAggregationGroups().get(0).setIncludes(temp);
@@ -114,7 +114,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit5() throws Exception {
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[]{"seller_id", "META_CATEG_NAME"};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[] { "seller_id", "META_CATEG_NAME" };
cubeDesc.init(getTestConfig());
}
@@ -122,7 +122,7 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
@Test
public void testBadInit6() throws Exception {
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[]{"seller_id", "lstg_format_name"};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().mandatory_dims = new String[] { "seller_id", "lstg_format_name" };
cubeDesc.init(getTestConfig());
}
@@ -133,43 +133,43 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
thrown.expectMessage("Aggregation group 0 require at least 2 dimensions in a joint");
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"lstg_format_name"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "lstg_format_name" } };
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit8() throws Exception {
- String[] strs = new String[]{"CATEG_LVL2_NAME", "META_CATEG_NAME"};
+ String[] strs = new String[] { "CATEG_LVL2_NAME", "META_CATEG_NAME" };
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 hierarchy dimensions overlap with joint dimensions: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME" } };
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit9() throws Exception {
- String[] strs = new String[]{"lstg_format_name", "META_CATEG_NAME"};
+ String[] strs = new String[] { "lstg_format_name", "META_CATEG_NAME" };
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 hierarchy dimensions overlap with joint dimensions: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME"}, new String[]{"lstg_format_name", "lstg_site_id"}};
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"META_CATEG_NAME", "lstg_format_name"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME" }, new String[] { "lstg_format_name", "lstg_site_id" } };
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "META_CATEG_NAME", "lstg_format_name" } };
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit10() throws Exception {
- String[] strs = new String[]{"lstg_format_name", "lstg_site_id"};
+ String[] strs = new String[] { "lstg_format_name", "lstg_site_id" };
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 a dimension exist in more than one joint: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][]{new String[]{"lstg_format_name", "lstg_site_id", "slr_segment_cd"}, new String[]{"lstg_format_name", "lstg_site_id", "leaf_categ_id"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().joint_dims = new String[][] { new String[] { "lstg_format_name", "lstg_site_id", "slr_segment_cd" }, new String[] { "lstg_format_name", "lstg_site_id", "leaf_categ_id" } };
cubeDesc.init(getTestConfig());
}
@@ -180,19 +180,19 @@ public class CubeDescTest extends LocalFileMetadataTestCase {
thrown.expectMessage("Aggregation group 0 require at least 2 dimensions in a hierarchy.");
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME" } };
cubeDesc.init(getTestConfig());
}
@Test
public void testBadInit12() throws Exception {
- String[] strs = new String[]{"CATEG_LVL2_NAME", "META_CATEG_NAME"};
+ String[] strs = new String[] { "CATEG_LVL2_NAME", "META_CATEG_NAME" };
thrown.expect(IllegalStateException.class);
thrown.expectMessage("Aggregation group 0 a dimension exist in more than one hierarchy: " + sortStrs(strs));
CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_with_slr_desc");
- cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][]{new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME"}, new String[]{"META_CATEG_NAME", "CATEG_LVL2_NAME"}};
+ cubeDesc.getAggregationGroups().get(0).getSelectRule().hierarchy_dims = new String[][] { new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME", "CATEG_LVL3_NAME" }, new String[] { "META_CATEG_NAME", "CATEG_LVL2_NAME" } };
cubeDesc.init(getTestConfig());
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
index 8695338..8eafe5f 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryGenerator.java
@@ -36,12 +36,12 @@ import com.google.common.base.Preconditions;
/**
* @author yangli9
*/
-@SuppressWarnings({"rawtypes", "unchecked"})
+@SuppressWarnings({ "rawtypes", "unchecked" })
public class DictionaryGenerator {
private static final Logger logger = LoggerFactory.getLogger(DictionaryGenerator.class);
- private static final String[] DATE_PATTERNS = new String[]{"yyyy-MM-dd", "yyyyMMdd"};
+ private static final String[] DATE_PATTERNS = new String[] { "yyyy-MM-dd", "yyyyMMdd" };
public static Dictionary<String> buildDictionary(DataType dataType, IDictionaryValueEnumerator valueEnumerator) throws IOException {
Preconditions.checkNotNull(dataType, "dataType cannot be null");
@@ -138,7 +138,6 @@ public class DictionaryGenerator {
@Override
public Dictionary<String> build(DictionaryInfo dictInfo, IDictionaryValueEnumerator valueEnumerator, int baseId, int nSamples, ArrayList<String> returnSamples) throws IOException {
int maxTrieSizeInMB = TrieDictionaryForestBuilder.getMaxTrieSizeInMB();
- //TrieDictionaryBuilder builder = new TrieDictionaryBuilder(new StringBytesConverter());
TrieDictionaryForestBuilder builder = new TrieDictionaryForestBuilder(new StringBytesConverter(), baseId, maxTrieSizeInMB);
byte[] value;
while (valueEnumerator.moveNext()) {
@@ -151,15 +150,13 @@ public class DictionaryGenerator {
returnSamples.add(v);
}
return builder.build();
- //return builder.build(baseId);
}
}
private static class NumberDictBuilder implements IDictionaryBuilder {
@Override
public Dictionary<String> build(DictionaryInfo dictInfo, IDictionaryValueEnumerator valueEnumerator, int baseId, int nSamples, ArrayList<String> returnSamples) throws IOException {
- int maxTrieSizeInMB = TrieDictionaryForestBuilder.getMaxTrieSizeInMB();
- NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(new StringBytesConverter(), baseId, maxTrieSizeInMB);
+ NumberDictionaryForestBuilder builder = new NumberDictionaryForestBuilder(new StringBytesConverter(), baseId);
byte[] value;
while (valueEnumerator.moveNext()) {
value = valueEnumerator.current();
@@ -177,5 +174,4 @@ public class DictionaryGenerator {
}
}
-
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
index b8d039e..2dd5085 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DictionaryManager.java
@@ -418,9 +418,6 @@ public class DictionaryManager {
logger.info("DictionaryManager(" + System.identityHashCode(this) + ") loading DictionaryInfo(loadDictObj:" + loadDictObj + ") at " + resourcePath);
DictionaryInfo info = store.getResource(resourcePath, DictionaryInfo.class, loadDictObj ? DictionaryInfoSerializer.FULL_SERIALIZER : DictionaryInfoSerializer.INFO_SERIALIZER);
- //info.dictionaryObject.dump(System.out);
- // if (loadDictObj)
- // logger.debug("Loaded dictionary at " + resourcePath);
return info;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
index c997ce1..519d4c3 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionaryForestBuilder.java
@@ -29,8 +29,7 @@ public class NumberDictionaryForestBuilder<T> {
private BytesConverter<T> bytesConverter;
- private NumberDictionaryForest.NumberBytesCodec codec = new NumberDictionaryForest.NumberBytesCodec(
- NumberDictionaryForest.MAX_DIGITS_BEFORE_DECIMAL_POINT);
+ private NumberDictionaryForest.NumberBytesCodec codec = new NumberDictionaryForest.NumberBytesCodec(NumberDictionaryForest.MAX_DIGITS_BEFORE_DECIMAL_POINT);
public NumberDictionaryForestBuilder(BytesConverter<T> bytesConverter) {
this(bytesConverter, 0);
@@ -50,8 +49,6 @@ public class NumberDictionaryForestBuilder<T> {
addValue(bytesConverter.convertToBytes(value));
}
-
-
public void addValue(byte[] value) {
codec.encodeNumber(value, 0, value.length);
byte[] copy = Bytes.copy(codec.buf, codec.bufOffset, codec.bufLen);
@@ -64,7 +61,7 @@ public class NumberDictionaryForestBuilder<T> {
return new NumberDictionaryForest<T>(forest, bytesConverter);
}
- public void setMaxTrieSize(int size){
+ public void setMaxTrieSize(int size) {
this.trieBuilder.setMaxTrieTreeSize(size);
}
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
index a5e3d36..c099de0 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
@@ -126,7 +126,6 @@ public class TrieDictionary<T> extends Dictionary<T> {
else
throw new RuntimeException(e);
}
- //this.enableValueCache = false;
if (enableValueCache) {
valueToIdCache = new SoftReference<Map>(new ConcurrentHashMap());
idToValueCache = new SoftReference<Object[]>(new Object[nValues]);
@@ -156,7 +155,6 @@ public class TrieDictionary<T> extends Dictionary<T> {
@Override
final protected int getIdFromValueImpl(T value, int roundingFlag) {
if (enableValueCache && roundingFlag == 0) {
- //System.out.println("use id cache");
Map cache = valueToIdCache.get(); // SoftReference to skip cache gracefully when short of memory
if (cache != null) {
Integer id = null;
@@ -171,7 +169,6 @@ public class TrieDictionary<T> extends Dictionary<T> {
return id;
}
}
- //System.out.println("not use id cache");
byte[] valueBytes = bytesConvert.convertToBytes(value);
return getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag);
}
@@ -273,7 +270,6 @@ public class TrieDictionary<T> extends Dictionary<T> {
@Override
final protected T getValueFromIdImpl(int id) {
if (enableValueCache) {
- //System.out.println("use value cache");
Object[] cache = idToValueCache.get(); // SoftReference to skip cache gracefully when short of memory
if (cache != null) {
int seq = calcSeqNoFromId(id);
@@ -288,10 +284,8 @@ public class TrieDictionary<T> extends Dictionary<T> {
return result;
}
}
- //System.out.println("not use value cache");
byte[] value = new byte[getSizeOfValue()];
int length = getValueBytesFromId(id, value, 0);
- //System.out.println("get value by id:"+id+" value:"+bytesConvert.convertFromBytes(value, 0, length).toString());
return bytesConvert.convertFromBytes(value, 0, length);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
index b0440db..38cd0dc 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
@@ -18,14 +18,6 @@
package org.apache.kylin.dict;
-import org.apache.kylin.common.util.ByteArray;
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.common.util.BytesUtil;
-import org.apache.kylin.common.util.ClassUtil;
-import org.apache.kylin.common.util.Dictionary;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
@@ -37,6 +29,12 @@ import java.util.Collections;
import java.util.Comparator;
import java.util.List;
+import org.apache.kylin.common.util.ByteArray;
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.common.util.BytesUtil;
+import org.apache.kylin.common.util.ClassUtil;
+import org.apache.kylin.common.util.Dictionary;
+
/**
* use trie forest to optimize trie dictionary
@@ -46,8 +44,7 @@ import java.util.List;
* Created by xiefan on 16-10-26.
*/
public class TrieDictionaryForest<T> extends Dictionary<T> {
-
- private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForest.class);
+ private static final long serialVersionUID = 1L;
private ArrayList<TrieDictionary<T>> trees;
@@ -168,30 +165,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
if (id < 0) {
throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
}
- //System.out.println("getIdFromValue value:"+bytesConvert.convertFromBytes(value,offset,len)+" id:"+id);
return id;
}
- //id --> value
- private boolean printstr = false;
-
@Override
protected T getValueFromIdImpl(int id) throws IllegalArgumentException {
- //System.out.println("here");
byte[] data = getValueBytesFromIdImpl(id);
if (data != null) {
- if (!printstr) {
- System.out.println("getValueFromIdImpl id:" + id + " value:" + bytesConvert.convertFromBytes(data, 0, data.length));
- printstr = true;
- }
return bytesConvert.convertFromBytes(data, 0, data.length);
} else {
return null;
}
}
- private boolean isPrintstr2 = false;
-
@Override
protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset)
throws IllegalArgumentException {
@@ -199,14 +185,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
int index = findIndexById(id);
int treeInnerOffset = getTreeInnerOffset(id, index);
TrieDictionary<T> tree = trees.get(index);
- //getValueIndexTime2.addAndGet(System.currentTimeMillis() - startTime);
- //startTime = System.currentTimeMillis();
int size = tree.getValueBytesFromIdImpl(treeInnerOffset, returnValue, offset);
- if (!isPrintstr2) {
- isPrintstr2 = true;
- System.out.println("getValueBytesFromIdImpl id:" + id + " value:" + bytesConvert.convertFromBytes(returnValue, offset, size));
- }
- //getValueTime2.addAndGet(System.currentTimeMillis() - startTime);
return size;
}
@@ -250,19 +229,10 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public void write(DataOutput out) throws IOException {
- System.out.println("write dict");
writeHead(out);
writeBody(out);
}
- /*private int compare(T value1,T value2){
- byte[] b1 = bytesConvert.convertToBytes(value1);
- byte[] b2 = bytesConvert.convertToBytes(value2);
- ByteArray ba1 = new ByteArray(b1,0,b1.length);
- ByteArray ba2 = new ByteArray(b2,0,b2.length);
- return ba1.compareTo(ba2);
- }*/
-
private void writeHead(DataOutput out) throws IOException {
ByteArrayOutputStream byteBuf = new ByteArrayOutputStream();
DataOutputStream headOut = new DataOutputStream(byteBuf);
@@ -299,8 +269,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public void readFields(DataInput in) throws IOException {
- System.out.println("read dict");
try {
+ @SuppressWarnings("unused")
int headSize = in.readInt();
this.baseId = in.readInt();
String converterName = in.readUTF();
@@ -371,15 +341,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
return Collections.unmodifiableList(this.trees);
}
- private boolean onlyOneTree() {
- return trees.size() == 1;
- }
-
- private int findIndexByValue(T value) {
- byte[] valueBytes = bytesConvert.convertToBytes(value);
- return findIndexByValue(new ByteArray(valueBytes, 0, valueBytes.length));
- }
-
private int findIndexByValue(ByteArray value) {
int index = lowerBound(value, new Comparator<ByteArray>() {
@Override
@@ -421,29 +382,13 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
found = true;
}
if (found) {
- //System.out.println("look for:"+lookfor+" index:"+mid);
return mid;
} else {
- //System.out.println("look for:"+lookfor+" index:"+Math.max(left,right));
return Math.min(left, right); //value may be bigger than the right tree
}
}
public static void main(String[] args) {
- /*ArrayList<Integer> list = new ArrayList<>();
- list.add(3);
- list.add(10);
- list.add(15);
- Comparator<Integer> comp = new Comparator<Integer>() {
- @Override
- public int compare(Integer o1, Integer o2) {
- return o1.compareTo(o2);
- }
- };
- int[] nums = {-1,0,1,2,3,4,13,15,16};
- for(int i : nums){
- System.out.println("found value:"+i+" index:"+lowerBound(i,comp,list));
- }*/
ArrayList<String> list = new ArrayList<>();
list.add("\u4e00");
list.add("\u4e8c");
@@ -464,7 +409,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
}
}, list));
}
- //System.out.println(BytesUtil.safeCompareBytes("\u4e8c".getBytes(),"\u4e09".getBytes()));
}
public BytesConverter<T> getBytesConvert() {
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
index 5e2c346..1ceac27 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
@@ -61,15 +61,7 @@ public class TrieDictionaryForestBuilder<T> {
}
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId) {
- this.bytesConverter = bytesConverter;
- this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
- this.baseId = baseId;
- curOffset = 0;
- int maxTrieTreeSizeMB = getMaxTrieSizeInMB();
- this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024;
- logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
- //System.out.println("max trie size:"+maxTrieTreeSize);
- //stringComparator = new ByteComparator<>(new StringBytesConverter());
+ this(bytesConverter, baseId, getMaxTrieSizeInMB());
}
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId, int maxTrieTreeSizeMB) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
index a9c4980..ea6358d 100644
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/NumberDictionaryTest.java
@@ -16,7 +16,7 @@
* limitations under the License.
*/
-/*package org.apache.kylin.dict;
+package org.apache.kylin.dict;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
@@ -35,6 +35,7 @@ import org.apache.kylin.common.util.LocalFileMetadataTestCase;
import org.apache.kylin.metadata.datatype.DataType;
import org.junit.After;
import org.junit.Before;
+import org.junit.Ignore;
import org.junit.Test;
import com.google.common.collect.Lists;
@@ -42,7 +43,7 @@ import com.google.common.collect.Sets;
/**
*/
-/*public class NumberDictionaryTest extends LocalFileMetadataTestCase {
+public class NumberDictionaryTest extends LocalFileMetadataTestCase {
NumberDictionary.NumberBytesCodec codec = new NumberDictionary.NumberBytesCodec(NumberDictionary.MAX_DIGITS_BEFORE_DECIMAL_POINT);
Random rand = new Random();
@@ -70,6 +71,7 @@ import com.google.common.collect.Sets;
assertEquals(1, maxId);
}
+ @Ignore
@SuppressWarnings("unchecked")
@Test
public void testEmptyInput() throws IOException {
@@ -207,4 +209,4 @@ import com.google.common.collect.Sets;
return buf.toString();
}
-}*/
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
deleted file mode 100644
index 6af35d2..0000000
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/DFSFileTableSortedReader.java
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.engine.mr;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.StringEscapeUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile.Reader;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.common.util.StringSplitter;
-import org.apache.kylin.source.ReadableTable.TableReader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.Closeable;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * only use for reading output file of FactDistinctColumnsJob2
- */
-public class DFSFileTableSortedReader implements TableReader {
-
- private static final Logger logger = LoggerFactory.getLogger(DFSFileTableSortedReader.class);
- private static final char CSV_QUOTE = '"';
- private static final String[] DETECT_DELIMS = new String[] { "\177", "|", "\t", "," };
-
- private String filePath;
- private String delim;
- private List<RowReader> readerList;
-
- private String curLine;
- private String[] curColumns;
- private int expectedColumnNumber = -1; // helps delimiter detection
-
- public DFSFileTableSortedReader(String filePath, int expectedColumnNumber) throws IOException {
- this(filePath, DFSFileTable.DELIM_AUTO, expectedColumnNumber);
- }
-
- public DFSFileTableSortedReader(String filePath, String delim, int expectedColumnNumber) throws IOException {
- filePath = HadoopUtil.fixWindowsPath(filePath);
- this.filePath = filePath;
- this.delim = delim;
- this.expectedColumnNumber = expectedColumnNumber;
- this.readerList = new ArrayList<RowReader>();
-
- FileSystem fs = HadoopUtil.getFileSystem(filePath);
-
- ArrayList<FileStatus> allFiles = new ArrayList<>();
- FileStatus status = fs.getFileStatus(new Path(filePath));
- if (status.isFile()) {
- allFiles.add(status);
- } else {
- FileStatus[] listStatus = fs.listStatus(new Path(filePath));
- allFiles.addAll(Arrays.asList(listStatus));
- }
-
- try {
- for (FileStatus f : allFiles) {
- RowReader rowReader = new SeqRowReader(HadoopUtil.getCurrentConfiguration(), fs, f.getPath().toString());
- this.readerList.add(rowReader);
- }
- } catch (IOException e) {
- if (isExceptionSayingNotSeqFile(e) == false)
- throw e;
-
- this.readerList = new ArrayList<RowReader>();
- for (FileStatus f : allFiles) {
- RowReader rowReader = new CsvRowReader(fs, f.getPath().toString());
- this.readerList.add(rowReader);
- }
- }
- }
-
- private boolean isExceptionSayingNotSeqFile(IOException e) {
- if (e.getMessage() != null && e.getMessage().contains("not a SequenceFile"))
- return true;
-
- if (e instanceof EOFException) // in case the file is very very small
- return true;
-
- return false;
- }
-
- @Override
- public boolean next() throws IOException {
- int curReaderIndex = -1;
- RowReader curReader;
-
- while (++curReaderIndex < readerList.size()) {
- curReader = readerList.get(curReaderIndex);
- curLine = curReader.nextLine();
- curColumns = null;
-
- if (curLine != null) {
- return true;
- }
- }
-
- return false;
- }
-
- public String getLine() {
- return curLine;
- }
-
- @Override
- public String[] getRow() {
- if (curColumns == null) {
- if (DFSFileTable.DELIM_AUTO.equals(delim))
- delim = autoDetectDelim(curLine);
-
- if (delim == null)
- curColumns = new String[] { curLine };
- else
- curColumns = split(curLine, delim);
- }
- return curColumns;
- }
-
- private String[] split(String line, String delim) {
- // FIXME CVS line should be parsed considering escapes
- String[] str = StringSplitter.split(line, delim);
-
- // un-escape CSV
- if (DFSFileTable.DELIM_COMMA.equals(delim)) {
- for (int i = 0; i < str.length; i++) {
- str[i] = unescapeCsv(str[i]);
- }
- }
-
- return str;
- }
-
- private String unescapeCsv(String str) {
- if (str == null || str.length() < 2)
- return str;
-
- str = StringEscapeUtils.unescapeCsv(str);
-
- // unescapeCsv may not remove the outer most quotes
- if (str.charAt(0) == CSV_QUOTE && str.charAt(str.length() - 1) == CSV_QUOTE)
- str = str.substring(1, str.length() - 1);
-
- return str;
- }
-
- @Override
- public void close() {
- for (RowReader reader : readerList) {
- IOUtils.closeQuietly(reader);
- }
- }
-
- private String autoDetectDelim(String line) {
- if (expectedColumnNumber > 0) {
- for (String delim : DETECT_DELIMS) {
- if (StringSplitter.split(line, delim).length == expectedColumnNumber) {
- logger.info("Auto detect delim to be '" + delim + "', split line to " + expectedColumnNumber + " columns -- " + line);
- return delim;
- }
- }
- }
-
- logger.info("Auto detect delim to be null, will take THE-WHOLE-LINE as a single value, for " + filePath);
- return null;
- }
-
- // ============================================================================
-
- private interface RowReader extends Closeable {
- String nextLine() throws IOException; // return null on EOF
- }
-
- private class SeqRowReader implements RowReader {
- Reader reader;
- Writable key;
- Text value;
-
- SeqRowReader(Configuration hconf, FileSystem fs, String path) throws IOException {
- reader = new Reader(hconf, Reader.file(new Path(path)));
- key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf);
- value = new Text();
- }
-
- @Override
- public String nextLine() throws IOException {
- boolean hasNext = reader.next(key, value);
- if (hasNext)
- return Bytes.toString(value.getBytes(), 0, value.getLength());
- else
- return null;
- }
-
- @Override
- public void close() throws IOException {
- reader.close();
- }
- }
-
- private class CsvRowReader implements RowReader {
- BufferedReader reader;
-
- CsvRowReader(FileSystem fs, String path) throws IOException {
- FSDataInputStream in = fs.open(new Path(path));
- reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- }
-
- @Override
- public String nextLine() throws IOException {
- return reader.readLine();
- }
-
- @Override
- public void close() throws IOException {
- reader.close();
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
index 554ee9c..66946b7 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/NumberDictionaryForestTest.java
@@ -1,16 +1,7 @@
package org.apache.kylin.engine.mr.steps;
-import org.apache.hadoop.io.Text;
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.dict.NumberDictionary;
-import org.apache.kylin.dict.NumberDictionaryBuilder;
-import org.apache.kylin.dict.NumberDictionaryForest;
-import org.apache.kylin.dict.NumberDictionaryForestBuilder;
-import org.apache.kylin.dict.StringBytesConverter;
-import org.apache.kylin.dict.TrieDictionaryForestBuilder;
-import org.apache.kylin.engine.mr.steps.fdc2.SelfDefineSortableKey;
-import org.apache.kylin.engine.mr.steps.fdc2.TypeFlag;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -25,8 +16,16 @@ import java.util.List;
import java.util.Random;
import java.util.UUID;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import org.apache.hadoop.io.Text;
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.dict.NumberDictionary;
+import org.apache.kylin.dict.NumberDictionaryBuilder;
+import org.apache.kylin.dict.NumberDictionaryForest;
+import org.apache.kylin.dict.NumberDictionaryForestBuilder;
+import org.apache.kylin.dict.StringBytesConverter;
+import org.apache.kylin.engine.mr.steps.fdc2.SelfDefineSortableKey;
+import org.apache.kylin.engine.mr.steps.fdc2.TypeFlag;
+import org.junit.Test;
/**
* Created by xiefan on 16-11-2.
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/examples/test_case_data/sandbox/kylin.properties
----------------------------------------------------------------------
diff --git a/examples/test_case_data/sandbox/kylin.properties b/examples/test_case_data/sandbox/kylin.properties
index 93b86c9..de1250f 100644
--- a/examples/test_case_data/sandbox/kylin.properties
+++ b/examples/test_case_data/sandbox/kylin.properties
@@ -63,7 +63,7 @@ kylin.job.retry=0
# you will have to specify kylin.job.remote.cli.hostname, kylin.job.remote.cli.username and kylin.job.remote.cli.password
# It should not be set to "true" unless you're NOT running Kylin.sh on a hadoop client machine
# (Thus kylin instance has to ssh to another real hadoop client machine to execute hbase,hive,hadoop commands)
-kylin.job.run.as.remote.cmd=true
+kylin.job.run.as.remote.cmd=false
# Only necessary when kylin.job.run.as.remote.cmd=true
kylin.job.remote.cli.hostname=sandbox
@@ -112,10 +112,12 @@ kylin.job.uhc.reducer.count=1
### CUBE ###
+# dictionary forest cut
+kylin.dictionary.forest.trie.size.max_mb=500
+
# 'auto', 'inmem', 'layer' or 'random' for testing
kylin.cube.algorithm=random
-
# Enable/disable ACL check for cube query
kylin.query.security.enabled=true
@@ -161,6 +163,3 @@ kylin.query.metrics.percentiles.intervals=60, 360, 3600
# Env DEV|QA|PROD
deploy.env=DEV
-
-#default 500MB
-kylin.dictionary.forest.trie.size.max_mb=500
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/kylin/blob/350547e6/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
----------------------------------------------------------------------
diff --git a/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java b/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
index e1303e4..52461c4 100644
--- a/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
+++ b/kylin-it/src/test/java/org/apache/kylin/query/KylinTestBase.java
@@ -491,10 +491,7 @@ public class KylinTestBase {
ITable h2Table = executeQuery(h2Conn, queryName, sql, needSort);
try {
- //compare before junit
// compare the result
- System.out.println("h2 Table rows count:"+h2Table.getRowCount());
- System.out.println("kylin Table rows count:"+kylinTable.getRowCount());
Assertion.assertEquals(h2Table, kylinTable);
} catch (Throwable t) {
printInfo("execAndCompQuery failed on: " + sqlFile.getAbsolutePath());