Posted to commits@kylin.apache.org by li...@apache.org on 2017/01/12 04:10:02 UTC
[30/50] [abbrv] kylin git commit: KYLIN-2331 by layer spark cubing
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/BaseCuboidBuilder.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/BaseCuboidBuilder.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/BaseCuboidBuilder.java
new file mode 100644
index 0000000..07b636b
--- /dev/null
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/BaseCuboidBuilder.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.mr.common;
+
+import com.google.common.collect.Sets;
+import org.apache.kylin.common.KylinConfig;
+import org.apache.kylin.common.util.Dictionary;
+import org.apache.kylin.cube.CubeSegment;
+import org.apache.kylin.cube.cuboid.Cuboid;
+import org.apache.kylin.cube.kv.AbstractRowKeyEncoder;
+import org.apache.kylin.cube.model.CubeDesc;
+import org.apache.kylin.cube.model.CubeJoinedFlatTableEnrich;
+import org.apache.kylin.measure.BufferedMeasureCodec;
+import org.apache.kylin.measure.MeasureIngester;
+import org.apache.kylin.metadata.model.FunctionDesc;
+import org.apache.kylin.metadata.model.MeasureDesc;
+import org.apache.kylin.metadata.model.ParameterDesc;
+import org.apache.kylin.metadata.model.TblColRef;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ */
+public class BaseCuboidBuilder implements java.io.Serializable {
+
+ protected static final Logger logger = LoggerFactory.getLogger(BaseCuboidBuilder.class);
+ public static final String HIVE_NULL = "\\N";
+ protected String cubeName;
+ protected Cuboid baseCuboid;
+ protected CubeDesc cubeDesc;
+ protected CubeSegment cubeSegment;
+ protected Set<String> nullStrs;
+ protected CubeJoinedFlatTableEnrich intermediateTableDesc;
+ protected MeasureIngester<?>[] aggrIngesters;
+ protected Map<TblColRef, Dictionary<String>> dictionaryMap;
+ protected AbstractRowKeyEncoder rowKeyEncoder;
+ protected BufferedMeasureCodec measureCodec;
+
+ protected KylinConfig kylinConfig;
+
+ public BaseCuboidBuilder(KylinConfig kylinConfig, CubeDesc cubeDesc, CubeSegment cubeSegment, CubeJoinedFlatTableEnrich intermediateTableDesc,
+ AbstractRowKeyEncoder rowKeyEncoder, MeasureIngester<?>[] aggrIngesters, Map<TblColRef, Dictionary<String>> dictionaryMap) {
+ this.kylinConfig = kylinConfig;
+ this.cubeDesc = cubeDesc;
+ this.cubeSegment = cubeSegment;
+ this.intermediateTableDesc = intermediateTableDesc;
+ this.rowKeyEncoder = rowKeyEncoder;
+ this.aggrIngesters = aggrIngesters;
+ this.dictionaryMap = dictionaryMap;
+
+ init();
+ measureCodec = new BufferedMeasureCodec(cubeDesc.getMeasures());
+ }
+
+ public BaseCuboidBuilder(KylinConfig kylinConfig, CubeDesc cubeDesc, CubeSegment cubeSegment, CubeJoinedFlatTableEnrich intermediateTableDesc) {
+ this.kylinConfig = kylinConfig;
+ this.cubeDesc = cubeDesc;
+ this.cubeSegment = cubeSegment;
+ this.intermediateTableDesc = intermediateTableDesc;
+
+ init();
+ rowKeyEncoder = AbstractRowKeyEncoder.createInstance(cubeSegment, baseCuboid);
+ measureCodec = new BufferedMeasureCodec(cubeDesc.getMeasures());
+ aggrIngesters = MeasureIngester.create(cubeDesc.getMeasures());
+ dictionaryMap = cubeSegment.buildDictionaryMap();
+
+ }
+
+ private void init() {
+ long baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
+ baseCuboid = Cuboid.findById(cubeDesc, baseCuboidId);
+ initNullBytes();
+ }
+
+ private void initNullBytes() {
+ nullStrs = Sets.newHashSet();
+ nullStrs.add(HIVE_NULL);
+ String[] nullStrings = cubeDesc.getNullStrings();
+ if (nullStrings != null) {
+ for (String s : nullStrings) {
+ nullStrs.add(s);
+ }
+ }
+ }
+
+ protected boolean isNull(String v) {
+ return nullStrs.contains(v);
+ }
+
+ public byte[] buildKey(String[] flatRow) {
+ int[] rowKeyColumnIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
+ List<TblColRef> columns = baseCuboid.getColumns();
+ String[] colValues = new String[columns.size()];
+ for (int i = 0; i < columns.size(); i++) {
+ colValues[i] = getCell(rowKeyColumnIndexes[i], flatRow);
+ }
+ return rowKeyEncoder.encode(colValues);
+ }
+
+ public ByteBuffer buildValue(String[] flatRow) {
+ return measureCodec.encode(buildValueObjects(flatRow));
+ }
+
+ public Object[] buildValueObjects(String[] flatRow) {
+ Object[] measures = new Object[cubeDesc.getMeasures().size()];
+ for (int i = 0; i < measures.length; i++) {
+ measures[i] = buildValueOf(i, flatRow);
+ }
+
+ return measures;
+ }
+
+ public void resetAggrs() {
+ for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
+ aggrIngesters[i].reset();
+ }
+ }
+
+ private Object buildValueOf(int idxOfMeasure, String[] flatRow) {
+ MeasureDesc measure = cubeDesc.getMeasures().get(idxOfMeasure);
+ FunctionDesc function = measure.getFunction();
+ int[] colIdxOnFlatTable = intermediateTableDesc.getMeasureColumnIndexes()[idxOfMeasure];
+
+ int paramCount = function.getParameterCount();
+ String[] inputToMeasure = new String[paramCount];
+
+ // pick up parameter values
+ ParameterDesc param = function.getParameter();
+ int colParamIdx = 0; // index among parameters of column type
+ for (int i = 0; i < paramCount; i++, param = param.getNextParameter()) {
+ String value;
+ if (function.isCount()) {
+ value = "1";
+ } else if (param.isColumnType()) {
+ value = getCell(colIdxOnFlatTable[colParamIdx++], flatRow);
+ } else {
+ value = param.getValue();
+ }
+ inputToMeasure[i] = value;
+ }
+
+ return aggrIngesters[idxOfMeasure].valueOf(inputToMeasure, measure, dictionaryMap);
+ }
+
+ private String getCell(int i, String[] flatRow) {
+ if (isNull(flatRow[i]))
+ return null;
+ else
+ return flatRow[i];
+ }
+
+}
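
The new BaseCuboidBuilder factors the base-cuboid encoding out of the MR mapper so the Spark engine can reuse it. A minimal usage sketch, assuming cubeDesc/cubeSegment/flatDesc have already been loaded via CubeManager and EngineFactory (as BaseCuboidMapperBase.setup() does further down); the helper name is illustrative, not part of this commit:

    // Sketch: encode one flat-table row into a (rowkey, measures) pair.
    static void encodeOneRow(BaseCuboidBuilder builder, String[] flatRow) {
        byte[] rowKey = builder.buildKey(flatRow);               // dimension cells -> encoded rowkey
        java.nio.ByteBuffer value = builder.buildValue(flatRow); // measure inputs -> encoded bytes
        // the caller writes (rowKey, value) out, e.g. context.write(...) in MR
        // or a Tuple2 in Spark
    }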
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
index ffba181..4011915 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
@@ -262,6 +262,11 @@ public class CubeStatsReader {
return ret;
}
+ public List<Long> getCuboidsByLayer(int level) {
+ List<List<Long>> layeredCuboids = cuboidScheduler.getCuboidsByLayer();
+ return layeredCuboids.get(level);
+ }
+
private void printCuboidInfoTreeEntry(Map<Long, Long> cuboidRows, Map<Long, Double> cuboidSizes, PrintWriter out) {
long baseCuboid = Cuboid.getBaseCuboidId(seg.getCubeDesc());
int dimensionCount = Long.bitCount(baseCuboid);
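
getCuboidsByLayer() exposes the scheduler's level-by-level view of the cuboid tree: layer 0 is the base cuboid, and each subsequent layer holds the children spawned from the layer above. The Spark job below relies on it for its per-level sanity check. A hedged illustration, assuming a statsReader instance and a totalLevels count are at hand:

    for (int level = 0; level < totalLevels; level++) {
        List<Long> cuboids = statsReader.getCuboidsByLayer(level);
        logger.info("layer " + level + " holds " + cuboids.size() + " cuboids");
    }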
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/NDCuboidBuilder.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/NDCuboidBuilder.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/NDCuboidBuilder.java
new file mode 100644
index 0000000..4e98618
--- /dev/null
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/NDCuboidBuilder.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.mr.common;
+
+import org.apache.kylin.common.util.ByteArray;
+import org.apache.kylin.common.util.Pair;
+import org.apache.kylin.common.util.SplittedBytes;
+import org.apache.kylin.cube.CubeSegment;
+import org.apache.kylin.cube.common.RowKeySplitter;
+import org.apache.kylin.cube.cuboid.Cuboid;
+import org.apache.kylin.cube.kv.RowConstants;
+import org.apache.kylin.cube.kv.RowKeyEncoder;
+import org.apache.kylin.cube.kv.RowKeyEncoderProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Serializable;
+
+/**
+ */
+public class NDCuboidBuilder implements Serializable {
+
+ protected static final Logger logger = LoggerFactory.getLogger(NDCuboidBuilder.class);
+ protected String cubeName;
+ protected String segmentID;
+ protected CubeSegment cubeSegment;
+ private RowKeySplitter rowKeySplitter;
+ private RowKeyEncoderProvider rowKeyEncoderProvider;
+ private byte[] newKeyBodyBuf = new byte[RowConstants.ROWKEY_BUFFER_SIZE];
+ private ByteArray newKeyBuf = ByteArray.allocate(RowConstants.ROWKEY_BUFFER_SIZE);
+
+ public NDCuboidBuilder(CubeSegment cubeSegment) {
+ this.cubeSegment = cubeSegment;
+ this.rowKeySplitter = new RowKeySplitter(cubeSegment, 65, 256);
+ this.rowKeyEncoderProvider = new RowKeyEncoderProvider(cubeSegment);
+ }
+
+ public NDCuboidBuilder(CubeSegment cubeSegment, RowKeyEncoderProvider rowKeyEncoderProvider) {
+ this.cubeSegment = cubeSegment;
+ this.rowKeyEncoderProvider = rowKeyEncoderProvider;
+ this.rowKeySplitter = new RowKeySplitter(cubeSegment, 65, 256);
+ }
+
+
+ public Pair<Integer, ByteArray> buildKey(Cuboid parentCuboid, Cuboid childCuboid, SplittedBytes[] splitBuffers) {
+ RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(childCuboid);
+
+ int offset = 0;
+
+ // rowkey columns
+ long mask = Long.highestOneBit(parentCuboid.getId());
+ long parentCuboidId = parentCuboid.getId();
+ long childCuboidId = childCuboid.getId();
+ long parentCuboidIdActualLength = Long.SIZE - Long.numberOfLeadingZeros(parentCuboid.getId());
+ int index = rowKeySplitter.getBodySplitOffset(); // skip shard and cuboidId
+ for (int i = 0; i < parentCuboidIdActualLength; i++) {
+ if ((mask & parentCuboidId) > 0) { // if this bit position equals 1
+ if ((mask & childCuboidId) > 0) { // if the child cuboid has this column
+ System.arraycopy(splitBuffers[index].value, 0, newKeyBodyBuf, offset, splitBuffers[index].length);
+ offset += splitBuffers[index].length;
+ }
+ index++;
+ }
+ mask = mask >> 1;
+ }
+
+ int fullKeySize = rowkeyEncoder.getBytesLength();
+ while (newKeyBuf.array().length < fullKeySize) {
+ newKeyBuf.set(new byte[newKeyBuf.length() * 2]);
+ }
+ newKeyBuf.set(0, fullKeySize);
+
+ rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, offset), newKeyBuf);
+
+ return new Pair<>(Integer.valueOf(fullKeySize), newKeyBuf);
+ }
+
+}
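
buildKey() walks the parent cuboid id from its highest bit down: a column is copied into the child key only when both parent and child have that bit set, and the read index advances on every parent bit regardless. A self-contained toy version over strings, useful for verifying the selection logic (not part of this commit):

    // parentId=0b1101 with columns {A,B,C}; childId=0b0101 selects {B,C}
    static java.util.List<String> selectChildColumns(long parentId, long childId, String[] parentCols) {
        java.util.List<String> out = new java.util.ArrayList<>();
        long mask = Long.highestOneBit(parentId);
        int index = 0;                        // position among the parent's columns
        while (mask > 0) {
            if ((mask & parentId) > 0) {      // parent materializes this dimension
                if ((mask & childId) > 0) {   // child keeps it: copy the cell
                    out.add(parentCols[index]);
                }
                index++;                      // consume one parent column either way
            }
            mask >>= 1;
        }
        return out;
    }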
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/BaseCuboidMapperBase.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/BaseCuboidMapperBase.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/BaseCuboidMapperBase.java
index 7b719e0..d08e29a 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/BaseCuboidMapperBase.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/BaseCuboidMapperBase.java
@@ -18,38 +18,25 @@
package org.apache.kylin.engine.mr.steps;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
import org.apache.hadoop.io.Text;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
-import org.apache.kylin.cube.cuboid.Cuboid;
-import org.apache.kylin.cube.kv.AbstractRowKeyEncoder;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.CubeJoinedFlatTableEnrich;
import org.apache.kylin.engine.EngineFactory;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
+import org.apache.kylin.engine.mr.common.BaseCuboidBuilder;
import org.apache.kylin.engine.mr.common.BatchConstants;
-import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.MeasureIngester;
-import org.apache.kylin.metadata.model.FunctionDesc;
-import org.apache.kylin.metadata.model.MeasureDesc;
-import org.apache.kylin.metadata.model.ParameterDesc;
-import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.collect.Sets;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
/**
*/
@@ -59,131 +46,37 @@ abstract public class BaseCuboidMapperBase<KEYIN, VALUEIN> extends KylinMapper<K
public static final byte[] ONE = Bytes.toBytes("1");
protected String cubeName;
protected String segmentID;
- protected Cuboid baseCuboid;
protected CubeInstance cube;
protected CubeDesc cubeDesc;
protected CubeSegment cubeSegment;
- protected Set<String> nullStrs;
- protected CubeJoinedFlatTableEnrich intermediateTableDesc;
- protected String intermediateTableRowDelimiter;
- protected byte byteRowDelimiter;
protected int counter;
- protected MeasureIngester<?>[] aggrIngesters;
- protected Map<TblColRef, Dictionary<String>> dictionaryMap;
protected Object[] measures;
- protected AbstractRowKeyEncoder rowKeyEncoder;
- protected BufferedMeasureCodec measureCodec;
private int errorRecordCounter;
protected Text outputKey = new Text();
protected Text outputValue = new Text();
+ private BaseCuboidBuilder baseCuboidBuilder;
+
@Override
protected void setup(Context context) throws IOException {
super.bindCurrentConfiguration(context.getConfiguration());
-
cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase();
segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);
- intermediateTableRowDelimiter = context.getConfiguration().get(BatchConstants.CFG_CUBE_INTERMEDIATE_TABLE_ROW_DELIMITER, Character.toString(BatchConstants.INTERMEDIATE_TABLE_ROW_DELIMITER));
- if (Bytes.toBytes(intermediateTableRowDelimiter).length > 1) {
- throw new RuntimeException("Expected delimiter byte length is 1, but got " + Bytes.toBytes(intermediateTableRowDelimiter).length);
- }
-
- byteRowDelimiter = Bytes.toBytes(intermediateTableRowDelimiter)[0];
-
- KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
-
- cube = CubeManager.getInstance(config).getCube(cubeName);
+ final KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
+ cube = CubeManager.getInstance(kylinConfig).getCube(cubeName);
cubeDesc = cube.getDescriptor();
cubeSegment = cube.getSegmentById(segmentID);
+ CubeJoinedFlatTableEnrich intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
+ baseCuboidBuilder = new BaseCuboidBuilder(kylinConfig, cubeDesc, cubeSegment, intermediateTableDesc);
- long baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
- baseCuboid = Cuboid.findById(cubeDesc, baseCuboidId);
-
- intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
-
- rowKeyEncoder = AbstractRowKeyEncoder.createInstance(cubeSegment, baseCuboid);
-
- measureCodec = new BufferedMeasureCodec(cubeDesc.getMeasures());
- measures = new Object[cubeDesc.getMeasures().size()];
-
- aggrIngesters = MeasureIngester.create(cubeDesc.getMeasures());
- dictionaryMap = cubeSegment.buildDictionaryMap();
-
- initNullBytes();
- }
-
- private void initNullBytes() {
- nullStrs = Sets.newHashSet();
- nullStrs.add(HIVE_NULL);
- String[] nullStrings = cubeDesc.getNullStrings();
- if (nullStrings != null) {
- for (String s : nullStrings) {
- nullStrs.add(s);
- }
- }
- }
-
- protected boolean isNull(String v) {
- return nullStrs.contains(v);
- }
-
- protected byte[] buildKey(String[] flatRow) {
- int[] rowKeyColumnIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
- List<TblColRef> columns = baseCuboid.getColumns();
- String[] colValues = new String[columns.size()];
- for (int i = 0; i < columns.size(); i++) {
- colValues[i] = getCell(rowKeyColumnIndexes[i], flatRow);
- }
- return rowKeyEncoder.encode(colValues);
}
- private ByteBuffer buildValue(String[] flatRow) {
-
- for (int i = 0; i < measures.length; i++) {
- measures[i] = buildValueOf(i, flatRow);
- }
-
- return measureCodec.encode(measures);
- }
-
- private Object buildValueOf(int idxOfMeasure, String[] flatRow) {
- MeasureDesc measure = cubeDesc.getMeasures().get(idxOfMeasure);
- FunctionDesc function = measure.getFunction();
- int[] colIdxOnFlatTable = intermediateTableDesc.getMeasureColumnIndexes()[idxOfMeasure];
-
- int paramCount = function.getParameterCount();
- String[] inputToMeasure = new String[paramCount];
-
- // pick up parameter values
- ParameterDesc param = function.getParameter();
- int colParamIdx = 0; // index among parameters of column type
- for (int i = 0; i < paramCount; i++, param = param.getNextParameter()) {
- String value;
- if (function.isCount()) {
- value = "1";
- } else if (param.isColumnType()) {
- value = getCell(colIdxOnFlatTable[colParamIdx++], flatRow);
- } else {
- value = param.getValue();
- }
- inputToMeasure[i] = value;
- }
-
- return aggrIngesters[idxOfMeasure].valueOf(inputToMeasure, measure, dictionaryMap);
- }
-
- private String getCell(int i, String[] flatRow) {
- if (isNull(flatRow[i]))
- return null;
- else
- return flatRow[i];
- }
protected void outputKV(String[] flatRow, Context context) throws IOException, InterruptedException {
- byte[] rowKey = buildKey(flatRow);
+ byte[] rowKey = baseCuboidBuilder.buildKey(flatRow);
outputKey.set(rowKey, 0, rowKey.length);
- ByteBuffer valueBuf = buildValue(flatRow);
+ ByteBuffer valueBuf = baseCuboidBuilder.buildValue(flatRow);
outputValue.set(valueBuf.array(), 0, valueBuf.position());
context.write(outputKey, outputValue);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/NDCuboidMapper.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/NDCuboidMapper.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/NDCuboidMapper.java
index 01cdd4a..b924edc 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/NDCuboidMapper.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/NDCuboidMapper.java
@@ -18,29 +18,27 @@
package org.apache.kylin.engine.mr.steps;
-import java.io.IOException;
-import java.util.Collection;
-
import org.apache.hadoop.io.Text;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.ByteArray;
-import org.apache.kylin.common.util.SplittedBytes;
+import org.apache.kylin.common.util.Pair;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.common.RowKeySplitter;
import org.apache.kylin.cube.cuboid.Cuboid;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
-import org.apache.kylin.cube.kv.RowConstants;
-import org.apache.kylin.cube.kv.RowKeyEncoder;
-import org.apache.kylin.cube.kv.RowKeyEncoderProvider;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
+import org.apache.kylin.engine.mr.common.NDCuboidBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.Collection;
+
/**
* @author George Song (ysong1)
*
@@ -59,10 +57,9 @@ public class NDCuboidMapper extends KylinMapper<Text, Text, Text, Text> {
private int handleCounter;
private int skipCounter;
- private byte[] newKeyBodyBuf = new byte[RowConstants.ROWKEY_BUFFER_SIZE];
- private ByteArray newKeyBuf = ByteArray.allocate(RowConstants.ROWKEY_BUFFER_SIZE);
private RowKeySplitter rowKeySplitter;
- private RowKeyEncoderProvider rowKeyEncoderProvider;
+
+ private NDCuboidBuilder ndCuboidBuilder;
@Override
protected void setup(Context context) throws IOException {
@@ -76,48 +73,13 @@ public class NDCuboidMapper extends KylinMapper<Text, Text, Text, Text> {
CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
cubeSegment = cube.getSegmentById(segmentID);
cubeDesc = cube.getDescriptor();
-
+ ndCuboidBuilder = new NDCuboidBuilder(cubeSegment);
// initialize CubiodScheduler
cuboidScheduler = new CuboidScheduler(cubeDesc);
-
rowKeySplitter = new RowKeySplitter(cubeSegment, 65, 256);
- rowKeyEncoderProvider = new RowKeyEncoderProvider(cubeSegment);
}
- private int buildKey(Cuboid parentCuboid, Cuboid childCuboid, SplittedBytes[] splitBuffers) {
- RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(childCuboid);
-
- int offset = 0;
-
- // rowkey columns
- long mask = Long.highestOneBit(parentCuboid.getId());
- long parentCuboidId = parentCuboid.getId();
- long childCuboidId = childCuboid.getId();
- long parentCuboidIdActualLength = Long.SIZE - Long.numberOfLeadingZeros(parentCuboid.getId());
- int index = rowKeySplitter.getBodySplitOffset(); // skip shard and cuboidId
- for (int i = 0; i < parentCuboidIdActualLength; i++) {
- if ((mask & parentCuboidId) > 0) { // if this bit position equals 1
- if ((mask & childCuboidId) > 0) { // if the child cuboid has this column
- System.arraycopy(splitBuffers[index].value, 0, newKeyBodyBuf, offset, splitBuffers[index].length);
- offset += splitBuffers[index].length;
- }
- index++;
- }
- mask = mask >> 1;
- }
- int fullKeySize = rowkeyEncoder.getBytesLength();
- while (newKeyBuf.array().length < fullKeySize) {
- newKeyBuf.set(new byte[newKeyBuf.length() * 2]);
- }
- newKeyBuf.set(0, fullKeySize);
-
- rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, offset), newKeyBuf);
-
- return fullKeySize;
- }
@Override
public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
@@ -143,8 +105,8 @@ public class NDCuboidMapper extends KylinMapper<Text, Text, Text, Text> {
for (Long child : myChildren) {
Cuboid childCuboid = Cuboid.findById(cubeDesc, child);
- int fullKeySize = buildKey(parentCuboid, childCuboid, rowKeySplitter.getSplitBuffers());
- outputKey.set(newKeyBuf.array(), 0, fullKeySize);
+ Pair<Integer, ByteArray> result = ndCuboidBuilder.buildKey(parentCuboid, childCuboid, rowKeySplitter.getSplitBuffers());
+ outputKey.set(result.getSecond().array(), 0, result.getFirst());
context.write(outputKey, value);
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeReducerTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeReducerTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeReducerTest.java
index 97dd750..29ca9b8 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeReducerTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeReducerTest.java
@@ -41,7 +41,6 @@ import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureAggregator;
import org.apache.kylin.measure.MeasureIngester;
import org.apache.kylin.measure.MeasureType;
-import org.apache.kylin.metadata.datatype.LongMutable;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.junit.After;
@@ -161,7 +160,7 @@ public class CubeReducerTest extends LocalFileMetadataTestCase {
}
private Text newValueText(BufferedMeasureCodec codec, String sum, String min, String max, int count, int item_count) {
- Object[] values = new Object[] { new BigDecimal(sum), new BigDecimal(min), new BigDecimal(max), new LongMutable(count), new LongMutable(item_count) };
+ Object[] values = new Object[] { new BigDecimal(sum), new BigDecimal(min), new BigDecimal(max), new Long(count), new Long(item_count) };
ByteBuffer buf = codec.encode(values);
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/pom.xml
----------------------------------------------------------------------
diff --git a/engine-spark/pom.xml b/engine-spark/pom.xml
index 4921dca..be2cdf6 100644
--- a/engine-spark/pom.xml
+++ b/engine-spark/pom.xml
@@ -43,6 +43,11 @@
<artifactId>kylin-core-job</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.kylin</groupId>
+ <artifactId>kylin-engine-mr</artifactId>
+ </dependency>
+
<!-- Spark dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
@@ -111,6 +116,10 @@
<artifactId>maven-model</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
</dependencies>
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingEngine2.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingEngine2.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingEngine2.java
new file mode 100644
index 0000000..a7a4151
--- /dev/null
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingEngine2.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.spark;
+
+import org.apache.kylin.cube.CubeSegment;
+import org.apache.kylin.engine.mr.MRBatchCubingEngine2;
+import org.apache.kylin.job.execution.DefaultChainedExecutable;
+
+/**
+ */
+public class SparkBatchCubingEngine2 extends MRBatchCubingEngine2 {
+ @Override
+ public DefaultChainedExecutable createBatchCubingJob(CubeSegment newSegment, String submitter) {
+ return new SparkBatchCubingJobBuilder2(newSegment, submitter).build();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingJobBuilder2.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingJobBuilder2.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingJobBuilder2.java
new file mode 100644
index 0000000..9532d31
--- /dev/null
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkBatchCubingJobBuilder2.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.engine.spark;
+
+import org.apache.hadoop.util.ClassUtil;
+import org.apache.kylin.common.util.StringUtil;
+import org.apache.kylin.cube.CubeSegment;
+import org.apache.kylin.engine.EngineFactory;
+import org.apache.kylin.engine.mr.BatchCubingJobBuilder2;
+import org.apache.kylin.engine.mr.CubingJob;
+import org.apache.kylin.job.constant.ExecutableConstants;
+import org.apache.kylin.job.execution.AbstractExecutable;
+import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ */
+public class SparkBatchCubingJobBuilder2 extends BatchCubingJobBuilder2 {
+
+ private static final Logger logger = LoggerFactory.getLogger(SparkBatchCubingJobBuilder2.class);
+
+ public SparkBatchCubingJobBuilder2(CubeSegment newSegment, String submitter) {
+ super(newSegment, submitter);
+ }
+
+ protected void addLayerCubingSteps(final CubingJob result, final String jobId, final String cuboidRootPath) {
+
+ }
+
+ @Override
+ protected AbstractExecutable createInMemCubingStep(String jobId, String cuboidRootPath) {
+ IJoinedFlatTableDesc flatTableDesc = EngineFactory.getJoinedFlatTableDesc(seg);
+ final SparkExecutable sparkExecutable = new SparkExecutable();
+ sparkExecutable.setClassName(SparkCubingByLayer.class.getName());
+ sparkExecutable.setParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt(), seg.getRealization().getName());
+ sparkExecutable.setParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt(), seg.getUuid());
+ sparkExecutable.setParam(SparkCubingByLayer.OPTION_INPUT_PATH.getOpt(), flatTableDesc.getTableName());
+ sparkExecutable.setParam(SparkCubingByLayer.OPTION_CONF_PATH.getOpt(), "/Users/shishaofeng/workspace/kylin-15/examples/test_case_data/sandbox/"); //FIXME
+ sparkExecutable.setParam(SparkCubingByLayer.OPTION_OUTPUT_PATH.getOpt(), cuboidRootPath);
+
+ StringBuilder jars = new StringBuilder();
+
+ StringUtil.appendWithSeparator(jars, findJar("org.htrace.HTraceConfiguration")); // htrace-core.jar
+ StringUtil.appendWithSeparator(jars, findJar("org.cloudera.htrace.HTraceConfiguration"));
+ StringUtil.appendWithSeparator(jars, findJar("org.apache.hadoop.hbase.client.HConnection")); // hbase-client.jar
+ StringUtil.appendWithSeparator(jars, findJar("org.apache.hadoop.hbase.HBaseConfiguration")); // hbase-common.jar
+ StringUtil.appendWithSeparator(jars, findJar("org.apache.hadoop.hbase.util.ByteStringer")); // hbase-protocol.jar
+
+ StringUtil.appendWithSeparator(jars, seg.getConfig().getSparkAdditionalJars());
+ sparkExecutable.setJars(jars.toString());
+ // sparkExecutable.setJars("/Users/shishaofeng/.m2/repository/org/cloudera/htrace/htrace-core/2.01/htrace-core-2.01.jar,/Users/shishaofeng/.m2/repository/org/apache/hbase/hbase-protocol/0.98.8-hadoop2/hbase-protocol-0.98.8-hadoop2.jar,/Users/shishaofeng/.m2/repository/org/apache/hbase/hbase-common/0.98.8-hadoop2/hbase-common-0.98.8-hadoop2.jar,/Users/shishaofeng/.m2//repository/org/apache/hbase/hbase-client/0.98.8-hadoop2/hbase-client-0.98.8-hadoop2.jar");
+
+ sparkExecutable.setName(ExecutableConstants.STEP_NAME_BUILD_IN_MEM_CUBE + " with Spark");
+ return sparkExecutable;
+
+ }
+
+ private String findJar(String className) {
+ try {
+ return ClassUtil.findContainingJar(Class.forName(className));
+ } catch (ClassNotFoundException e) {
+ logger.error("failed to locate jar for class " + className, e);
+ }
+
+ return "";
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
index 4ec5048..0437a80 100644
--- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
@@ -84,6 +84,7 @@ import org.apache.kylin.engine.spark.util.IteratorUtils;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureAggregators;
import org.apache.kylin.measure.hllc.HLLCounter;
+import org.apache.kylin.measure.MeasureIngester;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
@@ -153,6 +154,20 @@ public class SparkCubing extends AbstractApplication {
return options;
}
+ public static KylinConfig loadKylinPropsAndMetadata(String folder) throws IOException {
+ File metaDir = new File(folder);
+ if (!metaDir.getAbsolutePath().equals(System.getProperty(KylinConfig.KYLIN_CONF))) {
+ System.setProperty(KylinConfig.KYLIN_CONF, metaDir.getAbsolutePath());
+ logger.info("The absolute path for meta dir is " + metaDir.getAbsolutePath());
+ KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
+ System.out.println("setting metadataUrl to " + metaDir.getAbsolutePath());
+ kylinConfig.setMetadataUrl(metaDir.getAbsolutePath());
+ return kylinConfig;
+ } else {
+ return KylinConfig.getInstanceFromEnv();
+ }
+ }
+
private void setupClasspath(JavaSparkContext sc, String confPath) throws Exception {
ClassUtil.addClasspath(confPath);
final File[] files = new File(confPath).listFiles(new FileFilter() {
@@ -466,7 +481,7 @@ public class SparkCubing extends AbstractApplication {
}).saveAsNewAPIHadoopFile(hFileLocation, ImmutableBytesWritable.class, KeyValue.class, HFileOutputFormat.class, conf);
}
- private static void prepare() throws Exception {
+ public static void prepare() throws Exception {
final File file = new File(SparkFiles.get("kylin.properties"));
final String confPath = file.getParentFile().getAbsolutePath();
System.out.println("conf directory:" + confPath);
@@ -530,12 +545,18 @@ public class SparkCubing extends AbstractApplication {
}
}
- private Collection<String> getKyroClasses() {
+ public static Collection<String> getKyroClasses() {
Set<Class> kyroClasses = Sets.newHashSet();
kyroClasses.addAll(new Reflections("org.apache.kylin").getSubTypesOf(Serializable.class));
+ kyroClasses.addAll(new Reflections("org.apache.kylin.dimension").getSubTypesOf(Serializable.class));
+ kyroClasses.addAll(new Reflections("org.apache.kylin.cube").getSubTypesOf(Serializable.class));
kyroClasses.addAll(new Reflections("org.apache.kylin.cube.model").getSubTypesOf(Object.class));
+ kyroClasses.addAll(new Reflections("org.apache.kylin.metadata").getSubTypesOf(Object.class));
kyroClasses.addAll(new Reflections("org.apache.kylin.metadata.model").getSubTypesOf(Object.class));
kyroClasses.addAll(new Reflections("org.apache.kylin.metadata.measure").getSubTypesOf(Object.class));
+ kyroClasses.addAll(new Reflections("org.apache.kylin.metadata.datatype").getSubTypesOf(org.apache.kylin.common.util.BytesSerializer.class));
+ kyroClasses.addAll(new Reflections("org.apache.kylin.measure").getSubTypesOf(MeasureIngester.class));
+
kyroClasses.add(HashMap.class);
kyroClasses.add(org.apache.spark.sql.Row[].class);
kyroClasses.add(org.apache.spark.sql.Row.class);
@@ -545,11 +566,15 @@ public class SparkCubing extends AbstractApplication {
kyroClasses.add(org.apache.spark.sql.types.StructField.class);
kyroClasses.add(org.apache.spark.sql.types.DateType$.class);
kyroClasses.add(org.apache.spark.sql.types.Metadata.class);
- kyroClasses.add(Object[].class);
kyroClasses.add(org.apache.spark.sql.types.StringType$.class);
kyroClasses.add(Hashing.murmur3_128().getClass());
- kyroClasses.add(org.apache.spark.sql.columnar.CachedBatch.class);
+ kyroClasses.add(org.apache.spark.sql.execution.columnar.CachedBatch.class);
+ kyroClasses.add(Object[].class);
+ kyroClasses.add(int[].class);
+ kyroClasses.add(byte[].class);
kyroClasses.add(byte[][].class);
+ kyroClasses.add(String[].class);
+ kyroClasses.add(String[][].class);
kyroClasses.add(org.apache.spark.sql.types.Decimal.class);
kyroClasses.add(scala.math.BigDecimal.class);
kyroClasses.add(java.math.BigDecimal.class);
@@ -557,6 +582,61 @@ public class SparkCubing extends AbstractApplication {
kyroClasses.add(java.math.RoundingMode.class);
kyroClasses.add(java.util.ArrayList.class);
kyroClasses.add(java.util.LinkedList.class);
+ kyroClasses.add(java.util.HashSet.class);
+ kyroClasses.add(java.util.LinkedHashSet.class);
+ kyroClasses.add(java.util.LinkedHashMap.class);
+ kyroClasses.add(java.util.TreeMap.class);
+ kyroClasses.add(java.util.concurrent.ConcurrentHashMap.class);
+
+ kyroClasses.add(java.util.HashMap.class);
+ kyroClasses.add(java.util.Properties.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.ColumnDesc[].class);
+ kyroClasses.add(org.apache.kylin.metadata.model.JoinTableDesc[].class);
+ kyroClasses.add(org.apache.kylin.metadata.model.TblColRef[].class);
+ kyroClasses.add(org.apache.kylin.metadata.model.DataModelDesc.RealizationCapacity.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.DataModelDesc.TableKind.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.PartitionDesc.DefaultPartitionConditionBuilder.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.PartitionDesc.PartitionType.class);
+ kyroClasses.add(org.apache.kylin.cube.model.CubeDesc.DeriveInfo.class);
+ kyroClasses.add(org.apache.kylin.cube.model.CubeDesc.DeriveType.class);
+ kyroClasses.add(org.apache.kylin.cube.model.HBaseColumnFamilyDesc[].class);
+ kyroClasses.add(org.apache.kylin.cube.model.HBaseColumnDesc[].class);
+ kyroClasses.add(org.apache.kylin.metadata.model.MeasureDesc[].class);
+ kyroClasses.add(org.apache.kylin.cube.model.RowKeyColDesc[].class);
+ kyroClasses.add(org.apache.kylin.common.util.Array.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.Segments.class);
+ kyroClasses.add(org.apache.kylin.metadata.realization.RealizationStatusEnum.class);
+ kyroClasses.add(org.apache.kylin.metadata.model.SegmentStatusEnum.class);
+ kyroClasses.add(org.apache.kylin.measure.BufferedMeasureCodec.class);
+ kyroClasses.add(org.apache.kylin.cube.kv.RowKeyColumnIO.class);
+ kyroClasses.add(org.apache.kylin.measure.MeasureCodec.class);
+ kyroClasses.add(org.apache.kylin.measure.MeasureAggregator[].class);
+ kyroClasses.add(org.apache.kylin.metadata.datatype.DataTypeSerializer[].class);
+ kyroClasses.add(org.apache.kylin.cube.kv.CubeDimEncMap.class);
+ kyroClasses.add(org.apache.kylin.measure.basic.BasicMeasureType.class);
+ kyroClasses.add(org.apache.kylin.common.util.SplittedBytes[].class);
+ kyroClasses.add(org.apache.kylin.common.util.SplittedBytes.class);
+ kyroClasses.add(org.apache.kylin.cube.kv.RowKeyEncoderProvider.class);
+ kyroClasses.add(org.apache.kylin.cube.kv.RowKeyEncoder.class);
+ kyroClasses.add(org.apache.kylin.measure.basic.BigDecimalIngester.class);
+ kyroClasses.add(org.apache.kylin.dimension.DictionaryDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.IntDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.BooleanDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.DateDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.FixedLenDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.FixedLenHexDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.IntegerDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.OneMoreByteVLongDimEnc.class);
+ kyroClasses.add(org.apache.kylin.dimension.TimeDimEnc.class);
+ kyroClasses.add(org.apache.kylin.cube.model.AggregationGroup.HierarchyMask.class);
+ kyroClasses.add(org.apache.kylin.measure.topn.DoubleDeltaSerializer.class);
+ kyroClasses.add(org.apache.kylin.measure.topn.Counter.class);
+
+ try {
+ kyroClasses.add(Class.forName("com.google.common.collect.EmptyImmutableList"));
+ } catch (ClassNotFoundException e) {
+ logger.error("failed to load class", e);
+ }
ArrayList<String> result = Lists.newArrayList();
for (Class kyroClass : kyroClasses) {
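
Because SparkCubingByLayer (below) sets spark.kryo.registrationRequired=true, Kryo fails fast on any unregistered class, which is why this list has to be exhaustive down to array and JDK collection types. The same mechanism on a bare SparkConf, as a sketch:

    // registerKryoClasses appends to spark.kryo.classesToRegister under the hood
    static SparkConf kryoConf(Class<?>[] classes) {
        return new SparkConf()
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.kryo.registrationRequired", "true") // unregistered classes now throw
                .registerKryoClasses(classes);
    }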
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubingByLayer.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubingByLayer.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubingByLayer.java
new file mode 100644
index 0000000..53c1f96
--- /dev/null
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubingByLayer.java
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.engine.spark;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.kylin.common.KylinConfig;
+import org.apache.kylin.common.util.AbstractApplication;
+import org.apache.kylin.common.util.ByteArray;
+import org.apache.kylin.common.util.ClassUtil;
+import org.apache.kylin.common.util.OptionsHelper;
+import org.apache.kylin.common.util.Pair;
+import org.apache.kylin.cube.CubeInstance;
+import org.apache.kylin.cube.CubeManager;
+import org.apache.kylin.cube.CubeSegment;
+import org.apache.kylin.cube.common.RowKeySplitter;
+import org.apache.kylin.cube.cuboid.Cuboid;
+import org.apache.kylin.cube.cuboid.CuboidScheduler;
+import org.apache.kylin.cube.kv.AbstractRowKeyEncoder;
+import org.apache.kylin.cube.kv.RowKeyEncoderProvider;
+import org.apache.kylin.cube.model.CubeDesc;
+import org.apache.kylin.cube.model.CubeJoinedFlatTableEnrich;
+import org.apache.kylin.engine.EngineFactory;
+import org.apache.kylin.engine.mr.BatchCubingJobBuilder2;
+import org.apache.kylin.engine.mr.HadoopUtil;
+import org.apache.kylin.engine.mr.common.BaseCuboidBuilder;
+import org.apache.kylin.engine.mr.common.BatchConstants;
+import org.apache.kylin.engine.mr.common.CubeStatsReader;
+import org.apache.kylin.engine.mr.common.NDCuboidBuilder;
+import org.apache.kylin.measure.BufferedMeasureCodec;
+import org.apache.kylin.measure.MeasureAggregators;
+import org.apache.kylin.measure.MeasureIngester;
+import org.apache.kylin.metadata.model.MeasureDesc;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkFiles;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.PairFlatMapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.broadcast.Broadcast;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.hive.HiveContext;
+import org.apache.spark.storage.StorageLevel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+
+import javax.annotation.Nullable;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.List;
+
+import static org.apache.kylin.engine.spark.SparkCubing.getKyroClasses;
+
+/**
+ */
+public class SparkCubingByLayer extends AbstractApplication implements Serializable {
+
+ protected static final Logger logger = LoggerFactory.getLogger(SparkCubingByLayer.class);
+
+ public static final Option OPTION_INPUT_PATH = OptionBuilder.withArgName("path").hasArg().isRequired(true).withDescription("Hive Intermediate Table").create("hiveTable");
+ public static final Option OPTION_CUBE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_CUBE_NAME).hasArg().isRequired(true).withDescription("Cube Name").create(BatchConstants.ARG_CUBE_NAME);
+ public static final Option OPTION_SEGMENT_ID = OptionBuilder.withArgName("segment").hasArg().isRequired(true).withDescription("Cube Segment Id").create("segmentId");
+ public static final Option OPTION_CONF_PATH = OptionBuilder.withArgName("confPath").hasArg().isRequired(true).withDescription("Configuration Path").create("confPath");
+ public static final Option OPTION_OUTPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_OUTPUT).hasArg().isRequired(true).withDescription("Cube output path").create(BatchConstants.ARG_OUTPUT);
+
+ private Options options;
+
+ public SparkCubingByLayer() {
+ options = new Options();
+ options.addOption(OPTION_INPUT_PATH);
+ options.addOption(OPTION_CUBE_NAME);
+ options.addOption(OPTION_SEGMENT_ID);
+ options.addOption(OPTION_CONF_PATH);
+ options.addOption(OPTION_OUTPUT_PATH);
+ }
+
+ @Override
+ protected Options getOptions() {
+ return options;
+ }
+
+ private void setupClasspath(JavaSparkContext sc, String confPath) throws Exception {
+ ClassUtil.addClasspath(confPath);
+ final File[] files = new File(confPath).listFiles(new FileFilter() {
+ @Override
+ public boolean accept(File pathname) {
+ if (pathname.getAbsolutePath().endsWith(".xml")) {
+ return true;
+ }
+ if (pathname.getAbsolutePath().endsWith(".properties")) {
+ return true;
+ }
+ return false;
+ }
+ });
+ for (File file : files) {
+ sc.addFile(file.getAbsolutePath());
+ }
+ }
+
+ private static final void prepare() {
+ final File file = new File(SparkFiles.get("kylin.properties"));
+ final String confPath = file.getParentFile().getAbsolutePath();
+ logger.info("conf directory:" + confPath);
+ System.setProperty(KylinConfig.KYLIN_CONF, confPath);
+ ClassUtil.addClasspath(confPath);
+ }
+
+ @Override
+ protected void execute(OptionsHelper optionsHelper) throws Exception {
+ final String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
+ final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
+ final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
+ final String confPath = optionsHelper.getOptionValue(OPTION_CONF_PATH);
+ final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
+
+ SparkConf conf = new SparkConf().setAppName("Cubing Application");
+ //serialization conf
+ conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+ conf.set("spark.kryo.registrationRequired", "true");
+ final Iterable<String> allClasses = Iterables.filter(Iterables.concat(Lists.newArrayList(conf.get("spark.kryo.classesToRegister", "").split(",")), getKyroClasses()), new Predicate<String>() {
+ @Override
+ public boolean apply(@Nullable String input) {
+ return input != null && input.trim().length() > 0;
+ }
+ });
+ conf.set("spark.kryo.classesToRegister", StringUtils.join(allClasses, ","));
+
+ JavaSparkContext sc = new JavaSparkContext(conf);
+ setupClasspath(sc, confPath);
+ HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
+
+ HiveContext sqlContext = new HiveContext(sc.sc());
+ final DataFrame intermediateTable = sqlContext.table(hiveTable);
+
+ System.setProperty(KylinConfig.KYLIN_CONF, confPath);
+ final KylinConfig envConfig = KylinConfig.getInstanceFromEnv();
+ final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
+ final CubeDesc cubeDesc = cubeInstance.getDescriptor();
+ final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
+ final CubeJoinedFlatTableEnrich intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
+
+ final KylinConfig kylinConfig = cubeDesc.getConfig();
+ final Broadcast<CubeDesc> vCubeDesc = sc.broadcast(cubeDesc);
+ final Broadcast<CubeSegment> vCubeSegment = sc.broadcast(cubeSegment);
+ final NDCuboidBuilder ndCuboidBuilder = new NDCuboidBuilder(vCubeSegment.getValue(), new RowKeyEncoderProvider(vCubeSegment.getValue()));
+
+ final Broadcast<CuboidScheduler> vCuboidScheduler = sc.broadcast(new CuboidScheduler(vCubeDesc.getValue()));
+
+ final long baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
+ final Cuboid baseCuboid = Cuboid.findById(cubeDesc, baseCuboidId);
+ final int measureNum = cubeDesc.getMeasures().size();
+ final BaseCuboidBuilder baseCuboidBuilder = new BaseCuboidBuilder(kylinConfig, vCubeDesc.getValue(), vCubeSegment.getValue(), intermediateTableDesc, AbstractRowKeyEncoder.createInstance(cubeSegment, baseCuboid), MeasureIngester.create(cubeDesc.getMeasures()), cubeSegment.buildDictionaryMap());
+
+ int countMeasureIndex = 0;
+ for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
+ if (measureDesc.getFunction().isCount() == true) {
+ break;
+ } else {
+ countMeasureIndex++;
+ }
+ }
+ final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, kylinConfig);
+ boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
+ boolean allNormalMeasure = true;
+ for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
+ needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
+ allNormalMeasure = allNormalMeasure && needAggr[i];
+ }
+ logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
+
+ StorageLevel storageLevel = StorageLevel.MEMORY_AND_DISK_SER();
+
+ // encode with dimension encoding, transform to <ByteArray, Object[]> RDD
+ final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = intermediateTable.javaRDD().mapToPair(new PairFunction<Row, ByteArray, Object[]>() {
+ transient boolean initialized = false;
+
+ @Override
+ public Tuple2<ByteArray, Object[]> call(Row row) throws Exception {
+ if (initialized == false) {
+ prepare();
+ initialized = true;
+ }
+
+ String[] rowArray = rowToArray(row);
+ baseCuboidBuilder.resetAggrs();
+ byte[] rowKey = baseCuboidBuilder.buildKey(rowArray);
+ Object[] result = baseCuboidBuilder.buildValueObjects(rowArray);
+ return new Tuple2<>(new ByteArray(rowKey), result);
+ }
+
+ private String[] rowToArray(Row row) {
+ String[] result = new String[row.size()];
+ for (int i = 0; i < row.size(); i++) {
+ final Object o = row.get(i);
+ if (o != null) {
+ result[i] = o.toString();
+ } else {
+ result[i] = null;
+ }
+ }
+ return result;
+ }
+
+ });
+
+ logger.info("encodedBaseRDD partition number: " + encodedBaseRDD.getNumPartitions());
+ Long totalCount = 0L;
+ if (kylinConfig.isSparkSanityCheckEnabled()) {
+ totalCount = encodedBaseRDD.count();
+ logger.info("encodedBaseRDD row count: " + encodedBaseRDD.count());
+ }
+
+ final MeasureAggregators measureAggregators = new MeasureAggregators(cubeDesc.getMeasures());
+ final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(measureNum, vCubeDesc.getValue(), measureAggregators);
+ BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
+ if (allNormalMeasure == false) {
+ reducerFunction2 = new CuboidReducerFunction2(measureNum, vCubeDesc.getValue(), measureAggregators, needAggr);
+ }
+
+ final int totalLevels = cubeDesc.getBuildLevel();
+ JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels];
+ int level = 0;
+ int partition = estimateRDDPartitionNum(level, cubeStatsReader, kylinConfig);
+
+ // aggregate to calculate base cuboid
+ allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
+ saveToHDFS(allRDDs[0], vCubeDesc.getValue(), outputPath, 0, sc.hadoopConfiguration());
+
+ // aggregate to ND cuboids
+ PairFlatMapFunction<Tuple2<ByteArray, Object[]>, ByteArray, Object[]> flatMapFunction = new CuboidFlatMap(vCubeSegment.getValue(), vCubeDesc.getValue(), vCuboidScheduler.getValue(), ndCuboidBuilder);
+
+ for (level = 1; level < totalLevels; level++) {
+ partition = estimateRDDPartitionNum(level, cubeStatsReader, kylinConfig);
+ logger.info("Level " + level + " partition number: " + partition);
+ allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition).persist(storageLevel);
+ if (kylinConfig.isSparkSanityCheckEnabled() == true) {
+ sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
+ }
+ saveToHDFS(allRDDs[level], vCubeDesc.getValue(), outputPath, level, sc.hadoopConfiguration());
+ allRDDs[level - 1].unpersist();
+ }
+ allRDDs[totalLevels - 1].unpersist();
+ logger.info("Finished on calculating all level cuboids.");
+ }
+
+ private static int estimateRDDPartitionNum(int level, CubeStatsReader statsReader, KylinConfig kylinConfig) {
+ double baseCuboidSize = statsReader.estimateLayerSize(level);
+ float rddCut = kylinConfig.getSparkRDDPartitionCutMB();
+ int partition = (int) (baseCuboidSize / rddCut);
+ partition = Math.max(kylinConfig.getSparkMinPartition(), partition);
+ partition = Math.min(kylinConfig.getSparkMaxPartition(), partition);
+ logger.debug("Estimated level " + level + " partition number: " + partition);
+ return partition;
+ }
+
+ private static void saveToHDFS(final JavaPairRDD<ByteArray, Object[]> rdd, final CubeDesc cubeDesc, final String hdfsBaseLocation, int level, Configuration conf) {
+ final String cuboidOutputPath = BatchCubingJobBuilder2.getCuboidOutputPathsByLevel(hdfsBaseLocation, level);
+ rdd.mapToPair(new PairFunction<Tuple2<ByteArray, Object[]>, org.apache.hadoop.io.Text, org.apache.hadoop.io.Text>() {
+ BufferedMeasureCodec codec = new BufferedMeasureCodec(cubeDesc.getMeasures());
+ @Override
+ public Tuple2<org.apache.hadoop.io.Text, org.apache.hadoop.io.Text> call(Tuple2<ByteArray, Object[]> tuple2) throws Exception {
+ ByteBuffer valueBuf = codec.encode(tuple2._2());
+ byte[] encodedBytes = new byte[valueBuf.position()];
+ System.arraycopy(valueBuf.array(), 0, encodedBytes, 0, valueBuf.position());
+ return new Tuple2<>(new org.apache.hadoop.io.Text(tuple2._1().array()), new org.apache.hadoop.io.Text(encodedBytes));
+ }
+ }).saveAsNewAPIHadoopFile(cuboidOutputPath, org.apache.hadoop.io.Text.class, org.apache.hadoop.io.Text.class, SequenceFileOutputFormat.class, conf);
+ logger.info("Persisting RDD for level " + level + " into " + cuboidOutputPath);
+ }
+
+ class BaseCuboidReducerFunction2 implements Function2<Object[], Object[], Object[]> {
+ CubeDesc cubeDesc;
+ int measureNum;
+ MeasureAggregators aggregators;
+
+ BaseCuboidReducerFunction2(int measureNum, CubeDesc cubeDesc, MeasureAggregators aggregators) {
+ this.cubeDesc = cubeDesc;
+ this.measureNum = measureNum;
+ this.aggregators = aggregators;
+ }
+
+ @Override
+ public Object[] call(Object[] input1, Object[] input2) throws Exception {
+ Object[] result = new Object[measureNum];
+ aggregators.aggregate(input1, input2, result);
+ return result;
+ }
+ }
+
+ class CuboidReducerFunction2 extends BaseCuboidReducerFunction2 {
+ boolean[] needAggr;
+
+ CuboidReducerFunction2(int measureNum, CubeDesc cubeDesc, MeasureAggregators aggregators, boolean[] needAggr) {
+ super(measureNum, cubeDesc, aggregators);
+ this.needAggr = needAggr;
+ }
+
+ @Override
+ public Object[] call(Object[] input1, Object[] input2) throws Exception {
+ Object[] result = new Object[measureNum];
+ aggregators.aggregate(input1, input2, result, needAggr);
+ return result;
+ }
+ }
+
+ private static final Iterable<Tuple2<ByteArray, Object[]>> EMPTY_ITERATOR = Lists.newArrayListWithCapacity(0);
+
+ class CuboidFlatMap implements PairFlatMapFunction<Tuple2<ByteArray, Object[]>, ByteArray, Object[]> {
+
+ CubeSegment cubeSegment;
+ CubeDesc cubeDesc;
+ CuboidScheduler cuboidScheduler;
+ NDCuboidBuilder ndCuboidBuilder;
+ RowKeySplitter rowKeySplitter;
+ transient boolean initialized = false;
+
+ CuboidFlatMap(CubeSegment cubeSegment, CubeDesc cubeDesc, CuboidScheduler cuboidScheduler, NDCuboidBuilder ndCuboidBuilder) {
+ this.cubeSegment = cubeSegment;
+ this.cubeDesc = cubeDesc;
+ this.cuboidScheduler = cuboidScheduler;
+ this.ndCuboidBuilder = ndCuboidBuilder;
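+ // split buffers sized like the MR engine's NDCuboidMapper: up to 65 rowkey parts of 256 bytes each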
+ this.rowKeySplitter = new RowKeySplitter(cubeSegment, 65, 256);
+ }
+
+ @Override
+ public Iterable<Tuple2<ByteArray, Object[]>> call(Tuple2<ByteArray, Object[]> tuple2) throws Exception {
+ if (!initialized) {
+ prepare();
+ initialized = true;
+ }
+
+ byte[] key = tuple2._1().array();
+ long cuboidId = rowKeySplitter.split(key);
+ Cuboid parentCuboid = Cuboid.findById(cubeDesc, cuboidId);
+
+ Collection<Long> myChildren = cuboidScheduler.getSpanningCuboid(cuboidId);
+
+ // pruned: this cuboid spawns no child cuboids
+ if (myChildren == null || myChildren.isEmpty()) {
+ return EMPTY_ITERATOR;
+ }
+
+ List<Tuple2<ByteArray, Object[]>> tuples = Lists.newArrayListWithCapacity(myChildren.size());
+ for (Long child : myChildren) {
+ Cuboid childCuboid = Cuboid.findById(cubeDesc, child);
+ Pair<Integer, ByteArray> result = ndCuboidBuilder.buildKey(parentCuboid, childCuboid, rowKeySplitter.getSplitBuffers());
+
+ byte[] newKey = new byte[result.getFirst()];
+ System.arraycopy(result.getSecond().array(), 0, newKey, 0, result.getFirst());
+
+ tuples.add(new Tuple2<>(new ByteArray(newKey), tuple2._2()));
+ }
+
+ return tuples;
+ }
+ }
+
+ //sanity check
+
+ private void sanityCheck(JavaPairRDD<ByteArray, Object[]> rdd, Long totalCount, int thisLevel, CubeStatsReader cubeStatsReader, final int countMeasureIndex) {
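+ // each source row contributes exactly 1 to the COUNT(*) of every cuboid, so a level's COUNT(*) sum must equal totalCount * (number of cuboids in that level)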
+ int thisCuboidNum = cubeStatsReader.getCuboidsByLayer(thisLevel).size();
+ Long count2 = getRDDCountSum(rdd, countMeasureIndex);
+ if (count2 != totalCount * thisCuboidNum) {
+ throw new IllegalStateException(String.format("Sanity check failed at level %s: count(*) sum is %s, expected %s (%s cuboids * %s source rows)", thisLevel, count2, totalCount * thisCuboidNum, thisCuboidNum, totalCount));
+ } else {
+ logger.info("Sanity check passed for level " + thisLevel + ", count(*) is " + (count2 / thisCuboidNum));
+ }
+ }
+
+ private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
+ final ByteArray ONE = new ByteArray();
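+ // the key is a throwaway placeholder; only the summed value (_2) is returned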
+ Long count = rdd.mapValues(new Function<Object[], Long>() {
+ @Override
+ public Long call(Object[] objects) throws Exception {
+ return (Long) objects[countMeasureIndex]; // countMeasureIndex points at the COUNT(*) measure
+ }
+ }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
+ @Override
+ public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22) throws Exception {
+ return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
+ }
+ })._2();
+ return count;
+ }
+}
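Each level above is persisted as a SequenceFile of org.apache.hadoop.io.Text key/value pairs in its own directory. As a minimal sketch (reusing the job's JavaSparkContext and output path from the code above; illustrative only, not part of this commit), a later step could read level 1 back with:

    JavaPairRDD<Text, Text> levelOne = sc.sequenceFile(
            BatchCubingJobBuilder2.getCuboidOutputPathsByLevel(outputPath, 1),
            Text.class, Text.class);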
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkExecutable.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkExecutable.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkExecutable.java
index 7c88372..644f73f 100644
--- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkExecutable.java
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkExecutable.java
@@ -22,6 +22,7 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.kylin.common.KylinConfig;
+import org.apache.kylin.common.util.CliCommandExecutor;
import org.apache.kylin.common.util.Logger;
import org.apache.kylin.job.exception.ExecuteException;
import org.apache.kylin.job.execution.AbstractExecutable;
@@ -38,11 +39,16 @@ public class SparkExecutable extends AbstractExecutable {
private static final org.slf4j.Logger logger = LoggerFactory.getLogger(SparkExecutable.class);
private static final String CLASS_NAME = "className";
+ private static final String JARS = "jars";
public void setClassName(String className) {
this.setParam(CLASS_NAME, className);
}
+ public void setJars(String jars) {
+ this.setParam(JARS, jars);
+ }
+
private String formatArgs() {
StringBuilder stringBuilder = new StringBuilder();
for (Map.Entry<String, String> entry : getParams().entrySet()) {
@@ -50,6 +56,9 @@ public class SparkExecutable extends AbstractExecutable {
tmp.append("-").append(entry.getKey()).append(" ").append(entry.getValue()).append(" ");
if (entry.getKey().equals(CLASS_NAME)) {
stringBuilder.insert(0, tmp);
+ } else if (entry.getKey().equals(JARS)) {
+ // JARS is an argument for spark-submit itself, not for the application
+ continue;
} else {
stringBuilder.append(tmp);
}
@@ -65,12 +74,22 @@ public class SparkExecutable extends AbstractExecutable {
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
final KylinConfig config = context.getConfig();
Preconditions.checkNotNull(config.getSparkHome());
- Preconditions.checkNotNull(config.getSparkMaster());
+ Preconditions.checkNotNull(config.getKylinJobJarPath());
+ String sparkConf = config.getSparkConfFile();
+ String jars = this.getParam(JARS);
+
+ String jobJar = config.getKylinJobJarPath();
+
+ if (StringUtils.isEmpty(jars)) {
+ jars = jobJar;
+ }
+
try {
- String cmd = String.format("%s/bin/spark-submit --class \"org.apache.kylin.common.util.SparkEntry\" --master %s %s %s", config.getSparkHome(), config.getSparkMaster(), config.getKylinSparkJobJarPath(), formatArgs());
+ String cmd = String.format("export HADOOP_CONF_DIR=%s && %s/bin/spark-submit --class \"org.apache.kylin.common.util.SparkEntry\" --properties-file %s --jars %s %s %s", config.getSparkHadoopConfDir(), config.getSparkHome(), sparkConf, jars, jobJar, formatArgs());
logger.info("cmd:" + cmd);
final StringBuilder output = new StringBuilder();
- config.getCliCommandExecutor().execute(cmd, new Logger() {
+ CliCommandExecutor exec = new CliCommandExecutor();
+ exec.execute(cmd, new Logger() {
@Override
public void log(String message) {
output.append(message);
@@ -84,4 +103,5 @@ public class SparkExecutable extends AbstractExecutable {
return new ExecuteResult(ExecuteResult.State.ERROR, e.getLocalizedMessage());
}
}
+
}
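For illustration, with the sandbox settings added later in this commit, the command doWork() assembles comes out roughly as follows (the job jar path is a placeholder, and the class name is one example of a Spark job shipped in this change):

    export HADOOP_CONF_DIR=../examples/test_case_data/sandbox && \
    /usr/local/spark/bin/spark-submit --class "org.apache.kylin.common.util.SparkEntry" \
        --properties-file ../examples/test_case_data/sandbox/kylin-spark-conf.properties \
        --jars <kylin-job-jar> <kylin-job-jar> \
        -className org.apache.kylin.engine.spark.SparkCubingByLayer ...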
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/engine-spark/src/main/java/org/apache/kylin/engine/spark/cube/DefaultTupleConverter.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/cube/DefaultTupleConverter.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/cube/DefaultTupleConverter.java
index 4f407ff..793cd87 100644
--- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/cube/DefaultTupleConverter.java
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/cube/DefaultTupleConverter.java
@@ -38,7 +38,7 @@ import scala.Tuple2;
*/
public final class DefaultTupleConverter implements TupleConverter {
- private final static ThreadLocal<ByteBuffer> valueBuf = new ThreadLocal<>();
+ private final static transient ThreadLocal<ByteBuffer> valueBuf = new ThreadLocal<>();
private final CubeSegment segment;
private final int measureCount;
private final Map<TblColRef, Integer> columnLengthMap;
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/examples/test_case_data/sandbox/kylin-spark-conf.properties
----------------------------------------------------------------------
diff --git a/examples/test_case_data/sandbox/kylin-spark-conf.properties b/examples/test_case_data/sandbox/kylin-spark-conf.properties
new file mode 100644
index 0000000..b4a0c97
--- /dev/null
+++ b/examples/test_case_data/sandbox/kylin-spark-conf.properties
@@ -0,0 +1,28 @@
+spark.yarn.submit.file.replication=1
+spark.yarn.executor.memoryOverhead=200
+spark.yarn.driver.memoryOverhead=384
+#spark.master=local[4]
+#spark.submit.deployMode=client
+spark.master=yarn
+spark.submit.deployMode=cluster
+spark.eventLog.enabled=true
+spark.yarn.scheduler.heartbeat.interval-ms=5000
+spark.yarn.preserve.staging.files=true
+spark.yarn.queue=default
+spark.yarn.containerLauncherMaxThreads=25
+spark.yarn.max.executor.failures=3
+spark.eventLog.dir=hdfs\:///spark-history
+spark.history.kerberos.enabled=true
+spark.history.provider=org.apache.spark.deploy.history.FsHistoryProvider
+spark.history.ui.port=18080
+spark.history.fs.logDirectory=hdfs\:///spark-history
+spark.executor.memory=1G
+spark.storage.memoryFraction=0.3
+spark.executor.cores=1
+spark.executor.instances=1
+spark.history.kerberos.keytab=none
+spark.history.kerberos.principal=none
+spark.yarn.jar=hdfs://sandbox.hortonworks.com:8020/apps/spark/spark-assembly-1.6.3-hadoop2.6.0.jar
+spark.driver.extraJavaOptions=-Dhdp.version=current
+spark.yarn.am.extraJavaOptions=-Dhdp.version=current
+spark.executor.extraJavaOptions=-Dhdp.version=current
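Every entry above is handed to spark-submit verbatim via --properties-file (see the SparkExecutable change earlier); each line is equivalent to a --conf flag, e.g. --conf spark.executor.instances=1. With spark.master=yarn and spark.submit.deployMode=cluster, the driver itself runs inside a YARN container, so spark.yarn.jar must point at an assembly jar reachable from the cluster.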
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/examples/test_case_data/sandbox/kylin.properties
----------------------------------------------------------------------
diff --git a/examples/test_case_data/sandbox/kylin.properties b/examples/test_case_data/sandbox/kylin.properties
index 1724619..a011911 100644
--- a/examples/test_case_data/sandbox/kylin.properties
+++ b/examples/test_case_data/sandbox/kylin.properties
@@ -79,6 +79,8 @@ kylin.job.remote-cli-working-dir=/tmp/kylin
# Max count of concurrent jobs running
kylin.job.max-concurrent-jobs=10
+kylin.source.hive.redistribute-flat-table=false
+
# Time interval to check hadoop job status
kylin.engine.mr.yarn-check-interval-seconds=10
@@ -154,3 +156,11 @@ kylin.server.query-metrics-percentiles-intervals=60, 360, 3600
# Env DEV|QA|PROD
kylin.env=DEV
+kylin.source.hive.keep-flat-table=true
+
+### Spark as Engine ###
+#kylin.engine.spark.env.hadoop-conf-dir=/etc/hadoop/conf
+kylin.engine.spark.env.hadoop-conf-dir=../examples/test_case_data/sandbox
+kylin.engine.spark.spark-home=/usr/local/spark
+kylin.engine.spark.properties-file=../examples/test_case_data/sandbox/kylin-spark-conf.properties
+kylin.engine.spark.sanity-check-enabled=false
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 841f19d..fcb5ce3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -84,7 +84,7 @@
<commons-math3.version>3.6.1</commons-math3.version>
<!-- Spark -->
- <spark.version>1.3.0</spark.version>
+ <spark.version>1.6.3</spark.version>
<!-- Utility -->
<log4j.version>1.2.17</log4j.version>
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/server-base/src/main/java/org/apache/kylin/rest/controller/CubeController.java
----------------------------------------------------------------------
diff --git a/server-base/src/main/java/org/apache/kylin/rest/controller/CubeController.java b/server-base/src/main/java/org/apache/kylin/rest/controller/CubeController.java
index a9c8321..f905fe1 100644
--- a/server-base/src/main/java/org/apache/kylin/rest/controller/CubeController.java
+++ b/server-base/src/main/java/org/apache/kylin/rest/controller/CubeController.java
@@ -327,10 +327,10 @@ public class CubeController extends BasicController {
throw new InternalErrorException("Cannot find cube '" + cubeName + "'");
}
- if (cube.getSegments() != null && cube.getBuildingSegments().size() > 0) {
- int num = cube.getBuildingSegments().size();
- throw new InternalErrorException("Cannot purge cube '" + cubeName + "' as there is " + num + " building " + (num > 1 ? "segment(s)." : "segment. Discard the related job first."));
- }
+// if (cube.getSegments() != null && cube.getBuildingSegments().size() > 0) {
+// int num = cube.getBuildingSegments().size();
+// throw new InternalErrorException("Cannot purge cube '" + cubeName + "' as there is " + num + " building " + (num > 1 ? "segment(s)." : "segment. Discard the related job first."));
+// }
return cubeService.purgeCube(cube);
} catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/server/pom.xml
----------------------------------------------------------------------
diff --git a/server/pom.xml b/server/pom.xml
index 617cc85..454ed6b 100644
--- a/server/pom.xml
+++ b/server/pom.xml
@@ -253,6 +253,13 @@
<scope>provided</scope>
</dependency>
+ <!-- Spark dependency -->
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_2.10</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-testing-util</artifactId>
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/source-hive/src/main/java/org/apache/kylin/source/hive/HiveMRInput.java
----------------------------------------------------------------------
diff --git a/source-hive/src/main/java/org/apache/kylin/source/hive/HiveMRInput.java b/source-hive/src/main/java/org/apache/kylin/source/hive/HiveMRInput.java
index 0f351b0..bd53f9a 100644
--- a/source-hive/src/main/java/org/apache/kylin/source/hive/HiveMRInput.java
+++ b/source-hive/src/main/java/org/apache/kylin/source/hive/HiveMRInput.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.util.Map;
import java.util.Set;
+import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -56,8 +57,6 @@ import org.apache.kylin.metadata.model.TableDesc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.collect.Sets;
-
public class HiveMRInput implements IMRInput {
public static String getTableNameForHCat(TableDesc table) {
@@ -74,7 +73,7 @@ public class HiveMRInput implements IMRInput {
public IMRTableInputFormat getTableInputFormat(TableDesc table) {
return new HiveTableInputFormat(getTableNameForHCat(table));
}
-
+
@Override
public IMRBatchMergeInputSide getBatchMergeInputSide(ISegment seg) {
return new IMRBatchMergeInputSide() {
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/storage-hbase/src/test/java/org/apache/kylin/storage/hbase/steps/RowValueDecoderTest.java
----------------------------------------------------------------------
diff --git a/storage-hbase/src/test/java/org/apache/kylin/storage/hbase/steps/RowValueDecoderTest.java b/storage-hbase/src/test/java/org/apache/kylin/storage/hbase/steps/RowValueDecoderTest.java
index f0ad6e0..9b1a00d 100644
--- a/storage-hbase/src/test/java/org/apache/kylin/storage/hbase/steps/RowValueDecoderTest.java
+++ b/storage-hbase/src/test/java/org/apache/kylin/storage/hbase/steps/RowValueDecoderTest.java
@@ -31,7 +31,6 @@ import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.HBaseColumnDesc;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.metadata.MetadataManager;
-import org.apache.kylin.metadata.datatype.LongMutable;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.junit.After;
@@ -60,8 +59,8 @@ public class RowValueDecoderTest extends LocalFileMetadataTestCase {
BigDecimal sum = new BigDecimal("333.1234567");
BigDecimal min = new BigDecimal("333.1111111");
BigDecimal max = new BigDecimal("333.1999999");
- LongMutable count = new LongMutable(2);
- LongMutable item_count = new LongMutable(100);
+ Long count = Long.valueOf(2);
+ Long item_count = Long.valueOf(100);
ByteBuffer buf = codec.encode(new Object[] { sum, min, max, count, item_count });
buf.flip();
@@ -91,7 +90,7 @@ public class RowValueDecoderTest extends LocalFileMetadataTestCase {
BigDecimal min = new BigDecimal("333.1111111");
BigDecimal max = new BigDecimal("333.1999999");
LongWritable count = new LongWritable(2);
- LongMutable item_count = new LongMutable(100);
+ Long item_count = Long.valueOf(100);
codec.encode(new Object[] { sum, min, max, count, item_count });
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/b7d87bb4/tool/pom.xml
----------------------------------------------------------------------
diff --git a/tool/pom.xml b/tool/pom.xml
index aae0dab..919a903 100644
--- a/tool/pom.xml
+++ b/tool/pom.xml
@@ -43,6 +43,10 @@
</dependency>
<dependency>
<groupId>org.apache.kylin</groupId>
+ <artifactId>kylin-engine-spark</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.kylin</groupId>
<artifactId>kylin-source-kafka</artifactId>
</dependency>
<dependency>