You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2016/12/02 07:04:09 UTC
[07/50] [abbrv] incubator-hivemall git commit: change interface of
chi2
change interface of chi2
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7b07e4a6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7b07e4a6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7b07e4a6
Branch: refs/heads/JIRA-22/pr-385
Commit: 7b07e4a6e1f700ba0a6e5b68659a040a3d89aa2f
Parents: d0e97e6
Author: amaya <gi...@sapphire.in.net>
Authored: Tue Sep 20 12:03:44 2016 +0900
Committer: amaya <gi...@sapphire.in.net>
Committed: Tue Sep 20 12:11:42 2016 +0900
----------------------------------------------------------------------
.../ftvec/selection/ChiSquareTestUDF.java | 21 ----
.../hivemall/ftvec/selection/ChiSquareUDF.java | 124 +++++++++++++++++--
.../ftvec/selection/DissociationDegreeUDF.java | 88 -------------
.../java/hivemall/utils/math/StatsUtils.java | 49 ++++++--
4 files changed, 155 insertions(+), 127 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
deleted file mode 100644
index d367085..0000000
--- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
+++ /dev/null
@@ -1,21 +0,0 @@
-package hivemall.ftvec.selection;
-
-import hivemall.utils.math.StatsUtils;
-import org.apache.hadoop.hive.ql.exec.Description;
-
-import javax.annotation.Nonnull;
-
-@Description(name = "chi2_test",
- value = "_FUNC_(array<number> expected, array<number> observed) - Returns p-value as double")
-public class ChiSquareTestUDF extends DissociationDegreeUDF {
- @Override
- double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) {
- return StatsUtils.chiSquareTest(expected, observed);
- }
-
- @Override
- @Nonnull
- String getFuncName() {
- return "chi2_test";
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
index 937b1bd..1954e33 100644
--- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
+++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
@@ -1,21 +1,131 @@
package hivemall.ftvec.selection;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.hadoop.WritableUtils;
+import hivemall.utils.lang.Preconditions;
import hivemall.utils.math.StatsUtils;
import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import javax.annotation.Nonnull;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
@Description(name = "chi2",
- value = "_FUNC_(array<number> expected, array<number> observed) - Returns chi2-value as double")
-public class ChiSquareUDF extends DissociationDegreeUDF {
+ value = "_FUNC_(array<array<number>> observed, array<array<number>> expected)" +
+ " - Returns chi2_val and p_val of each columns as <array<double>, array<double>>")
+public class ChiSquareUDF extends GenericUDF {
+ private ListObjectInspector observedOI;
+ private ListObjectInspector observedRowOI;
+ private PrimitiveObjectInspector observedElOI;
+ private ListObjectInspector expectedOI;
+ private ListObjectInspector expectedRowOI;
+ private PrimitiveObjectInspector expectedElOI;
+
@Override
- double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) {
- return StatsUtils.chiSquare(expected, observed);
+ public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
+ if (OIs.length != 2) {
+ throw new UDFArgumentLengthException("Specify two arguments.");
+ }
+
+ if (!HiveUtils.isNumberListListOI(OIs[0])){
+ throw new UDFArgumentTypeException(0, "Only array<array<number>> type argument is acceptable but "
+ + OIs[0].getTypeName() + " was passed as `observed`");
+ }
+
+ if (!HiveUtils.isNumberListListOI(OIs[1])){
+ throw new UDFArgumentTypeException(1, "Only array<array<number>> type argument is acceptable but "
+ + OIs[1].getTypeName() + " was passed as `expected`");
+ }
+
+ observedOI = HiveUtils.asListOI(OIs[1]);
+ observedRowOI=HiveUtils.asListOI(observedOI.getListElementObjectInspector());
+ observedElOI = HiveUtils.asDoubleCompatibleOI( observedRowOI.getListElementObjectInspector());
+ expectedOI = HiveUtils.asListOI(OIs[0]);
+ expectedRowOI=HiveUtils.asListOI(expectedOI.getListElementObjectInspector());
+ expectedElOI = HiveUtils.asDoubleCompatibleOI(expectedRowOI.getListElementObjectInspector());
+
+ List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
+ fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableDoubleObjectInspector));
+ fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableDoubleObjectInspector));
+
+ return ObjectInspectorFactory.getStandardStructObjectInspector(
+ Arrays.asList("chi2_vals", "p_vals"), fieldOIs);
+ }
+
+ @Override
+ public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException {
+ List observedObj = observedOI.getList(dObj[0].get()); // shape = (#classes, #features)
+ List expectedObj = expectedOI.getList(dObj[1].get()); // shape = (#classes, #features)
+
+ Preconditions.checkNotNull(observedObj);
+ Preconditions.checkNotNull(expectedObj);
+ final int nClasses = observedObj.size();
+ Preconditions.checkArgument(nClasses == expectedObj.size()); // same #rows
+
+ int nFeatures=-1;
+ double[] observedRow=null; // to reuse
+ double[] expectedRow=null; // to reuse
+ double[][] observed =null; // shape = (#features, #classes)
+ double[][] expected = null; // shape = (#features, #classes)
+
+ // explode and transpose matrix
+ for(int i=0;i<nClasses;i++){
+ if(i==0){
+ // init
+ observedRow=HiveUtils.asDoubleArray(observedObj.get(i),observedRowOI,observedElOI,false);
+ expectedRow=HiveUtils.asDoubleArray(expectedObj.get(i),expectedRowOI,expectedElOI, false);
+ nFeatures = observedRow.length;
+ observed=new double[nFeatures][nClasses];
+ expected = new double[nFeatures][nClasses];
+ }else{
+ HiveUtils.toDoubleArray(observedObj.get(i),observedRowOI,observedElOI,observedRow,false);
+ HiveUtils.toDoubleArray(expectedObj.get(i),expectedRowOI,expectedElOI,expectedRow, false);
+ }
+
+ for(int j=0;j<nFeatures;j++){
+ observed[j][i] = observedRow[j];
+ expected[j][i] = expectedRow[j];
+ }
+ }
+
+ final Map.Entry<double[],double[]> chi2 = StatsUtils.chiSquares(observed,expected);
+
+ final Object[] result = new Object[2];
+ result[0] = WritableUtils.toWritableList(chi2.getKey());
+ result[1]=WritableUtils.toWritableList(chi2.getValue());
+ return result;
}
@Override
- @Nonnull
- String getFuncName() {
- return "chi2";
+ public String getDisplayString(String[] children) {
+ final StringBuilder sb = new StringBuilder();
+ sb.append("chi2");
+ sb.append("(");
+ if (children.length > 0) {
+ sb.append(children[0]);
+ for (int i = 1; i < children.length; i++) {
+ sb.append(", ");
+ sb.append(children[i]);
+ }
+ }
+ sb.append(")");
+ return sb.toString();
}
}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
deleted file mode 100644
index 0acae82..0000000
--- a/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package hivemall.ftvec.selection;
-
-import hivemall.utils.hadoop.HiveUtils;
-import hivemall.utils.lang.Preconditions;
-import hivemall.utils.math.StatsUtils;
-import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-
-import javax.annotation.Nonnull;
-
-@Description(name = "",
- value = "_FUNC_(array<number> expected, array<number> observed) - Returns dissociation degree as double")
-public abstract class DissociationDegreeUDF extends GenericUDF {
- private ListObjectInspector expectedOI;
- private DoubleObjectInspector expectedElOI;
- private ListObjectInspector observedOI;
- private DoubleObjectInspector observedElOI;
-
- @Override
- public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
- if (OIs.length != 2) {
- throw new UDFArgumentLengthException("Specify two arguments.");
- }
-
- if (!HiveUtils.isListOI(OIs[0])
- || !HiveUtils.isNumberOI(((ListObjectInspector) OIs[0]).getListElementObjectInspector())){
- throw new UDFArgumentTypeException(0, "Only array<number> type argument is acceptable but "
- + OIs[0].getTypeName() + " was passed as `expected`");
- }
-
- if (!HiveUtils.isListOI(OIs[1])
- || !HiveUtils.isNumberOI(((ListObjectInspector) OIs[1]).getListElementObjectInspector())){
- throw new UDFArgumentTypeException(1, "Only array<number> type argument is acceptable but "
- + OIs[1].getTypeName() + " was passed as `observed`");
- }
-
- expectedOI = (ListObjectInspector) OIs[0];
- expectedElOI = (DoubleObjectInspector) expectedOI.getListElementObjectInspector();
- observedOI = (ListObjectInspector) OIs[1];
- observedElOI = (DoubleObjectInspector) observedOI.getListElementObjectInspector();
-
- return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
- }
-
- @Override
- public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException {
- final double[] expected = HiveUtils.asDoubleArray(dObj[0].get(),expectedOI,expectedElOI);
- final double[] observed = HiveUtils.asDoubleArray(dObj[1].get(),observedOI,observedElOI);
-
- Preconditions.checkNotNull(expected);
- Preconditions.checkNotNull(observed);
- Preconditions.checkArgument(expected.length == observed.length);
-
- final double dissociation = calcDissociation(expected,observed);
-
- return new DoubleWritable(dissociation);
- }
-
- @Override
- public String getDisplayString(String[] children) {
- final StringBuilder sb = new StringBuilder();
- sb.append(getFuncName());
- sb.append("(");
- if (children.length > 0) {
- sb.append(children[0]);
- for (int i = 1; i < children.length; i++) {
- sb.append(", ");
- sb.append(children[i]);
- }
- }
- sb.append(")");
- return sb.toString();
- }
-
- abstract double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed);
-
- @Nonnull
- abstract String getFuncName();
-}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/utils/math/StatsUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/math/StatsUtils.java b/core/src/main/java/hivemall/utils/math/StatsUtils.java
index 7633419..f9d0f30 100644
--- a/core/src/main/java/hivemall/utils/math/StatsUtils.java
+++ b/core/src/main/java/hivemall/utils/math/StatsUtils.java
@@ -29,6 +29,9 @@ import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;
import org.apache.commons.math3.linear.SingularValueDecomposition;
+import java.util.AbstractMap;
+import java.util.Map;
+
public final class StatsUtils {
private StatsUtils() {}
@@ -191,24 +194,24 @@ public final class StatsUtils {
}
/**
- * @param expected mean vector whose value is expected
* @param observed mean vector whose value is observed
- * @return chi2-value
+ * @param expected mean vector whose value is expected
+ * @return chi2 value
*/
- public static double chiSquare(@Nonnull final double[] expected, @Nonnull final double[] observed) {
- Preconditions.checkArgument(expected.length == observed.length);
+ public static double chiSquare(@Nonnull final double[] observed, @Nonnull final double[] expected) {
+ Preconditions.checkArgument(observed.length == expected.length);
- double sumExpected = 0.d;
double sumObserved = 0.d;
+ double sumExpected = 0.d;
for (int ratio = 0; ratio < observed.length; ++ratio) {
- sumExpected += expected[ratio];
sumObserved += observed[ratio];
+ sumExpected += expected[ratio];
}
double var15 = 1.d;
boolean rescale = false;
- if (Math.abs(sumExpected - sumObserved) > 1.e-5) {
+ if (Math.abs(sumObserved - sumExpected) > 1.e-5) {
var15 = sumObserved / sumExpected;
rescale = true;
}
@@ -230,12 +233,36 @@ public final class StatsUtils {
}
/**
- * @param expected means vector whose value is expected
* @param observed means vector whose value is observed
- * @return p-value
+ * @param expected means vector whose value is expected
+ * @return p value
*/
- public static double chiSquareTest(@Nonnull final double[] expected,@Nonnull final double[] observed) {
+ public static double chiSquareTest(@Nonnull final double[] observed, @Nonnull final double[] expected) {
ChiSquaredDistribution distribution = new ChiSquaredDistribution(null, (double)expected.length - 1.d);
- return 1.d - distribution.cumulativeProbability(chiSquare(expected, observed));
+ return 1.d - distribution.cumulativeProbability(chiSquare(observed,expected));
+ }
+
+ /**
+ * This method offers effective calculation for multiple entries rather than calculation individually
+ * @param observeds means matrix whose values are observed
+ * @param expecteds means matrix
+ * @return (chi2 value[], p value[])
+ */
+ public static Map.Entry<double[],double[]> chiSquares(@Nonnull final double[][] observeds, @Nonnull final double[][] expecteds){
+ Preconditions.checkArgument(observeds.length == expecteds.length);
+
+ final int len = expecteds.length;
+ final int lenOfEach = expecteds[0].length;
+
+ final ChiSquaredDistribution distribution = new ChiSquaredDistribution(null, (double)lenOfEach - 1.d);
+
+ final double[] chi2s = new double[len];
+ final double[] ps = new double[len];
+ for(int i=0;i<len;i++){
+ chi2s[i] = chiSquare(observeds[i],expecteds[i]);
+ ps[i] = 1.d - distribution.cumulativeProbability(chi2s[i]);
+ }
+
+ return new AbstractMap.SimpleEntry<double[], double[]>(chi2s,ps);
}
}