You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/29 20:24:12 UTC
incubator-madlib git commit: Hypothesis tests: Fix docs and examples
Repository: incubator-madlib
Updated Branches:
refs/heads/master 7b87dc9d7 -> 0545cdfc4
Hypothesis tests: Fix docs and examples
JIRA: MADLIB-895
Chi-squared independence test docs and examples had some mistakes and
were not clear.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0545cdfc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0545cdfc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0545cdfc
Branch: refs/heads/master
Commit: 0545cdfc499354820af670b627a2769ae6af75b1
Parents: 7b87dc9
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Tue Feb 23 16:12:57 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Mon Feb 29 11:19:46 2016 -0800
----------------------------------------------------------------------
.../modules/stats/hypothesis_tests.sql_in | 70 +++++++++++---------
1 file changed, 38 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0545cdfc/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
index e768550..29cbe07 100644
--- a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
+++ b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
@@ -42,7 +42,7 @@ hypothesis</em> \f$ H_1 \f$ is true.
Hypothesis tests may be devided into parametric and non-parametric tests. A
parametric test assumes certain distributions and makes inferences about
-parameters of the distributions (like, e.g., the mean of a normal distribution).
+parameters of the distributions (e.g., the mean of a normal distribution).
Formally, there is a given domain of possible parameters \f$ \Gamma \f$ and the
null hypothesis \f$ H_0 \f$ is the event that the true parameter
\f$ \gamma_0 \in \Gamma_0 \f$, where \f$ \Gamma_0 \subsetneq \Gamma \f$.
@@ -80,7 +80,7 @@ Two-sample tests expect the following form:
<em>value</em> DOUBLE PRECISION
...
)</pre>
-The \c first column indicates whether a value is from the first (if \c TRUE) or the
+The \c first column indicates whether a value is from the first sample (if \c TRUE) or the
second sample (if \c FALSE).
Many-sample tests expect the following form:
@@ -122,16 +122,16 @@ forms of usage are given. Specific function signatures, as described in
- mw_test
- wsr_test (multi-sample)
- <b>Note:</b>Kolomogov-Smirnov two-sample test is based on the asymptotic theory.
- The p-value is given by comparing the test statistics with Kolomogov distribution.
+ <b>Note:</b> The Kolmogorov-Smirnov two-sample test is based on the asymptotic theory.
+ The p-value is given by comparing the test statistics with the Kolmogorov distribution.
The p-value is also adjusted for data with heavy tail distribution, which may give
- different results given by R function's ks.test. See [3] for detailed explantion.
- The literature is not unanimous about the definitions of the Wilcoxon rank sum
- and Mann-Whitney tests. There are two possible definitions for the statistic;
- MADlib outputs the minimum of the two and uses it for significance testing. This
- might give different results for both mw_test and wsr_test compared to statistical
- functions in other popular packages (like R's wilcox.test function). See [4] for
- detailed explanation.
+ different results from those given by R's ks.test function. See [3] for a detailed explanation.
+ The literature is not unanimous about the definitions of the Wilcoxon rank sum
+ and Mann-Whitney tests. There are two possible definitions for the statistic;
+ MADlib outputs the minimum of the two and uses it for significance testing. This
+ might give different results for both mw_test and wsr_test compared to statistical
+ functions in other popular packages (like R's wilcox.test function). See [4] for
+ a detailed explanation.
@anchor examples
@examp
@@ -206,7 +206,7 @@ SELECT TRUE AS is_us, mpg_us AS mpg
<pre class="example">
-- One sample tests
-SELECT (t_test_one(mpg - 20)).* FROM auto83b_one_sample; -- test rejected for mean = 20
+SELECT (madlib.t_test_one(mpg - 20)).* FROM auto83b_one_sample; -- test rejected for mean = 20
</pre>
<pre class="result">
@@ -216,7 +216,7 @@ SELECT (t_test_one(mpg - 20)).* FROM auto83b_one_sample; -- test rejected for m
</pre>
<pre class="example">
-SELECT (t_test_one(mpg - 15.7)).* FROM auto83b_one_sample; -- test not rejected
+SELECT (madlib.t_test_one(mpg - 15.7)).* FROM auto83b_one_sample; -- test not rejected
</pre>
<pre class="result">
@@ -227,7 +227,7 @@ SELECT (t_test_one(mpg - 15.7)).* FROM auto83b_one_sample; -- test not rejected
<pre class="example">
-- Two sample tests
-SELECT (t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
</pre>
<pre class="result">
statistic | df | p_value_one_sided | p_value_two_sided
@@ -236,7 +236,7 @@ SELECT (t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
</pre>
<pre class="example">
-SELECT (t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
</pre>
<pre class="result">
@@ -248,7 +248,7 @@ SELECT (t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
- <b>F-Test</b> (Uses same data as above t-test)
<pre class="example">
-SELECT (f_test(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.f_test(is_us, mpg)).* FROM auto83b_two_sample;
-- Test result indicates that the two distributions have different variances
</pre>
<pre class="result">
@@ -257,7 +257,7 @@ SELECT (f_test(is_us, mpg)).* FROM auto83b_two_sample;
0.311786921089247 | 26 | 23 | 0.997559863672441 | 0.00488027265511803
</pre>
-- <b>Chi-squared goodness-of-fit test</b> (<a href="http://www.statsdirect.com/help/chi_square_tests/chi_good.htm">Data source</a>)
+- <b>Chi-squared goodness-of-fit test</b> (<a href="http://www.statsdirect.com/help/default.htm#nonparametric_methods/chisq_goodness_fit.htm">Data source</a>)
<pre class="example">
CREATE TABLE chi2_test_blood_group (
@@ -281,6 +281,12 @@ SELECT (madlib.chi2_gof_test(observed, expected)).* FROM chi2_test_blood_group;
- <b>Chi-squared independence test</b> (<a href=http://itl.nist.gov/div898/software/dataplot/refman1/auxillar/chistest.htm>Data source</a>)
+The Chi-squared independence test uses the Chi-squared goodness-of-fit function,
+as shown in the example below. The expected value needs to be computed and passed
+to the goodness-of-fit function. The expected value for MADlib is computed as
+<em>sum of rows * sum of columns</em>, for each element of the input matrix.
+For example, the expected value for element (2,1) would be <em>sum of row 2 * sum of column 1</em>.
+
<pre class="example">
CREATE TABLE chi2_test_friendly (
id_x SERIAL,
@@ -292,27 +298,27 @@ INSERT INTO chi2_test_friendly(values) VALUES
(array[20, 84, 17, 94]),
(array[68, 119, 26, 7]);
--- we expect the table to be unpivoted
+-- The input table is expected to be unpivoted, so we need to unpivot it
CREATE TABLE chi2_test_friendly_unpivoted AS
SELECT id_x, id_y, values[id_y] AS observed
FROM
chi2_test_friendly,
generate_series(1,4) AS id_y;
--- Compute Chi-squared independence statistic
+-- Compute Chi-squared independence statistic, by calculating expected value in the SQL and calling the goodness-of-fit function
SELECT (madlib.chi2_gof_test(observed, expected, deg_freedom)).*
FROM (
-- Compute expected values and degrees of freedom
SELECT
observed,
- sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION
- * sum(observed) OVER (PARTITION BY id_y) AS expected
+ sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION *
+ sum(observed) OVER (PARTITION BY id_y) AS expected
FROM chi2_test_friendly_unpivoted
) p, (
SELECT
(count(DISTINCT id_x) - 1) * (count(DISTINCT id_y) - 1) AS deg_freedom
FROM chi2_test_friendly_unpivoted
-);
+) q;
</pre>
<pre class="result">
statistic | p_value | df | phi | contingency_coef
@@ -327,13 +333,13 @@ CREATE TABLE nist_anova_test (
id SERIAL,
resistance FLOAT8[]
);
-COPY nist_anova_test(resistance) FROM stdin;
-{6.9,8.3,8.0}
-{5.4,6.8,10.5}
-{5.8,7.8,8.1}
-{4.6,9.2,6.9}
-{4.0,6.5,9.3}
-\.
+INSERT INTO nist_anova_test(resistance) VALUES
+ (array[6.9,8.3,8.0]),
+ (array[5.4,6.8,10.5]),
+ (array[5.8,7.8,8.1]),
+ (array[4.6,9.2,6.9]),
+ (array[4.0,6.5,9.3]);
+
SELECT (madlib.one_way_anova(level, value)).* FROM (
SELECT level, resistance[level] AS value
FROM
@@ -359,7 +365,7 @@ SELECT
FALSE,
unnest(ARRAY[-5.13, -2.19, -2.43, -3.83, 0.50, -3.25, 4.32, 1.63, 5.18, -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50]);
-SELECT (ks_test(first, value,
+SELECT (madlib.ks_test(first, value,
(SELECT count(value) FROM ks_sample_1 WHERE first),
(SELECT count(value) FROM ks_sample_1 WHERE NOT first)
ORDER BY value)).*
@@ -374,7 +380,7 @@ FROM ks_sample_1;
- <b>Mann-Whitney test</b> (use same data as t-test)
<pre class="example">
-SELECT (mw_test(is_us, mpg ORDER BY mpg)).* from auto83b_two_sample;
+SELECT (madlib.mw_test(is_us, mpg ORDER BY mpg)).* from auto83b_two_sample;
</pre>
<pre class="result">
statistic | u_statistic | p_value_one_sided | p_value_two_sided
@@ -418,7 +424,7 @@ COPY test_wsr (x, y) FROM stdin DELIMITER '|';
0.48|0.4
\\.
\s
-SELECT (wsr_test(
+SELECT (madlib.wsr_test(
x - y,
2 * 2^(-52) * greatest(x,y)
ORDER BY abs(x - y)