Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/29 20:24:12 UTC

incubator-madlib git commit: Hypothesis tests: Fix docs and examples

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 7b87dc9d7 -> 0545cdfc4


Hypothesis tests: Fix docs and examples

JIRA: MADLIB-895

Chi-squared independence test docs and examples had some mistakes and
were not clear.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0545cdfc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0545cdfc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0545cdfc

Branch: refs/heads/master
Commit: 0545cdfc499354820af670b627a2769ae6af75b1
Parents: 7b87dc9
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Tue Feb 23 16:12:57 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Mon Feb 29 11:19:46 2016 -0800

----------------------------------------------------------------------
 .../modules/stats/hypothesis_tests.sql_in       | 70 +++++++++++---------
 1 file changed, 38 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0545cdfc/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
index e768550..29cbe07 100644
--- a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
+++ b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in
@@ -42,7 +42,7 @@ hypothesis</em> \f$ H_1 \f$ is true.
 
 Hypothesis tests may be divided into parametric and non-parametric tests. A
 parametric test assumes certain distributions and makes inferences about
-parameters of the distributions (like, e.g., the mean of a normal distribution).
+parameters of the distributions (e.g., the mean of a normal distribution).
 Formally, there is a given domain of possible parameters \f$ \Gamma \f$ and the
 null hypothesis \f$ H_0 \f$ is the event that the true parameter
 \f$ \gamma_0 \in \Gamma_0 \f$, where \f$ \Gamma_0 \subsetneq \Gamma \f$.
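
For illustration, a one-sample t-test on the mean \f$ \mu \f$ of a normal distribution, with
hypothesized mean \f$ \mu_0 \f$ and a two-sided alternative, fits this framework as

\f[
    \Gamma = \mathbb{R}, \qquad
    \Gamma_0 = \{ \mu_0 \}, \qquad
    H_0: \mu = \mu_0, \qquad
    H_1: \mu \neq \mu_0 .
\f]

In the t_test_one(mpg - 20) example further below, \f$ \mu_0 = 20 \f$ and the data are shifted
so that the hypothesized mean becomes zero.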
@@ -80,7 +80,7 @@ Two-sample tests expect the following form:
     <em>value</em> DOUBLE PRECISION
     ...
 )</pre>
-The \c first column indicates whether a value is from the first (if \c TRUE) or the
+The \c first column indicates whether a value is from the first sample (if \c TRUE) or the
 second sample (if \c FALSE).
 
 Many-sample tests expect the following form:
@@ -122,16 +122,16 @@ forms of usage are given. Specific function signatures, as described in
         - mw_test
         - wsr_test (multi-sample)
 
-        <b>Note:</b>Kolomogov-Smirnov two-sample test is based on the asymptotic theory.
-        The p-value is given by comparing the test statistics with Kolomogov distribution.
+        <b>Note:</b> The Kolmogorov-Smirnov two-sample test is based on asymptotic theory.
+        The p-value is given by comparing the test statistic with the Kolmogorov distribution.
         The p-value is also adjusted for data with heavy-tailed distributions, which may give
-        different results given by R function's ks.test. See [3] for detailed explantion. 
-        The literature is not unanimous about the definitions of the Wilcoxon rank sum 
-        and Mann-Whitney tests. There are two possible definitions for the statistic; 
-        MADlib outputs the minimum of the two and uses it for significance testing. This 
-        might give different results for both mw_test and wsr_test compared to statistical 
-        functions in other popular packages (like R's wilcox.test function). See [4] for 
-        detailed explanation.
+        results that differ from those of R's ks.test function. See [3] for a detailed explanation.
+        The literature is not unanimous about the definitions of the Wilcoxon rank sum
+        and Mann-Whitney tests. There are two possible definitions for the statistic;
+        MADlib outputs the minimum of the two and uses it for significance testing. This
+        might give different results for both mw_test and wsr_test compared to statistical
+        functions in other popular packages (like R's wilcox.test function). See [4] for
+        a detailed explanation.
 
 @anchor examples
 @examp
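
The note above says that MADlib reports the minimum of the two possible statistics. For the
Mann-Whitney U statistic, the two candidates always sum to \f$ n_1 n_2 \f$ (the product of the
two sample sizes), so the complementary value can be recovered from MADlib's output when
comparing against other packages. A minimal sketch, assuming the auto83b_two_sample table
created in the examples below:

<pre class="example">
-- u_statistic is the minimum of the two possible Mann-Whitney values;
-- the other candidate is n1 * n2 - u_statistic
SELECT
    t.u_statistic                    AS u_min,
    n1.cnt * n2.cnt - t.u_statistic  AS u_complement
FROM
    (SELECT (madlib.mw_test(is_us, mpg ORDER BY mpg)).*
     FROM auto83b_two_sample) AS t,
    (SELECT count(*) AS cnt FROM auto83b_two_sample WHERE is_us) AS n1,
    (SELECT count(*) AS cnt FROM auto83b_two_sample WHERE NOT is_us) AS n2;
</pre>
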
@@ -206,7 +206,7 @@ SELECT TRUE AS is_us, mpg_us AS mpg
 
 <pre class="example">
 -- One sample tests
-SELECT (t_test_one(mpg - 20)).* FROM auto83b_one_sample;  -- test rejected for mean = 20
+SELECT (madlib.t_test_one(mpg - 20)).* FROM auto83b_one_sample;  -- null hypothesis (mean = 20) rejected
 </pre>
 
 <pre class="result">
@@ -216,7 +216,7 @@ SELECT (t_test_one(mpg - 20)).* FROM auto83b_one_sample;  -- test rejected for m
  </pre>
 
 <pre class="example">
-SELECT (t_test_one(mpg - 15.7)).* FROM auto83b_one_sample;  -- test not rejected
+SELECT (madlib.t_test_one(mpg - 15.7)).* FROM auto83b_one_sample;  -- null hypothesis (mean = 15.7) not rejected
 </pre>
 
 <pre class="result">
@@ -227,7 +227,7 @@ SELECT (t_test_one(mpg - 15.7)).* FROM auto83b_one_sample;  -- test not rejected
 
 <pre class="example">
 -- Two sample tests
-SELECT (t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
 </pre>
 <pre class="result">
      statistic     | df | p_value_one_sided |  p_value_two_sided
@@ -236,7 +236,7 @@ SELECT (t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample;
  </pre>
 
 <pre class="example">
-SELECT (t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
 </pre>
 
 <pre class="result">
@@ -248,7 +248,7 @@ SELECT (t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample;
 - <b>F-Test</b> (Uses same data as above t-test)
 
 <pre class="example">
-SELECT (f_test(is_us, mpg)).* FROM auto83b_two_sample;
+SELECT (madlib.f_test(is_us, mpg)).* FROM auto83b_two_sample;
 -- Test result indicates that the two distributions have different variances
 </pre>
 <pre class="result">
@@ -257,7 +257,7 @@ SELECT (f_test(is_us, mpg)).* FROM auto83b_two_sample;
   0.311786921089247 |  26 |  23 | 0.997559863672441 | 0.00488027265511803
 </pre>
 
-- <b>Chi-squared goodness-of-fit test</b> (<a href="http://www.statsdirect.com/help/chi_square_tests/chi_good.htm">Data source</a>)
+- <b>Chi-squared goodness-of-fit test</b> (<a href="http://www.statsdirect.com/help/default.htm#nonparametric_methods/chisq_goodness_fit.htm">Data source</a>)
 
 <pre class="example">
 CREATE TABLE chi2_test_blood_group (
@@ -281,6 +281,12 @@ SELECT (madlib.chi2_gof_test(observed, expected)).* FROM chi2_test_blood_group;
 
 - <b>Chi-squared independence test</b> (<a href=http://itl.nist.gov/div898/software/dataplot/refman1/auxillar/chistest.htm>Data source</a>)
 
+The Chi-squared independence test uses the Chi-squared goodness-of-fit function,
+as shown in the example below. The expected values need to be computed and passed
+to the goodness-of-fit function. For each element of the input matrix, MADlib
+computes the expected value as <em>row sum * column sum</em>. For example, the
+expected value for element (2,1) is <em>sum of row 2 * sum of column 1</em>.
+
 <pre class="example">
 CREATE TABLE chi2_test_friendly (
     id_x SERIAL,
@@ -292,27 +298,27 @@ INSERT INTO chi2_test_friendly(values) VALUES
     (array[20, 84, 17, 94]),
     (array[68, 119, 26, 7]);
 
--- we expect the table to be unpivoted
+-- The test expects the data in unpivoted form, so unpivot the table first
 CREATE TABLE chi2_test_friendly_unpivoted AS
 SELECT id_x, id_y, values[id_y] AS observed
 FROM
     chi2_test_friendly,
     generate_series(1,4) AS id_y;
 
--- Compute Chi-squared independence statistic
+-- Compute the Chi-squared independence statistic by calculating the expected values in SQL and calling the goodness-of-fit function
 SELECT (madlib.chi2_gof_test(observed, expected, deg_freedom)).*
 FROM (
     -- Compute expected values and degrees of freedom
     SELECT
         observed,
-        sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION
-            * sum(observed) OVER (PARTITION BY id_y) AS expected
+        sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION *
+        sum(observed) OVER (PARTITION BY id_y) AS expected
     FROM chi2_test_friendly_unpivoted
 ) p, (
     SELECT
         (count(DISTINCT id_x) - 1) * (count(DISTINCT id_y) - 1) AS deg_freedom
     FROM chi2_test_friendly_unpivoted
-);
+) q;
 </pre>
  <pre class="result">
      statistic     |       p_value        | df |       phi        | contingency_coef
@@ -327,13 +333,13 @@ CREATE TABLE nist_anova_test (
     id SERIAL,
     resistance FLOAT8[]
 );
-COPY nist_anova_test(resistance) FROM stdin;
-{6.9,8.3,8.0}
-{5.4,6.8,10.5}
-{5.8,7.8,8.1}
-{4.6,9.2,6.9}
-{4.0,6.5,9.3}
-\.
+INSERT INTO nist_anova_test(resistance) VALUES
+    (array[6.9,8.3,8.0]),
+    (array[5.4,6.8,10.5]),
+    (array[5.8,7.8,8.1]),
+    (array[4.6,9.2,6.9]),
+    (array[4.0,6.5,9.3]);
+
 SELECT (madlib.one_way_anova(level, value)).* FROM (
     SELECT level, resistance[level] AS value
     FROM
@@ -359,7 +365,7 @@ SELECT
     FALSE,
     unnest(ARRAY[-5.13, -2.19, -2.43, -3.83, 0.50, -3.25, 4.32, 1.63, 5.18, -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50]);
 
-SELECT (ks_test(first, value,
+SELECT (madlib.ks_test(first, value,
     (SELECT count(value) FROM ks_sample_1 WHERE first),
     (SELECT count(value) FROM ks_sample_1 WHERE NOT first)
     ORDER BY value)).*
@@ -374,7 +380,7 @@ FROM ks_sample_1;
 - <b>Mann-Whitney test</b> (use same data as t-test)
 
 <pre class="example">
-SELECT (mw_test(is_us, mpg ORDER BY mpg)).* from auto83b_two_sample;
+SELECT (madlib.mw_test(is_us, mpg ORDER BY mpg)).* FROM auto83b_two_sample;
 </pre>
 <pre class="result">
       statistic     | u_statistic | p_value_one_sided |  p_value_two_sided
@@ -418,7 +424,7 @@ COPY test_wsr (x, y) FROM stdin DELIMITER '|';
 0.48|0.4
 \\.
 \s
-SELECT (wsr_test(
+SELECT (madlib.wsr_test(
     x - y,
     2 * 2^(-52) * greatest(x,y)
     ORDER BY abs(x - y)