You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by to...@apache.org on 2003/06/11 05:33:05 UTC

cvs commit: jakarta-commons-sandbox/math/src/test/org/apache/commons/math/stat BivariateRegressionTest.java

tobrien     2003/06/10 20:33:05

  Modified:    math/src/java/org/apache/commons/math/stat
                        BivariateRegression.java
               math/src/test/org/apache/commons/math/stat
                        BivariateRegressionTest.java
  Log:
  * Fixed all checkstyle errors and eliminated redundant NaN checks.  Now have
  100% test path coverage.
  
  * Used distribution framework TDistribution to implement
  getSlopeConfidenceInterval and getSignificance methods.
  
  PR: Issue #20657
  Obtained from: Bugzilla
  Submitted by: Phil Steitz
  Reviewed by: Tim O'Brien
  
  Revision  Changes    Path
  1.2       +217 -120  jakarta-commons-sandbox/math/src/java/org/apache/commons/math/stat/BivariateRegression.java
  
  Index: BivariateRegression.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/math/src/java/org/apache/commons/math/stat/BivariateRegression.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- BivariateRegression.java	29 May 2003 20:35:45 -0000	1.1
  +++ BivariateRegression.java	11 Jun 2003 03:33:05 -0000	1.2
  @@ -50,30 +50,33 @@
    * individuals on behalf of the Apache Software Foundation.  For more
    * information on the Apache Software Foundation, please see
    * <http://www.apache.org/>.
  - *
    */
   
   package org.apache.commons.math.stat;
  +import org.apache.commons.math.stat.distribution.DistributionFactory;
  +import org.apache.commons.math.stat.distribution.TDistribution;
   
   /**
    * Estimates an ordinary least squares regression model
  - * with one independent variable: <p>
  - *
  - * y = intercept + slope * x  </code><p>
  - *
  + * with one independent variable.
  + * <p>
  + * <code> y = intercept + slope * x  </code>
  + * <p>
    * Standard errors for <code>intercept</code> and <code>slope</code> are 
  - * available as well as ANOVA, r-square and Pearson's r statistics.<p>
  - *
  + * available as well as ANOVA, r-square and Pearson's r statistics.
  + * <p>
    * Observations (x,y pairs) can be added to the model one at a time or they 
    * can be provided in a 2-dimensional array.  The observations are not stored
    * in memory, so there is no limit to the number of observations that can be
  - * added to the model. <p>
  - *
  + * added to the model. 
  + * <p>
    * <strong>Usage Notes</strong>: <ul>
    * <li> When there are fewer than two observations in the model, or when
    * there is no variation in the x values (i.e. all x values are the same) 
    * all statistics return <code>NaN</code>. At least two observations with
  - * different x coordinates are requred to estimate a bivariate regression model.</li>
  + * different x coordinates are requred to estimate a bivariate regression 
  + * model.
  + * </li>
    * <li> getters for the statistics always compute values based on the current
    * set of observations -- i.e., you can get statistics, then add more data
    * and get updated statistics without using a new instance.  There is no 
  @@ -114,31 +117,34 @@
        */
       public void addData(double x, double y) {
           sumX += x;
  -        sumSqX += x*x;
  +        sumSqX += x * x;
           sumY += y;
  -        sumSqY += y*y;
  -        sumXY += x*y;
  +        sumSqY += y * y;
  +        sumXY += x * y;
           n++;
       } 
       
       /**
  -     * Adds the observations represented by the elements in <code>data.</code><p>
  +     * Adds the observations represented by the elements in 
  +     * <code>data</code>.
  +     * <p>
        * <code>(data[0][0],data[0][1])</code> will be the first observation, then
        * <code>(data[1][0],data[1][1])</code>, etc. <p> 
        * 
        * This method does not replace data that has already been added.  
  -     * To replace all data, use <code>clear()</code> before adding the new data.
  +     * To replace all data, use <code>clear()</code> before adding the new 
  +     * data.
        * 
        * @param data array of observations to be added
        */
       public void addData(double[][] data) {
          for (int i = 0; i < data.length; i++) {
  -            addData(data[i][0],data[i][1]);
  +            addData(data[i][0], data[i][1]);
          }
       }
       
  -    /*
  -     * Clears all data from the model
  +    /**
  +     * Clears all data from the model.
        */
       public void clear() {
           sumX = 0d;
  @@ -150,9 +156,9 @@
       }
             
       /**
  -     * Returns the number of observations that have been added to the model
  +     * Returns the number of observations that have been added to the model.
        *
  -     * @return n
  +     * @return n number of observations that have been added.
        */
       public long getN() {
           return n;
  @@ -160,37 +166,38 @@
       
       /**
        * Returns the "predicted" <code>y</code> value associated with the 
  -     * supplied <code>x</code> value. Specifically, <p>
  -     *
  -     * <code> predict(x) = intercept + slope * x </code> <p>
  -     *
  -     * At least two observations (with at least two different x values)
  +     * supplied <code>x</code> value.
  +     * <p>
  +     * <code> predict(x) = intercept + slope * x </code>
  +     * <p>
  +     * <strong>Preconditions</strong>: <ul>
  +     * <li>At least two observations (with at least two different x values)
        * must have been added before invoking this method. If this method is 
        * invoked before a model can be estimated, <code>Double,NaN</code> is
        * returned.
  +     * </li></ul>
        *
        * @param x input <code>x</code> value
        * @return predicted <code>y</code> value
        */
       public double predict(double x) {
           double b1 = getSlope();
  -        if (b1 == Double.NaN) {
  -            return b1;
  -        }
  -        return getIntercept(b1) + b1*x;
  +        return getIntercept(b1) + b1 * x;
       }
       
       /**
        * Returns the intercept of the estimated regression line.
  -     * The least squares estimate of the intercept is computed using the normal
  -     * equations, as described 
  -     * <a href=http://www.xycoon.com/estimation4.htm>here</a>.
  -     * The intercept is sometimes denoted b0. <p>
  -     *
  -     * At least two distinct data pairs (with at least two different x values)
  +     * <p>
  +     * The least squares estimate of the intercept is computed using the 
  +     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
  +     * The intercept is sometimes denoted b0. 
  +     * <p>
  +     * <strong>Preconditions</strong>: <ul>
  +     * <li>At least two observations (with at least two different x values)
        * must have been added before invoking this method. If this method is 
        * invoked before a model can be estimated, <code>Double,NaN</code> is
        * returned.
  +     * </li></ul>
        *
        * @return the intercept of the regression line
        */
  @@ -200,15 +207,17 @@
        
        /**
        * Returns the slope of the estimated regression line.  
  -     * The least squares estimate of the slope is computed using the normal
  -     * equations, as described 
  -     * <a href=http://www.xycoon.com/estimation4.htm>here</a>.
  -     * The slope is sometimes denoted b1. <p>
  -     *
  -     * At least two observations (with at least two different x values)
  +     * <p>
  +     * The least squares estimate of the slope is computed using the 
  +     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
  +     * The slope is sometimes denoted b1. 
  +     * <p>
  +     * <strong>Preconditions</strong>: <ul>
  +     * <li>At least two observations (with at least two different x values)
        * must have been added before invoking this method. If this method is 
        * invoked before a model can be estimated, <code>Double,NaN</code> is
        * returned.
  +     * </li></ul>
        *
        * @return the slope of the regression line
        */
  @@ -217,22 +226,24 @@
                return Double.NaN;  //not enough data 
            }
            double dn = (double) n;
  -         double denom = sumSqX - (sumX*sumX/dn);
  -         if (Math.abs(denom)< 10*Double.MIN_VALUE) { 
  +         double denom = sumSqX - (sumX * sumX / dn);
  +         if (Math.abs(denom) < 10 * Double.MIN_VALUE) { 
                return Double.NaN; //not enough variation in x
            }
  -         return (sumXY - (sumX*sumY/dn))/denom;
  +         return (sumXY - (sumX * sumY / dn)) / denom;
        }
        
        /**
  -      * Returns the sum of squared errors</a> associated with the regression 
  -      * model. This is defined as SSE 
  -      * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a>. <p>
  -      *
  -      * At least two distinct data pairs (with at least two different x values)
  +      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
  +      * sum of squared errors</a> (SSE) associated with the regression 
  +      * model.
  +      * <p>
  +      * <strong>Preconditions</strong>: <ul>
  +      * <li>At least two observations (with at least two different x values)
         * must have been added before invoking this method. If this method is 
         * invoked before a model can be estimated, <code>Double,NaN</code> is
         * returned.
  +      * </li></ul>
         *
         * @return sum of squared errors associated with the regression model
         */
  @@ -242,10 +253,11 @@
        
        /**
         * Returns the sum of squared deviations of the y values about their mean.
  +      * <p>
         * This is defined as SSTO 
  -      * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a>.
  +      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
         * <p>
  -      * If n < 2, this returns NaN.
  +      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
         *
         * @return sum of squared deviations of y values
         */
  @@ -253,36 +265,37 @@
            if (n < 2) {
                return Double.NaN;
            }
  -         return sumSqY - sumY*sumY/(double) n;
  +         return sumSqY - sumY * sumY / (double) n;
        }
            
        /**
         * Returns the sum of squared deviations of the predicted y values about 
         * their mean (which equals the mean of y).
  +      * <p>
         * This is usually abbreviated SSR or SSM.  It is defined as SSM 
  -      * <a href=http://www.xycoon.com/SumOfSquares.htm>here</a><p>
  -      *
  -      * At least two distinct data pairs (with at least two different x values)
  +      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
  +      * <p>
  +      * <strong>Preconditions</strong>: <ul>
  +      * <li>At least two observations (with at least two different x values)
         * must have been added before invoking this method. If this method is 
         * invoked before a model can be estimated, <code>Double,NaN</code> is
         * returned.
  +      * </li></ul>
         *
         * @return sum of squared deviations of y values
         */
        public double getRegressionSumSquares() {
            double b1 = getSlope();
  -         if (b1 == Double.NaN) {
  -             return b1;
  -         }
  -         return b1*(sumXY - sumX*sumY/(double) n);
  +         return b1 * (sumXY - sumX * sumY / (double) n);
        }
        
        /**
  -      * Returns the sum of squared errors divided by the degrees of freedom.
  -      * This is usually abbreviated MSE. <p>
  -      *
  +      * Returns the sum of squared errors divided by the degrees of freedom,
  +      * usually abbreviated MSE. 
  +      * <p>
         * If there are fewer than <strong>three</strong> data pairs in the model,
  -      * or if there is no variation in x, this returns <code>NaN</code>.
  +      * or if there is no variation in <code>x</code>, this returns 
  +      * <code>Double.NaN</code>.
         *
         * @return sum of squared deviations of y values
         */
  @@ -291,29 +304,25 @@
                return Double.NaN;
            }
            double sse = getSumSquaredErrors();
  -         if (sse == Double.NaN) {
  -             return sse;
  -         }
  -         return sse/(double) (n - 2);
  +         return sse / (double) (n - 2);
        }
        
        /**
  -      * Returns <a href=http://www.stt.msu.edu/~xiaoyimi/STT200/Lecture5.pdf>
  -      * Pearson's product moment correlation coefficient</a>.
  -      * This is usually denoted r. <p>
  -      *
  -      * At least two observations (with at least two different x values)
  +      * Returns <a href="http://www.stt.msu.edu/~xiaoyimi/STT200/Lecture5.pdf">
  +      * Pearson's product moment correlation coefficient</a>,
  +      * usually denoted r. 
  +      * <p>
  +      * <strong>Preconditions</strong>: <ul>
  +      * <li>At least two observations (with at least two different x values)
         * must have been added before invoking this method. If this method is 
         * invoked before a model can be estimated, <code>Double,NaN</code> is
         * returned.
  +      * </li></ul>
         *
         * @return Pearson's r
         */
        public double getR() {
            double b1 = getSlope();
  -         if (b1 == Double.NaN) {
  -             return b1;
  -         }
            double result = Math.sqrt(getRSquare(b1));
            if (b1 < 0) {
                result = -result;
  @@ -322,14 +331,16 @@
        }
                
        /** 
  -      * Returns the <a href=http://www.xycoon.com/coefficient1.htm> coefficient
  -      * of determination</a>.
  -      * This is usually denoted r-square. <p>
  -      *
  -      * At least two observaions (with at least two different x values)
  +      * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
  +      * coefficient of determination</a>,
  +      * usually denoted r-square. 
  +      * <p>
  +      * <strong>Preconditions</strong>: <ul>
  +      * <li>At least two observations (with at least two different x values)
         * must have been added before invoking this method. If this method is 
         * invoked before a model can be estimated, <code>Double,NaN</code> is
         * returned.
  +      * </li></ul>
         *
         * @return r-square
         */
  @@ -339,70 +350,150 @@
        
            
        /**
  -      * Returns the <a href=http://www.xycoon.com/standarderrorb0.htm>standard
  -      * error of the intercept estimate</a>.
  -      * This is usually denoted s(b0). <p>
  -      *
  -      * If there are fewer that <strong>three</strong> observations in the model,
  -      * or if there is no variation in x, this returns <code>NaN</code>.
  +      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
  +      * standard error of the intercept estimate</a>, 
  +      * usually denoted s(b0). 
  +      * <p>
  +      * If there are fewer that <strong>three</strong> observations in the 
  +      * model, or if there is no variation in x, this returns 
  +      * <code>Double.NaN</code>.
         *
         * @return standard error associated with intercept estimate
         */
        public double getInterceptStdErr() {
            double ssx = getSumSquaresX();
  -         if (ssx == Double.NaN) {
  -             return ssx;
  -         }
  -         return Math.sqrt(getMeanSquareError()*sumSqX/(((double) n)*ssx));
  +         return Math.sqrt(getMeanSquareError() * sumSqX / (((double) n) * ssx));
        }
                
        /**
  -      * Returns the <a http://www.xycoon.com/standerrorb(1).htm>standard
  -      * error of the slope estimate</a>.
  -      * This is usually denoted s(b1). <p>
  -      *
  +      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
  +      * error of the slope estimate</a>,
  +      * usually denoted s(b1). 
  +      * <p>
         * If there are fewer that <strong>three</strong> data pairs in the model,
  -      * or if there is no variation in x, this returns <code>NaN</code>.
  +      * or if there is no variation in x, this returns <code>Double.NaN</code>.
         *
         * @return standard error associated with slope estimate
         */
        public double getSlopeStdErr() {
            double ssx = getSumSquaresX();
  -         if (ssx == Double.NaN) {
  -             return ssx;
  +         return Math.sqrt(getMeanSquareError() / ssx);
  +     }
  +     
  +     /**
  +      * Returns the half-width of a 95% confidence interval for the slope
  +      * estimate.
  +      * <p>
  +      * The 95% confidence interval is 
  +      * <p>
  +      * <code>(getSlope() - getSlopeConfidenceInterval(), 
  +      * getSlope() + getSlopeConfidenceInterval())</code>
  +      * <p>
  +      * If there are fewer that <strong>three</strong> observations in the 
  +      * model, or if there is no variation in x, this returns 
  +      * <code>Double.NaN</code>.
  +      * <p>
  +      * <strong>Usage Note</strong>:<br>
  +      * The validity of this statistic depends on the assumption that the 
  +      * observations included in the model are drawn from a
  +      * <a href="http://mathworld.wolfram.com/
  +      * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>.
  +      *
  +      * @return half-width of 95% confidence interval for the slope estimate
  +      */
  +     public double getSlopeConfidenceInterval() {   
  +        return getSlopeConfidenceInterval(0.05d); 
  +     }
  +     
  +     /**
  +      * Returns the half-width of a (100-100*alpha)% confidence interval for 
  +      * the slope estimate.
  +      * <p>
  +      * The (100-100*alpha)% confidence interval is 
  +      * <p>
  +      * <code>(getSlope() - getSlopeConfidenceInterval(), 
  +      * getSlope() + getSlopeConfidenceInterval())</code>
  +      * <p>
  +      * To request, for example, a 99% confidence interval, use 
  +      * <code>alpha = .01</code>
  +      * <p>
  +      * <strong>Usage Note</strong>:<br>
  +      * The validity of this statistic depends on the assumption that the 
  +      * observations included in the model are drawn from a
  +      * <a href="http://mathworld.wolfram.com/
  +      * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>.
  +      * <p>
  +      * <strong> Preconditions:</strong><ul>
  +      * <li>If there are fewer that <strong>three</strong> observations in the 
  +      * model, or if there is no variation in x, this returns 
  +      * <code>Double.NaN</code>. 
  +      * </li>
  +      * <li><code>(0 < alpha < 1)</code>; otherwise an 
  +      * <code>IllegalArgumentException</code> is thrown.
  +      * </li></ul>    
  +      *
  +      * @param alpha the desired significance level 
  +      * @return half-width of 95% confidence interval for the slope estimate
  +      */
  +     public double getSlopeConfidenceInterval(double alpha) { 
  +         if (alpha >= 1 || alpha <= 0) {
  +             throw new IllegalArgumentException();
            }
  -         return Math.sqrt(getMeanSquareError()/ssx);
  +         return getSlopeStdErr() * 
  +            getTDistribution().inverseCummulativeProbability(1d - alpha / 2d); 
  +     }
  +     
  +     /**
  +      * Returns the significance level of the slope (equiv) correlation. 
  +      * <p>
  +      * Specifically, the returned value is the smallest <code>alpha</code>
  +      * such that the slope confidence interval with significance level
  +      * equal to <code>alpha</code> does not include <code>0</code>.
  +      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
  +      * <p>
  +      * <strong>Usage Note</strong>:<br>
  +      * The validity of this statistic depends on the assumption that the 
  +      * observations included in the model are drawn from a
  +      * <a href="http://mathworld.wolfram.com/
  +      * BivariateNormalDistribution.html">Bivariate Normal Distribution</a>.
  +      * <p>
  +      * If there are fewer that <strong>three</strong> observations in the 
  +      * model, or if there is no variation in x, this returns 
  +      * <code>Double.NaN</code>.
  +      *
  +      * @return significance level for slope/correlation
  +      */
  +     public double getSignificance() {
  +         return (1d - getTDistribution().cummulativeProbability(
  +                Math.abs(getSlope()) / getSlopeStdErr()));
        }
        
        // ---------------------Private methods-----------------------------------
        
        /**
        * Returns the intercept of the estimated regression line, given the slope.
  +     * <p>
        * Will return <code>NaN</code> if slope is <code>NaN</code>.
        *
        * @param slope current slope
        * @return the intercept of the regression line
        */
        private double getIntercept(double slope) {
  -         if (slope == Double.NaN) {
  -             return slope;
  -         }
  -         return (sumY - slope*sumX)/((double) n);
  +         return (sumY - slope * sumX) / ((double) n);
        }
          
        /**
  -      * Returns the sum of squared errors</a> associated with the regression 
  -      * model, using the slope of the regression line.  Returns NaN if the slope
  -      * is NaN.
  -      *
  +      * Returns the sum of squared errors associated with the regression 
  +      * model, using the slope of the regression line. 
  +      * <p> 
  +      * Returns NaN if the slope is NaN.
  +      * 
  +      * @param b1 current slope
         * @return sum of squared errors associated with the regression model
         */
        private double getSumSquaredErrors(double b1) {
  -         if (b1 == Double.NaN) {
  -             return b1;
  -         }
            double b0 = getIntercept(b1);
  -         return sumSqY - b0*sumY - b1*sumXY;
  +         return sumSqY - b0 * sumY - b1 * sumXY;
        } 
        
        /**
  @@ -416,24 +507,30 @@
            if (n < 2) {
                return Double.NaN;
            }
  -         return sumSqX - sumX*sumX/(double) n;
  +         return sumSqX - sumX * sumX / (double) n;
        }
        
        /** 
         * Computes r-square from the slope.
  -      * will return NaN if slope is Nan
  +      * <p>
  +      * will return NaN if slope is Nan.
         *
  +      * @param b1 current slope
         * @return r-square
         */
        private double getRSquare(double b1) {
  -         if (b1 == Double.NaN) {
  -             return b1;
  -         }
            double ssto = getTotalSumSquares();
  -         if (ssto == Double.NaN) {
  -             return ssto;
  -         }
  -         return (ssto - getSumSquaredErrors(b1))/ssto;
  +         return (ssto - getSumSquaredErrors(b1)) / ssto;
  +     }
  +     
  +     /**
  +      * Uses distribution framework to get a t distribution instance 
  +      * with df = n - 2
  +      *
  +      * @return t distribution with df = n - 2
  +      */
  +     private TDistribution getTDistribution() {
  +         return DistributionFactory.newInstance().createTDistribution(n - 2);
        }
   }
   
  
  
  
  1.2       +40 -2     jakarta-commons-sandbox/math/src/test/org/apache/commons/math/stat/BivariateRegressionTest.java
  
  Index: BivariateRegressionTest.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/math/src/test/org/apache/commons/math/stat/BivariateRegressionTest.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- BivariateRegressionTest.java	29 May 2003 20:35:46 -0000	1.1
  +++ BivariateRegressionTest.java	11 Jun 2003 03:33:05 -0000	1.2
  @@ -87,6 +87,18 @@
           {90.6,111.6},{86.5,122.2},{89.7,117.6},{90.6,121.1},{82.8,136.0},
           {70.1,154.2},{65.4,153.6},{61.3,158.5},{62.5,140.6},{63.6,136.2},
           {52.6,168.0},{59.7,154.3},{59.5,149.0},{61.3,165.5}};
  +        
  +    /*
  +     * From Moore and Mcabe, "Introduction to the Practice of Statistics"
  +     * Example 10.3 
  +     */
  +    private double[][] infData = {{15.6,5.2},{26.8,6.1},{37.8,8.7},{36.4,8.5},
  +    {35.5,8.8},{18.6,4.9},{15.3,4.5},{7.9,2.5},{0.0,1.1}};
  +    
  +    /*
  +     * From http://www.xycoon.com/simple_linear_regression.htm
  +     */
  +    private double[][] infData2 = {{1,3},{2,5},{3,7},{4,14},{5,11}};
       
       public BivariateRegressionTest(String name) {
           super(name);
  @@ -221,6 +233,32 @@
          regression.addData(data);
          assertEquals("number of observations",53,regression.getN());
       }
  -                    
  +    
  +    public void testInference() {
  +       BivariateRegression regression = new BivariateRegression();
  +       regression.addData(infData);
  +       assertEquals("slope confidence interval", 0.0271,
  +            regression.getSlopeConfidenceInterval(),0.0001);
  +       assertEquals("slope std err",0.01146,
  +            regression.getSlopeStdErr(),0.0001);
  +       
  +       regression = new BivariateRegression();
  +       regression.addData(infData2);
  +       assertEquals("significance", 0.023331,
  +            regression.getSignificance(),0.0001);
  +       
  +       //FIXME: get a real example to test against with alpha = .01
  +       assertTrue("tighter means wider",
  +            regression.getSlopeConfidenceInterval() < 
  +            regression.getSlopeConfidenceInterval(0.01));
  +       
  +       try {
  +           double x = regression.getSlopeConfidenceInterval(1);
  +           fail("expecting IllegalArgumentException for alpha = 1");
  +       } catch (IllegalArgumentException ex) {
  +           ;
  +       }
  +       
  +    }                                        
   }
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org