You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/03/12 08:21:32 UTC
svn commit: r1455426 - in /mahout/trunk/core/src/main/java/org/apache/mahout/classifier: AbstractVectorClassifier.java naivebayes/AbstractNaiveBayesClassifier.java

Author: ssc
Date: Tue Mar 12 07:21:32 2013
New Revision: 1455426

URL: http://svn.apache.org/r1455426
Log:
MAHOUT-1104 Improve Javadoc for AbstractVectorClassifier

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java?rev=1455426&r1=1455425&r2=1455426&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java Tue Mar 12 07:21:32 2013
@@ -25,63 +25,108 @@ import org.apache.mahout.math.Vector;
 import com.google.common.base.Preconditions;
 
 /**
- * Defines the interface for classifiers that take input as a vector.  This is implemented
- * as an abstract class so that it can implement a number of handy convenience methods
- * related to classification of vectors.
+ * Defines the interface for classifiers that take a vector as input. This is
+ * implemented as an abstract class so that it can implement a number of handy
+ * convenience methods related to classification of vectors.
+ * 
+ * <p>
+ * A classifier takes an input vector and calculates the scores (usually
+ * probabilities) that the input vector belongs to one of <code>n</code>
+ * categories. In <code>AbstractVectorClassifier</code> each category is denoted
+ * by an integer <code>c</code> between <code>0</code> and <code>n-1</code>
+ * (inclusive).
+ * 
+ * <p>
+ * New users should start by looking at {@link #classifyFull} (not {@link #classify}).
+ * 
  */
 public abstract class AbstractVectorClassifier {
-  // ------ These are all that are necessary to define a vector classifier.
 
-  /**
-   * Returns the number of categories for the target variable.  A vector classifier
-   * will encode it's output using a zero-based 1 of numCategories encoding.
+  /** Minimum allowable log likelihood value. */
+  public static final double MIN_LOG_LIKELIHOOD = -100.0;
+
+  /**
+   * Returns the number of categories that a target variable can be assigned to.
+   * A vector classifier will encode its output as an integer from
+   * <code>0</code> to <code>numCategories()-1</code> (inclusive).
+   * 
    * @return The number of categories.
    */
   public abstract int numCategories();
 
   /**
-   * Classify a vector returning a vector of numCategories-1 scores.  It is assumed that
-   * the score for the missing category is one minus the sum of the scores that are returned.
-   *
-   * Note that the missing score is the 0-th score.
+   * Compute and return a vector containing <code>n-1</code> scores, where
+   * <code>n</code> is equal to <code>numCategories()</code>, given an input
+   * vector <code>instance</code>. Higher scores indicate that the input vector
+   * is more likely to belong to that category. The categories are denoted by
+   * the integers <code>0</code> through <code>n-1</code> (inclusive), and the
+   * scores in the returned vector correspond to categories 1 through
+   * <code>n-1</code> (leaving out category 0). It is assumed that the score for
+   * category 0 is one minus the sum of the scores in the returned vector.
+   * 
    * @param instance  A feature vector to be classified.
-   * @return  A vector of probabilities in 1 of n-1 encoding.
+   * @return A vector of probabilities in 1 of <code>n-1</code> encoding.
    */
   public abstract Vector classify(Vector instance);
-
+  
   /**
-   * Classify a vector, but don't apply the inverse link function.  For logistic regression
-   * and other generalized linear models, this is just the linear part of the classification.
+   * Compute and return a vector of scores before applying the inverse link
+   * function. For logistic regression and other generalized linear models, this
+   * is just the linear part of the classification.
+   * 
+   * <p>
+   * The implementation of this method provided by {@link AbstractVectorClassifier} throws an
+   * {@link UnsupportedOperationException}. Your subclass must explicitly override this method to support
+   * this operation.
+   * 
    * @param features  A feature vector to be classified.
-   * @return  A vector of scores.  If transformed by the link function, these will become probabilities.
+   * @return A vector of scores. If transformed by the link function, these will become probabilities.
    */
   public Vector classifyNoLink(Vector features) {
-    throw new UnsupportedOperationException(
-        this.getClass().getName() + " doesn't support classification without a link");
+    throw new UnsupportedOperationException(this.getClass().getName()
+        + " doesn't support classification without a link");
   }
 
   /**
    * Classifies a vector in the special case of a binary classifier where
-   * {@link #classify(Vector)} would return a vector with only one element.  As such,
-   * using this method can void the allocation of a vector.
-   * @param instance   The feature vector to be classified.
+   * {@link #classify(Vector)} would return a vector with only one element. As
+   * such, using this method can avoid the allocation of a vector.
+   * 
+   * @param instance The feature vector to be classified.
    * @return The score for category 1.
-   *
+   * 
    * @see #classify(Vector)
    */
   public abstract double classifyScalar(Vector instance);
 
-  // ------- From here on, we have convenience methods that provide an easier API to use.
-
   /**
-   * Returns n probabilities, one for each category.  If you can use an n-1 coding, and are touchy
-   * about allocation performance, then the classify method is probably better to use.  The 0-th
-   * element of the score vector returned by this method is the missing score as computed by the
-   * classify method.
-   *
+   * Computes and returns a vector containing <code>n</code> scores, where
+   * <code>n</code> is <code>numCategories()</code>, given an input vector
+   * <code>instance</code>. Higher scores indicate that the input vector is more
+   * likely to belong to the corresponding category. The categories are denoted
+   * by the integers <code>0</code> through <code>n-1</code> (inclusive).
+   * 
+   * <p>
+   * Using this method it is possible to classify an input vector, for example,
+   * by selecting the category with the largest score. If
+   * <code>classifier</code> is an instance of
+   * <code>AbstractVectorClassifier</code> and <code>input</code> is a
+   * <code>Vector</code> of features describing an element to be classified,
+   * then the following code could be used to classify <code>input</code>.<br>
+   * <code>
+   * Vector scores = classifier.classifyFull(input);<br>
+   * int assignedCategory = scores.maxValueIndex();<br>
+   * </code> Here <code>assignedCategory</code> is the index of the category
+   * with the maximum score.
+   * 
+   * <p>
+   * If an <code>n-1</code> encoding is acceptable, and allocation performance
+   * is an issue, then the {@link #classify(Vector)} method is probably better
+   * to use.
+   * 
    * @see #classify(Vector)
    * @see #classifyFull(Vector r, Vector instance)
-   *
+   * 
    * @param instance A vector of features to be classified.
    * @return A vector of probabilities, one for each category.
    */
@@ -90,14 +135,32 @@ public abstract class AbstractVectorClas
   }
 
   /**
-   * Returns n probabilities, one for each category into a pre-allocated vector.  One
-   * vector allocation is still done in the process of multiplying by the coefficient
-   * matrix, but that is hard to avoid.  The cost of such an ephemeral allocation is
-   * very small in any case compared to the multiplication itself.
-   *
-   * @param r        Where to put the results.
-   * @param instance A vector of features to be classified.
-   * @return A vector of probabilities, one for each category.
+   * Computes and returns a vector containing <code>n</code> scores, where
+   * <code>n</code> is <code>numCategories()</code>, given an input vector
+   * <code>instance</code>. Higher scores indicate that the input vector is more
+   * likely to belong to the corresponding category. The categories are denoted
+   * by the integers <code>0</code> through <code>n-1</code> (inclusive). The
+   * main difference between this method and {@link #classifyFull(Vector)} is
+   * that this method allows a user to provide a previously allocated
+   * <code>Vector r</code> to store the returned scores.
+   * 
+   * <p>
+   * Using this method it is possible to classify an input vector, for example,
+   * by selecting the category with the largest score. If
+   * <code>classifier</code> is an instance of
+   * <code>AbstractVectorClassifier</code>, <code>result</code> is a non-null
+   * <code>Vector</code>, and <code>input</code> is a <code>Vector</code> of
+   * features describing an element to be classified, then the following code
+   * could be used to classify <code>input</code>.<br>
+   * <code>
+   * Vector scores = classifier.classifyFull(result, input); // Notice that scores == result<br>
+   * int assignedCategory = scores.maxValueIndex();<br>
+   * </code> Here <code>assignedCategory</code> is the index of the category
+   * with the maximum score.
+   * 
+   * @param r Where to put the results.
+   * @param instance  A vector of features to be classified.
+   * @return A vector of scores/probabilities, one for each category.
    */
   public Vector classifyFull(Vector r, Vector instance) {
     r.viewPart(1, numCategories() - 1).assign(classify(instance));
@@ -107,12 +170,13 @@ public abstract class AbstractVectorClas
 
 
   /**
-   * Returns n-1 probabilities, one for each category but the last, for each row of a matrix. The
-   * probability of the missing 0-th category is 1 - rowSum(this result).
-   *
-   * @param data The matrix whose rows are vectors to classify
-   * @return A matrix of scores, one row per row of the input matrix, one column for each but the
-   *         last category.
+   * Returns n-1 probabilities, one for each of the categories 1 through
+   * <code>n-1</code>, for each row of a matrix, where <code>n</code> is equal
+   * to <code>numCategories()</code>. The probability of the missing 0-th
+   * category is 1 - rowSum(this result).
+   * 
+   * @param data  The matrix whose rows are the input vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one column for each category except category 0.
    */
   public Matrix classify(Matrix data) {
     Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
@@ -123,11 +187,10 @@ public abstract class AbstractVectorClas
   }
 
   /**
-   * Returns n probabilities, one for each category, for each row of a matrix.
-   *
-   * @param data The matrix whose rows are vectors to classify
-   * @return A matrix of scores, one row per row of the input matrix, one column for each but the
-   *         last category.
+   * Returns a matrix where the rows of the matrix each contain <code>n</code> probabilities, one for each category.
+   * 
+   * @param data  The matrix whose rows are the input vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one column for each category.
    */
   public Matrix classifyFull(Matrix data) {
     Matrix r = new DenseMatrix(data.numRows(), numCategories());
@@ -138,11 +201,11 @@ public abstract class AbstractVectorClas
   }
 
   /**
-   * Returns a vector of probabilities of the first category, one for each row of a matrix. This
-   * only makes sense if there are exactly two categories, but calling this method in that case can
-   * save a number of vector allocations.
-   *
-   * @param data The matrix whose rows are vectors to classify
+   * Returns a vector of probabilities of category 1, one for each row
+   * of a matrix. This only makes sense if there are exactly two categories, but
+   * calling this method in that case can save a number of vector allocations.
+   * 
+   * @param data  The matrix whose rows are vectors to classify
    * @return A vector of scores, with one value per row of the input matrix.
    */
   public Vector classifyScalar(Matrix data) {
@@ -156,28 +219,29 @@ public abstract class AbstractVectorClas
   }
 
   /**
-   * Returns a measure of how good the classification for a particular example actually is.
-   *
-   * @param actual The correct category for the example.
-   * @param data   The vector to be classified.
-   * @return The log likelihood of the correct answer as estimated by the current model.  This will
-   *         always be <= 0 and larger (closer to 0) indicates better accuracy.  In order to simplify
-   *         code that maintains running averages, we bound this value at -100.
+   * Returns a measure of how good the classification for a particular example
+   * actually is.
+   * 
+   * @param actual  The correct category for the example.
+   * @param data  The vector to be classified.
+   * @return The log likelihood of the correct answer as estimated by the current model. This will always be &lt;= 0
+   *  and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
+   *  we bound this value at -100.
    */
   public double logLikelihood(int actual, Vector data) {
     if (numCategories() == 2) {
       double p = classifyScalar(data);
       if (actual > 0) {
-        return Math.max(-100.0, Math.log(p));
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
       } else {
-        return Math.max(-100.0, Math.log1p(-p));
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
       }
     } else {
       Vector p = classify(data);
       if (actual > 0) {
-        return Math.max(-100.0, Math.log(p.get(actual - 1)));
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
       } else {
-        return Math.max(-100.0, Math.log1p(-p.zSum()));
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
       }
     }
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java?rev=1455426&r1=1455425&r2=1455426&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java Tue Mar 12 07:21:32 2013
@@ -23,7 +23,13 @@ import org.apache.mahout.classifier.Abst
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 
-/** Class implementing the Naive Bayes Classifier Algorithm */
+/**
+ * Class implementing the Naive Bayes Classifier Algorithm. Note that this class
+ * supports {@link #classifyFull}, but not <code>classify</code> or
+ * <code>classifyScalar</code>. These two methods are not supported because
+ * the scores computed by a NaiveBayesClassifier do not represent
+ * probabilities.
+ */
 public abstract class AbstractNaiveBayesClassifier extends AbstractVectorClassifier {
 
   private final NaiveBayesModel model;
@@ -66,11 +72,13 @@ public abstract class AbstractNaiveBayes
     return r;
   }
 
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
   @Override
   public double classifyScalar(Vector instance) {
     throw new UnsupportedOperationException("Not supported in Naive Bayes");
   }
   
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
   @Override
   public Vector classify(Vector instance) {
     throw new UnsupportedOperationException("probabilites not supported in Naive Bayes");