You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/04/05 07:51:48 UTC

svn commit: r930799 - in /lucene/mahout/trunk/math: pom.xml src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java

Author: srowen
Date: Mon Apr  5 05:51:48 2010
New Revision: 930799

URL: http://svn.apache.org/viewvc?rev=930799&view=rev
Log:
Restore logging to SVD-related code

Modified:
    lucene/mahout/trunk/math/pom.xml
    lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java
    lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java

Modified: lucene/mahout/trunk/math/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/pom.xml?rev=930799&r1=930798&r2=930799&view=diff
==============================================================================
--- lucene/mahout/trunk/math/pom.xml (original)
+++ lucene/mahout/trunk/math/pom.xml Mon Apr  5 05:51:48 2010
@@ -100,6 +100,19 @@
     </dependency>
 
     <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.5.8</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-jcl</artifactId>
+      <version>1.5.8</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <scope>test</scope>

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java?rev=930799&r1=930798&r2=930799&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/hebbian/HebbianSolver.java Mon Apr  5 05:51:48 2010
@@ -23,6 +23,7 @@ import java.util.Random;
 
 import java.util.ArrayList;
 
+import org.apache.mahout.math.AbstractMatrix;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Matrix;
@@ -32,6 +33,8 @@ import org.apache.mahout.math.decomposer
 import org.apache.mahout.math.function.TimesFunction;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.function.PlusMult;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * The Hebbian solver is an iterative, sparse, singular value decomposition solver, based on the paper
@@ -41,13 +44,16 @@ import org.apache.mahout.math.function.P
  */
 public class HebbianSolver {
 
+  private static final Logger log = LoggerFactory.getLogger(HebbianSolver.class);
+
   private final EigenUpdater updater;
   private final SingularVectorVerifier verifier;
   private final double convergenceTarget;
   private final int maxPassesPerEigen;
   private final Random rng = new Random();
 
-  //private int numPasses = 0;
+  private int numPasses = 0;
+  private static final boolean debug = false;
 
   /**
    * Creates a new HebbianSolver
@@ -157,6 +163,7 @@ public class HebbianSolver {
     int cols = corpus.numCols();
     Matrix eigens = new DenseMatrix(desiredRank, cols);
     List<Double> eigenValues = new ArrayList<Double>();
+    log.info("Finding " + desiredRank + " singular vectors of matrix with " + corpus.numRows() + " rows, via Hebbian");
     /**
      * The corpusProjections matrix is a running cache of the residual projection of each corpus vector against all
      * of the previously found singular vectors.  Without this, if multiple passes over the data are made (per
@@ -179,6 +186,17 @@ public class HebbianSolver {
             updater.update(currentEigen, corpus.getRow(corpusRow), state);
         }
         state.setFirstPass(false);
+        if (debug) {
+          if (previousEigen == null) {
+            previousEigen = currentEigen.clone();
+          } else {
+            double dot = currentEigen.dot(previousEigen);
+            if (dot > 0) {
+              dot /= (currentEigen.norm(2) * previousEigen.norm(2));
+            }
+           // log.info("Current pass * previous pass = {}", dot);
+          }
+        }
       }
       // converged!
       double eigenValue = state.getStatusProgress().get(state.getStatusProgress().size() - 1).getEigenValue();
@@ -188,6 +206,7 @@ public class HebbianSolver {
       eigens.assignRow(i, currentEigen);
       eigenValues.add(eigenValue);
       state.setCurrentEigenValues(eigenValues);
+      log.info("Found eigenvector {}, eigenvalue: {}", i, eigenValue);
 
       /**
        *  TODO: Persist intermediate output!
@@ -197,7 +216,7 @@ public class HebbianSolver {
       state.setActivationDenominatorSquared(0);
       state.setActivationNumerator(0);
       state.getStatusProgress().clear();
-      //numPasses = 0;
+      numPasses = 0;
     }
     return state;
   }
@@ -234,11 +253,13 @@ public class HebbianSolver {
   protected boolean hasNotConverged(Vector currentPseudoEigen,
                                     Matrix corpus,
                                     TrainingState state) {
-    //numPasses++;
+    numPasses++;
     if (state.isFirstPass()) {
+      log.info("First pass through the corpus, no need to check convergence...");
       return true;
     }
     Matrix previousEigens = state.getCurrentEigens();
+    log.info("Have made {} passes through the corpus, checking convergence...", numPasses);
     /*
      * Step 1: orthogonalize currentPseudoEigen by subtracting off eigen(i) * helper.get(i)
      * Step 2: zero-out the helper vector because it has already helped.
@@ -248,11 +269,20 @@ public class HebbianSolver {
       currentPseudoEigen.assign(previousEigen, new PlusMult(-state.getHelperVector().get(i)));
       state.getHelperVector().set(i, 0);
     }
+    if (debug && currentPseudoEigen.norm(2) > 0) {
+      for (int i = 0; i < state.getNumEigensProcessed(); i++) {
+        Vector previousEigen = previousEigens.getRow(i);
+        log.info("dot with previous: {}", (previousEigen.dot(currentPseudoEigen)) / currentPseudoEigen.norm(2));
+      }
+    }
     /*
      * Step 3: verify how eigen-like the prospective eigen is.  This is potentially asynchronous.
      */
     EigenStatus status = verify(corpus, currentPseudoEigen);
-    if (!status.inProgress()) {
+    if (status.inProgress()) {
+      log.info("Verifier not finished, making another pass...");
+    } else {
+      log.info("Has 1 - cosAngle: {}, convergence target is: {}", (1 - status.getCosAngle()), convergenceTarget);
       state.getStatusProgress().add(status);
     }
     return (state.getStatusProgress().size() <= maxPassesPerEigen && 1 - status.getCosAngle() > convergenceTarget);
@@ -270,6 +300,7 @@ public class HebbianSolver {
     String corpusDir = props.getProperty("solver.input.dir");
     String outputDir = props.getProperty("solver.output.dir");
     if (corpusDir == null || corpusDir.length() == 0 || outputDir == null || outputDir.length() == 0) {
+      log.error("{} must contain values for solver.input.dir and solver.output.dir", propertiesFile);
       return;
     }
     int inBufferSize = Integer.parseInt(props.getProperty("solver.input.bufferSize"));
@@ -290,7 +321,11 @@ public class HebbianSolver {
     } else {
       //  corpus = new ParallelMultiplyingDiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize, numThreads);
     }
+    long now = System.currentTimeMillis();
     TrainingState finalState = solver.solve(corpus, rank);
+    long time = (System.currentTimeMillis() - now) / 1000;
+    log.info("Solved {} eigenVectors in {} seconds.  Persisted to {}",
+             new Object[] {finalState.getCurrentEigens().size()[AbstractMatrix.ROW], time, outputDir});
   }
 
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java?rev=930799&r1=930798&r2=930799&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java Mon Apr  5 05:51:48 2010
@@ -35,6 +35,8 @@ import org.apache.mahout.math.matrix.Dou
 import org.apache.mahout.math.matrix.DoubleMatrix2D;
 import org.apache.mahout.math.matrix.impl.DenseDoubleMatrix2D;
 import org.apache.mahout.math.matrix.linalg.EigenvalueDecomposition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * <p>Simple implementation of the <a href="http://en.wikipedia.org/wiki/Lanczos_algorithm">Lanczos algorithm</a> for
@@ -63,6 +65,8 @@ import org.apache.mahout.math.matrix.lin
  */
 public class LanczosSolver {
 
+  private static final Logger log = LoggerFactory.getLogger(LanczosSolver.class);
+
   public static final double SAFE_MAX = 1.0e150;
 
   private static final double NANOS_IN_MILLI = 1.0e6;
@@ -73,7 +77,7 @@ public class LanczosSolver {
 
   private final Map<TimingSection, Long> startTimes = new EnumMap<TimingSection, Long>(TimingSection.class);
   private final Map<TimingSection, Long> times = new EnumMap<TimingSection, Long>(TimingSection.class);
-  protected double scaleFactor = 0.0;
+  protected double scaleFactor = 0;
 
   private static final class Scale implements UnaryFunction {
     private final double d;
@@ -99,6 +103,7 @@ public class LanczosSolver {
                     Matrix eigenVectors,
                     List<Double> eigenValues,
                     boolean isSymmetric) {
+    log.info("Finding {} singular vectors of matrix with {} rows, via Lanczos", desiredRank, corpus.numRows());
     Vector currentVector = getInitialVector(corpus);
     Vector previousVector = new DenseVector(currentVector.size());
     Matrix basis = new SparseRowMatrix(new int[]{desiredRank, corpus.numCols()});
@@ -109,6 +114,7 @@ public class LanczosSolver {
     for (int i = 1; i < desiredRank; i++) {
       startTime(TimingSection.ITERATE);
       Vector nextVector = isSymmetric ? corpus.times(currentVector) : corpus.timesSquared(currentVector);
+      log.info("{} passes through the corpus so far...", i);
       calculateScaleFactor(nextVector);
       nextVector.assign(new Scale(1 / scaleFactor));
       nextVector.assign(previousVector, new PlusMult(-beta));
@@ -122,6 +128,7 @@ public class LanczosSolver {
       // and normalize
       beta = nextVector.norm(2);
       if (outOfRange(beta) || outOfRange(alpha)) {
+        log.warn("Lanczos parameters out of range: alpha = {}, beta = {}.  Bailing out early!", alpha, beta);
         break;
       }
       final double b = beta;
@@ -138,6 +145,7 @@ public class LanczosSolver {
     }
     startTime(TimingSection.TRIDIAG_DECOMP);
 
+    log.info("Lanczos iteration complete - now to diagonalize the tri-diagonal auxiliary matrix.");
     // at this point, have tridiag all filled out, and basis is all filled out, and orthonormalized
     EigenvalueDecomposition decomp = new EigenvalueDecomposition(triDiag);
 
@@ -156,8 +164,10 @@ public class LanczosSolver {
       }
       realEigen = realEigen.normalize();
       eigenVectors.assignRow(i, realEigen);
+      log.info("Eigenvector {} found with eigenvalue {}", i, eigenVals.get(i));
       eigenValues.add(eigenVals.get(i));
     }
+    log.info("LanczosSolver finished.");
     endTime(TimingSection.FINAL_EIGEN_CREATE);
   }