You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/04/06 16:44:05 UTC

svn commit: r1310357 - /mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java

Author: srowen
Date: Fri Apr  6 14:44:05 2012
New Revision: 1310357

URL: http://svn.apache.org/viewvc?rev=1310357&view=rev
Log:
MAHOUT-973: one more file needed for the fix that computes maxDF as a percentage of the total vector count

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1310357&r1=1310356&r2=1310357&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Fri Apr  6 14:44:05 2012
@@ -284,8 +284,9 @@ public final class SparseVectorsFromSequ
          Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
 
          // Calculate the standard deviation
-         double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0D, conf);
-         maxDF = (int) (maxDFSigma * stdDev);
+         double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
+         long vectorCount = docFrequenciesFeatures.getFirst()[1];
+         maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
 
          // Prune the term frequency vectors
          Path tfDir = new Path(outputDir, tfDirName);