You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/04/06 16:44:05 UTC
svn commit: r1310357 -
/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
Author: srowen
Date: Fri Apr 6 14:44:05 2012
New Revision: 1310357
URL: http://svn.apache.org/viewvc?rev=1310357&view=rev
Log:
MAHOUT-973: One more file needed for the fix that computes maxDF as a percentage of the total count.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1310357&r1=1310356&r2=1310357&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Fri Apr 6 14:44:05 2012
@@ -284,8 +284,9 @@ public final class SparseVectorsFromSequ
Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
// Calculate the standard deviation
- double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0D, conf);
- maxDF = (int) (maxDFSigma * stdDev);
+ double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
+ long vectorCount = docFrequenciesFeatures.getFirst()[1];
+ maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
// Prune the term frequency vectors
Path tfDir = new Path(outputDir, tfDirName);