You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/05/23 17:22:30 UTC
svn commit: r947427 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clustering/lda/
core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/
examples/src/main/java/org/ap...
Author: jeastman
Date: Sun May 23 15:22:28 2010
New Revision: 947427
URL: http://svn.apache.org/viewvc?rev=947427&view=rev
Log:
MAHOUT-294: made the -k option optional by default, but set required=true for Dirichlet
MAHOUT-398: added minimal vector renaming to improve clarity
MAHOUT-397: fixes to allow setting -nr in vector output stages
All tests ran before I installed the Java update. Will test on EC2 again today.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
mahout/trunk/examples/bin/build-reuters.sh
mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Sun May 23 15:22:28 2010
@@ -83,7 +83,7 @@ public class DirichletDriver {
Option inputOpt = DefaultOptionCreator.inputOption().create();
Option outputOpt = DefaultOptionCreator.outputOption().create();
Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
- Option kOpt = DefaultOptionCreator.kOption().create();
+ Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
Option alphaOpt = DefaultOptionCreator.alphaOption().create();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Sun May 23 15:22:28 2010
@@ -145,10 +145,10 @@ public final class LDADriver {
double oldLL = Double.NEGATIVE_INFINITY;
boolean converged = false;
- for (int iteration = 0; ((maxIterations < 1) || (iteration < maxIterations)) && !converged; iteration++) {
+ for (int iteration = 1; ((maxIterations < 1) || (iteration <= maxIterations)) && !converged; iteration++) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- Path stateOut = new Path(output, "state-" + (iteration + 1));
+ Path stateOut = new Path(output, "state-" + iteration);
double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, topicSmoothing, numReducers);
double relChange = (oldLL - ll) / oldLL;
@@ -157,7 +157,7 @@ public final class LDADriver {
log.info("(Old LL: {})", oldLL);
log.info("(Rel Change: {})", relChange);
- converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
+ converged = (iteration > 3) && (relChange < OVERALL_CONVERGENCE);
stateIn = stateOut;
oldLL = ll;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java Sun May 23 15:22:28 2010
@@ -112,7 +112,7 @@ public final class DefaultOptionCreator
* Returns a default command line option for specification of numbers of clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
*/
public static DefaultOptionBuilder kOption() {
- return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+ return new DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
"The number of clusters to create").withShortName("k");
}
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
@@ -38,14 +38,15 @@ fi
cd ../..
./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
-# to use k-Means clustering, uncomment the next two lines
-#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
+# to use k-Means clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
#./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
-# to use LDA clustering, uncomment the next two lines
-#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# to use LDA clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
+#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
+#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Sun May 23 15:22:28 2010
@@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
private final FileSystem fs;
public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
- if (chunkSizeInMB < 64) {
- chunkSizeInMB = 64;
- } else if (chunkSizeInMB > 1984) {
+ if (chunkSizeInMB > 1984) {
chunkSizeInMB = 1984;
}
maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sun May 23 15:22:28 2010
@@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
+ + " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
.withDescription(
- "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+ "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
.withShortName("seq").create();
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("w").create();
+ "If set, overwrite the output directory").withShortName("ow").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
@@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
if (cmdLine.hasOption(numReduceTasksOpt)) {
reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
}
- log.info("Pass1 reduce tasks: {}", reduceTasks);
+ log.info("Number of reduce tasks: {}", reduceTasks);
Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
TFIDFConverter.processTfIdf(
new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent, norm,
- sequentialAccessOutput);
+ sequentialAccessOutput, reduceTasks);
}
} catch (OptionException e) {
log.error("Exception", e);
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Sun May 23 15:22:28 2010
@@ -69,13 +69,16 @@ public final class PartialVectorMerger {
* output directory were the partial vectors have to be created
* @param normPower
* The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
+ * @param numReducers
+ * The number of reducers to spawn
* @throws IOException
*/
public static void mergePartialVectors(List<Path> partialVectorPaths,
Path output,
float normPower,
int dimension,
- boolean sequentialAccess) throws IOException {
+ boolean sequentialAccess,
+ int numReducers) throws IOException {
if (normPower != NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >= 0");
}
@@ -101,6 +104,7 @@ public final class PartialVectorMerger {
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(PartialVectorMergeReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
+ conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun May 23 15:22:28 2010
@@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
*/
public final class DictionaryVectorizer {
- public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+ public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
public static final String MIN_SUPPORT = "min.support";
@@ -153,7 +153,7 @@ public final class DictionaryVectorizer
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
- maxTermDimension[0], sequentialAccess);
+ maxTermDimension[0], sequentialAccess, numReducers);
}
Configuration conf = new Configuration();
@@ -162,7 +162,7 @@ public final class DictionaryVectorizer
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
if (dictionaryChunks.size() > 1) {
PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
- sequentialAccess);
+ sequentialAccess, numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -245,6 +245,8 @@ public final class DictionaryVectorizer
* location of the chunk of features and the id's
* @param output
* output directory were the partial vectors have to be created
+ * @param numReducers
+ * the desired number of reducer tasks
* @throws IOException
*/
private static void makePartialVectors(Path input,
@@ -252,7 +254,8 @@ public final class DictionaryVectorizer
Path dictionaryFilePath,
Path output,
int dimension,
- boolean sequentialAccess) throws IOException {
+ boolean sequentialAccess,
+ int numReducers) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -279,6 +282,7 @@ public final class DictionaryVectorizer
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setReducerClass(TFPartialVectorReducer.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
+ conf.setNumReduceTasks(numReducers);
HadoopUtil.overwriteOutput(output);
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sun May 23 15:22:28 2010
@@ -66,7 +66,7 @@ public final class TFIDFConverter {
public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
- private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+ private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
private static final String FREQUENCY_FILE = "frequency.file-";
@@ -99,17 +99,21 @@ public final class TFIDFConverter {
* @param output
* output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
* are generated
- * @param minDf
- * The minimum document frequency. Default 1
- * @param maxDFPercent
- * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
- * Expressed as an integer between 0 and 100. Default 99
* @param chunkSizeInMegabytes
* the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
* stage. Its recommended you calculated this based on the number of cores and the free memory
* available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
* recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
* partial vectors without thrashing the system due to increased swapping
+ * @param minDf
+ * The minimum document frequency. Default 1
+ * @param maxDFPercent
+ * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
+ * Expressed as an integer between 0 and 100. Default 99
+ * @param numReducers
+ * The number of reducers to spawn. This also affects the possible parallelism since each reducer
+ * will typically produce a single output file containing tf-idf vectors for a subset of the
+ * documents in the corpus.
* @throws IOException
*/
public static void processTfIdf(Path input,
@@ -118,7 +122,8 @@ public final class TFIDFConverter {
int minDf,
int maxDFPercent,
float normPower,
- boolean sequentialAccessOutput) throws IOException {
+ boolean sequentialAccessOutput,
+ int numReducers) throws IOException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
} else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -158,7 +163,7 @@ public final class TFIDFConverter {
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
if (dictionaryChunks.size() > 1) {
PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
- datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
+ datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Sun May 23 15:22:28 2010
@@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
- getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false);
+ getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false, 1);
}
}
Re: svn commit: r947427 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/
examples/bin/ examples/src/main/java/org/ap...
Posted by Jeff Eastman <jd...@windwardsolutions.com>.
All tests run on Java 1.6.0_10.
On 5/23/10 8:22 AM, jeastman@apache.org wrote:
> Author: jeastman
> Date: Sun May 23 15:22:28 2010
> New Revision: 947427
>
> URL: http://svn.apache.org/viewvc?rev=947427&view=rev
> Log:
> MAHOUT-294: fixed -k option as optional but added required=true for Dirichlet
> MAHOUT-398: added minimal vector renaming to improve clarity
> MAHOUT-397: fixes to allow setting -nr in vector output stages
>
> Tests all ran before I installed Java update. Will test on EC2 again today.
>
> Modified:
> mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
> mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
> mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
> mahout/trunk/examples/bin/build-reuters.sh
> mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Sun May 23 15:22:28 2010
> @@ -83,7 +83,7 @@ public class DirichletDriver {
> Option inputOpt = DefaultOptionCreator.inputOption().create();
> Option outputOpt = DefaultOptionCreator.outputOption().create();
> Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
> - Option kOpt = DefaultOptionCreator.kOption().create();
> + Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
> Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
> Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
> Option alphaOpt = DefaultOptionCreator.alphaOption().create();
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Sun May 23 15:22:28 2010
> @@ -145,10 +145,10 @@ public final class LDADriver {
> double oldLL = Double.NEGATIVE_INFINITY;
> boolean converged = false;
>
> - for (int iteration = 0; ((maxIterations< 1) || (iteration< maxIterations))&& !converged; iteration++) {
> + for (int iteration = 1; ((maxIterations< 1) || (iteration<= maxIterations))&& !converged; iteration++) {
> log.info("Iteration {}", iteration);
> // point the output to a new directory per iteration
> - Path stateOut = new Path(output, "state-" + (iteration + 1));
> + Path stateOut = new Path(output, "state-" + iteration);
> double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, topicSmoothing, numReducers);
> double relChange = (oldLL - ll) / oldLL;
>
> @@ -157,7 +157,7 @@ public final class LDADriver {
> log.info("(Old LL: {})", oldLL);
> log.info("(Rel Change: {})", relChange);
>
> - converged = (iteration> 2)&& (relChange< OVERALL_CONVERGENCE);
> + converged = (iteration> 3)&& (relChange< OVERALL_CONVERGENCE);
> stateIn = stateOut;
> oldLL = ll;
> }
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java Sun May 23 15:22:28 2010
> @@ -112,7 +112,7 @@ public final class DefaultOptionCreator
> * Returns a default command line option for specification of numbers of clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
> */
> public static DefaultOptionBuilder kOption() {
> - return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
> + return new DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
> new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
> "The number of clusters to create").withShortName("k");
> }
>
> Modified: mahout/trunk/examples/bin/build-reuters.sh
> URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/examples/bin/build-reuters.sh (original)
> +++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
> @@ -38,14 +38,15 @@ fi
>
> cd ../..
> ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
> -./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
> -./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
> +./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
>
> -# to use k-Means clustering, uncomment the next two lines
> -#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
> +# to use k-Means clustering, uncomment the next three lines
> +#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
> +#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
> #./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
>
> -# to use LDA clustering, uncomment the next two lines
> -#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
> -#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
> +# to use LDA clustering, uncomment the next three lines
> +#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
> +#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
> +#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
>
>
> Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
> +++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Sun May 23 15:22:28 2010
> @@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
> private final FileSystem fs;
>
> public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
> - if (chunkSizeInMB< 64) {
> - chunkSizeInMB = 64;
> - } else if (chunkSizeInMB> 1984) {
> + if (chunkSizeInMB> 1984) {
> chunkSizeInMB = 1984;
> }
> maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sun May 23 15:22:28 2010
> @@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
> abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
> .withDescription(
> "(Optional) The maximum size of ngrams to create"
> - + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
> + + " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
> Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
> .withDescription(
> - "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
> + "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
> .withShortName("seq").create();
>
> Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
> - "If set, overwrite the output directory").withShortName("w").create();
> + "If set, overwrite the output directory").withShortName("ow").create();
> Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
> .create();
>
> @@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
> if (cmdLine.hasOption(numReduceTasksOpt)) {
> reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
> }
> - log.info("Pass1 reduce tasks: {}", reduceTasks);
> + log.info("Number of reduce tasks: {}", reduceTasks);
>
> Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
> if (cmdLine.hasOption(analyzerNameOpt)) {
> @@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
> TFIDFConverter.processTfIdf(
> new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
> new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent, norm,
> - sequentialAccessOutput);
> + sequentialAccessOutput, reduceTasks);
> }
> } catch (OptionException e) {
> log.error("Exception", e);
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Sun May 23 15:22:28 2010
> @@ -69,13 +69,16 @@ public final class PartialVectorMerger {
> * output directory were the partial vectors have to be created
> * @param normPower
> * The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
> + * @param numReducers
> + * The number of reducers to spawn
> * @throws IOException
> */
> public static void mergePartialVectors(List<Path> partialVectorPaths,
> Path output,
> float normPower,
> int dimension,
> - boolean sequentialAccess) throws IOException {
> + boolean sequentialAccess,
> + int numReducers) throws IOException {
> if (normPower != NO_NORMALIZING && normPower < 0) {
> throw new IllegalArgumentException("normPower must either be -1 or >= 0");
> }
> @@ -101,6 +104,7 @@ public final class PartialVectorMerger {
> conf.setInputFormat(SequenceFileInputFormat.class);
> conf.setReducerClass(PartialVectorMergeReducer.class);
> conf.setOutputFormat(SequenceFileOutputFormat.class);
> + conf.setNumReduceTasks(numReducers);
>
> HadoopUtil.overwriteOutput(output);
>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun May 23 15:22:28 2010
> @@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
> */
> public final class DictionaryVectorizer {
>
> - public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
> + public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
>
> public static final String MIN_SUPPORT = "min.support";
>
> @@ -153,7 +153,7 @@ public final class DictionaryVectorizer
> Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
> partialVectorPaths.add(partialVectorOutputPath);
> makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
> - maxTermDimension[0], sequentialAccess);
> + maxTermDimension[0], sequentialAccess, numReducers);
> }
>
> Configuration conf = new Configuration();
> @@ -162,7 +162,7 @@ public final class DictionaryVectorizer
> Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
> if (dictionaryChunks.size() > 1) {
> PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
> - sequentialAccess);
> + sequentialAccess, numReducers);
> HadoopUtil.deletePaths(partialVectorPaths, fs);
> } else {
> Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
> @@ -245,6 +245,8 @@ public final class DictionaryVectorizer
> * location of the chunk of features and the id's
> * @param output
> * output directory were the partial vectors have to be created
> + * @param numReducers
> + * the desired number of reducer tasks
> * @throws IOException
> */
> private static void makePartialVectors(Path input,
> @@ -252,7 +254,8 @@ public final class DictionaryVectorizer
> Path dictionaryFilePath,
> Path output,
> int dimension,
> - boolean sequentialAccess) throws IOException {
> + boolean sequentialAccess,
> + int numReducers) throws IOException {
>
> Configurable client = new JobClient();
> JobConf conf = new JobConf(DictionaryVectorizer.class);
> @@ -279,6 +282,7 @@ public final class DictionaryVectorizer
> conf.setInputFormat(SequenceFileInputFormat.class);
> conf.setReducerClass(TFPartialVectorReducer.class);
> conf.setOutputFormat(SequenceFileOutputFormat.class);
> + conf.setNumReduceTasks(numReducers);
>
> HadoopUtil.overwriteOutput(output);
>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sun May 23 15:22:28 2010
> @@ -66,7 +66,7 @@ public final class TFIDFConverter {
>
> public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
>
> - private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
> + private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
>
> private static final String FREQUENCY_FILE = "frequency.file-";
>
> @@ -99,17 +99,21 @@ public final class TFIDFConverter {
> * @param output
> * output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
> * are generated
> - * @param minDf
> - * The minimum document frequency. Default 1
> - * @param maxDFPercent
> - * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
> - * Expressed as an integer between 0 and 100. Default 99
> * @param chunkSizeInMegabytes
> * the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
> * stage. Its recommended you calculated this based on the number of cores and the free memory
> * available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
> * recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
> * partial vectors without thrashing the system due to increased swapping
> + * @param minDf
> + * The minimum document frequency. Default 1
> + * @param maxDFPercent
> + * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
> + * Expressed as an integer between 0 and 100. Default 99
> + * @param numReducers
> + * The number of reducers to spawn. This also affects the possible parallelism since each reducer
> + * will typically produce a single output file containing tf-idf vectors for a subset of the
> + * documents in the corpus.
> * @throws IOException
> */
> public static void processTfIdf(Path input,
> @@ -118,7 +122,8 @@ public final class TFIDFConverter {
> int minDf,
> int maxDFPercent,
> float normPower,
> - boolean sequentialAccessOutput) throws IOException {
> + boolean sequentialAccessOutput,
> + int numReducers) throws IOException {
> if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
> chunkSizeInMegabytes = MIN_CHUNKSIZE;
> } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
> @@ -158,7 +163,7 @@ public final class TFIDFConverter {
> Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
> if (dictionaryChunks.size() > 1) {
> PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
> - datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
> + datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
> HadoopUtil.deletePaths(partialVectorPaths, fs);
> } else {
> Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
>
> Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Sun May 23 15:22:28 2010
> @@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
> DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
> getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
> TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
> - getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false);
> + getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false, 1);
>
> }
> }
>
>
>
>