You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/03 22:36:52 UTC
svn commit: r906255 -
/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
Author: robinanil
Date: Wed Feb 3 21:36:51 2010
New Revision: 906255
URL: http://svn.apache.org/viewvc?rev=906255&view=rev
Log:
MAHOUT-273 RandomSeedGenerator doesn't estimate cluster centers when the input path is a directory. It now iterates over all the files in the input directory to generate sample vectors.
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=906255&r1=906254&r2=906255&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Wed Feb 3 21:36:51 2010
@@ -17,6 +17,7 @@
package org.apache.mahout.clustering.kmeans;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
@@ -65,36 +66,50 @@
}
boolean newFile = fs.createNewFile(outFile);
if (newFile) {
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(input), conf);
- Writable key = (Writable) reader.getKeyClass().newInstance();
- VectorWritable value = (VectorWritable) reader.getValueClass().newInstance();
+ Path inputPathPattern;
+ Path inputPath = new Path(input);
+
+ if (fs.getFileStatus(inputPath).isDir() == true) {
+ inputPathPattern = new Path(inputPath.toString() + "/*");
+ } else {
+ inputPathPattern = inputPath;
+ }
+
+ FileStatus[] inputFiles = fs.globStatus(inputPathPattern);
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
Random random = RandomUtils.getRandom();
-
List<Text> chosenTexts = new ArrayList<Text>(k);
List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
int nextClusterId = 0;
- while (reader.next(key, value)) {
- Cluster newCluster = new Cluster(value.get(), nextClusterId++);
- newCluster.addPoint(value.get());
- Text newText = new Text(key.toString());
- int currentSize = chosenTexts.size();
- if (currentSize < k) {
- chosenTexts.add(newText);
- chosenClusters.add(newCluster);
- } else if (random.nextInt(currentSize + 1) == 0) { // with chance 1/(currentSize+1) pick new element
- int indexToRemove = random.nextInt(currentSize); // evict one chosen randomly
- chosenTexts.remove(indexToRemove);
- chosenClusters.remove(indexToRemove);
- chosenTexts.add(newText);
- chosenClusters.add(newCluster);
+
+ for (FileStatus fileStatus : inputFiles) {
+ if(fileStatus.isDir() == true) continue; // select only the top level files
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
+ Writable key = (Writable) reader.getKeyClass().newInstance();
+ VectorWritable value = (VectorWritable) reader.getValueClass().newInstance();
+ while (reader.next(key, value)) {
+ Cluster newCluster = new Cluster(value.get(), nextClusterId++);
+ newCluster.addPoint(value.get());
+ Text newText = new Text(key.toString());
+ int currentSize = chosenTexts.size();
+ if (currentSize < k) {
+ chosenTexts.add(newText);
+ chosenClusters.add(newCluster);
+ } else if (random.nextInt(currentSize + 1) == 0) { // with chance 1/(currentSize+1) pick new element
+ int indexToRemove = random.nextInt(currentSize); // evict one chosen randomly
+ chosenTexts.remove(indexToRemove);
+ chosenClusters.remove(indexToRemove);
+ chosenTexts.add(newText);
+ chosenClusters.add(newCluster);
+ }
}
+ reader.close();
}
+
for (int i = 0; i < k; i++) {
writer.append(chosenTexts.get(i), chosenClusters.get(i));
}
log.info("Wrote {} vectors to {}", k, outFile);
- reader.close();
writer.close();
}