You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/03 22:36:52 UTC

svn commit: r906255 - /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java

Author: robinanil
Date: Wed Feb  3 21:36:51 2010
New Revision: 906255

URL: http://svn.apache.org/viewvc?rev=906255&view=rev
Log:
MAHOUT-273 RandomSeedGenerator doesn't estimate cluster centers when the input path is a directory. Now iterates over all the files in the input directory to generate sample vectors.

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=906255&r1=906254&r2=906255&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Wed Feb  3 21:36:51 2010
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.clustering.kmeans;
 
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
@@ -65,36 +66,50 @@
     }
     boolean newFile = fs.createNewFile(outFile);
     if (newFile) {
-      SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(input), conf);
-      Writable key = (Writable) reader.getKeyClass().newInstance();
-      VectorWritable value = (VectorWritable) reader.getValueClass().newInstance();
+      Path inputPathPattern;
+      Path inputPath = new Path(input);
+      
+      if (fs.getFileStatus(inputPath).isDir() == true) {
+        inputPathPattern = new Path(inputPath.toString() + "/*");
+      } else {
+        inputPathPattern = inputPath;
+      }
+      
+      FileStatus[] inputFiles = fs.globStatus(inputPathPattern);
       SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
       Random random = RandomUtils.getRandom();
-
       List<Text> chosenTexts = new ArrayList<Text>(k);
       List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
       int nextClusterId = 0;
-      while (reader.next(key, value)) {
-        Cluster newCluster = new Cluster(value.get(), nextClusterId++);
-        newCluster.addPoint(value.get());
-        Text newText = new Text(key.toString());
-        int currentSize = chosenTexts.size();
-        if (currentSize < k) {
-          chosenTexts.add(newText);
-          chosenClusters.add(newCluster);
-        } else if (random.nextInt(currentSize + 1) == 0) { // with chance 1/(currentSize+1) pick new element
-          int indexToRemove = random.nextInt(currentSize); // evict one chosen randomly
-          chosenTexts.remove(indexToRemove);
-          chosenClusters.remove(indexToRemove);
-          chosenTexts.add(newText);
-          chosenClusters.add(newCluster);
+      
+      for (FileStatus fileStatus : inputFiles) {
+        if(fileStatus.isDir() == true) continue; // select only the top level files
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+        VectorWritable value = (VectorWritable) reader.getValueClass().newInstance();
+        while (reader.next(key, value)) {
+          Cluster newCluster = new Cluster(value.get(), nextClusterId++);
+          newCluster.addPoint(value.get());
+          Text newText = new Text(key.toString());
+          int currentSize = chosenTexts.size();
+          if (currentSize < k) {
+            chosenTexts.add(newText);
+            chosenClusters.add(newCluster);
+          } else if (random.nextInt(currentSize + 1) == 0) { // with chance 1/(currentSize+1) pick new element
+            int indexToRemove = random.nextInt(currentSize); // evict one chosen randomly
+            chosenTexts.remove(indexToRemove);
+            chosenClusters.remove(indexToRemove);
+            chosenTexts.add(newText);
+            chosenClusters.add(newCluster);
+          }
         }
+        reader.close();
       }
+      
       for (int i = 0; i < k; i++) {
         writer.append(chosenTexts.get(i), chosenClusters.get(i));
       }
       log.info("Wrote {} vectors to {}", k, outFile);
-      reader.close();
       writer.close();
     }