You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/03/06 05:58:35 UTC

svn commit: r919700 - in /lucene/mahout/trunk: conf/driver.classes.props conf/rowid.props utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java

Author: jmannix
Date: Sat Mar  6 04:58:34 2010
New Revision: 919700

URL: http://svn.apache.org/viewvc?rev=919700&view=rev
Log:
Sequential (non-M/R) cmdline job to turn SequenceFile<Text,VectorWritable> into a pair of SequenceFile<IntWritable,VectorWritable> and SequenceFile<IntWritable,Text>.

Added:
    lucene/mahout/trunk/conf/rowid.props
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
Modified:
    lucene/mahout/trunk/conf/driver.classes.props

Modified: lucene/mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/driver.classes.props?rev=919700&r1=919699&r2=919700&view=diff
==============================================================================
--- lucene/mahout/trunk/conf/driver.classes.props (original)
+++ lucene/mahout/trunk/conf/driver.classes.props Sat Mar  6 04:58:34 2010
@@ -11,6 +11,7 @@
 org.apache.mahout.utils.vectors.lucene.Driver = lucene.vector : Generate Vectors from a Lucene index
 org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate sequence files (of Text) from a directory
 org.apache.mahout.text.SparseVectorsFromSequenceFiles = seq2sparse: Sparse Vector generation from Text sequence files
+org.apache.mahout.utils.vectors.RowIdJob = rowid : Map SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>, SequenceFile<IntWritable,Text>}
 org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
 org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test Bayes Classifier
 org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train Bayes Classifier

Added: lucene/mahout/trunk/conf/rowid.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/rowid.props?rev=919700&view=auto
==============================================================================
--- lucene/mahout/trunk/conf/rowid.props (added)
+++ lucene/mahout/trunk/conf/rowid.props Sat Mar  6 04:58:34 2010
@@ -0,0 +1,2 @@
+#i|input = 
+#o|output = 

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java?rev=919700&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java Sat Mar  6 04:58:34 2010
@@ -0,0 +1,101 @@
+package org.apache.mahout.utils.vectors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.AbstractJob;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+/**
+ * Sequential (non-MapReduce) command-line job that copies the entries of the
+ * {@code SequenceFile<Text,VectorWritable>} files under --input into two sequence files under --output:
+ * "matrix" (IntWritable row id to vector) and "docIndex" (IntWritable row id to original Text key).
+ */
+public class RowIdJob extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
+
+  /**
+   * Reads every file listed under the input path and appends each (key, vector) pair to the two
+   * outputs under a row id that starts at 0 and increases by one per entry.
+   *
+   * @param strings command-line arguments; "--input" and "--output" are read from the parsed result
+   * @return 0 on success, -1 when no parsed arguments are available
+   * @throws Exception if reading the input or writing either output fails
+   */
+  @Override
+  public int run(String[] strings) throws Exception {
+    Map<String,String> parsedArgs = parseArguments(strings);
+    if (parsedArgs == null) {
+      // NOTE(review): presumably parseArguments returns null after printing help or on a parse
+      // error -- confirm against AbstractJob. Bail out instead of failing with an NPE below.
+      return -1;
+    }
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(conf);
+    Path inputPath = fs.makeQualified(new Path(parsedArgs.get("--input")));
+    Path outputPath = fs.makeQualified(new Path(parsedArgs.get("--output")));
+    Path indexPath = new Path(outputPath, "docIndex");
+    Path matrixPath = new Path(outputPath, "matrix");
+
+    // The same three instances are passed to every reader.next() / append() call below.
+    IntWritable docId = new IntWritable();
+    Text inputKey = new Text();
+    VectorWritable v = new VectorWritable();
+    int i = 0;
+
+    // Nested try/finally: both writers and the open reader get closed even if a read or append throws.
+    SequenceFile.Writer indexWriter =
+        SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class, Text.class);
+    try {
+      SequenceFile.Writer matrixWriter =
+          SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class, VectorWritable.class);
+      try {
+        for (FileStatus status : fs.listStatus(inputPath)) {
+          SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
+          try {
+            while (reader.next(inputKey, v)) {
+              docId.set(i);
+              indexWriter.append(docId, inputKey);
+              matrixWriter.append(docId, v);
+              i++;
+            }
+          } finally {
+            reader.close();
+          }
+        }
+      } finally {
+        matrixWriter.close();
+      }
+    } finally {
+      indexWriter.close();
+    }
+
+    // The column count is taken from the last vector read. With no entries there is no vector to
+    // ask (NOTE(review): v.get() is presumably null then -- confirm), so report 0 columns.
+    int numCols = i == 0 ? 0 : v.get().size();
+    // Pass the path as a parameter so braces in it cannot be mistaken for placeholders.
+    log.info("Wrote out matrix with {} rows and {} columns to {}",
+             new Object[] {i, numCols, matrixPath});
+    return 0;
+  }
+
+  /**
+   * Command-line entry point: runs this job through ToolRunner with the given arguments.
+   *
+   * @param args command-line arguments forwarded to run(String[])
+   * @throws Exception if the job fails
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new RowIdJob(), args);
+  }
+
+}