You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/03/06 05:58:35 UTC
svn commit: r919700 - in /lucene/mahout/trunk: conf/driver.classes.props
conf/rowid.props
utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
Author: jmannix
Date: Sat Mar 6 04:58:34 2010
New Revision: 919700
URL: http://svn.apache.org/viewvc?rev=919700&view=rev
Log:
Sequential (non-M/R) cmdline job to turn SequenceFile<Text,VectorWritable> into a pair of SequenceFile<IntWritable,VectorWritable> and SequenceFile<IntWritable,Text>.
Added:
lucene/mahout/trunk/conf/rowid.props
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
Modified:
lucene/mahout/trunk/conf/driver.classes.props
Modified: lucene/mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/driver.classes.props?rev=919700&r1=919699&r2=919700&view=diff
==============================================================================
--- lucene/mahout/trunk/conf/driver.classes.props (original)
+++ lucene/mahout/trunk/conf/driver.classes.props Sat Mar 6 04:58:34 2010
@@ -11,6 +11,7 @@
org.apache.mahout.utils.vectors.lucene.Driver = lucene.vector : Generate Vectors from a Lucene index
org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate sequence files (of Text) from a directory
org.apache.mahout.text.SparseVectorsFromSequenceFiles = seq2sparse: Sparse Vector generation from Text sequence files
+org.apache.mahout.utils.vectors.RowIdJob = rowid : Map SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>, SequenceFile<IntWritable,Text>}
org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test Bayes Classifier
org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train Bayes Classifier
Added: lucene/mahout/trunk/conf/rowid.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/rowid.props?rev=919700&view=auto
==============================================================================
--- lucene/mahout/trunk/conf/rowid.props (added)
+++ lucene/mahout/trunk/conf/rowid.props Sat Mar 6 04:58:34 2010
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java?rev=919700&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java Sat Mar 6 04:58:34 2010
@@ -0,0 +1,67 @@
+package org.apache.mahout.utils.vectors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.AbstractJob;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+/**
+ * Sequential (non-MapReduce) command-line job that re-keys the records of the
+ * sequence files found under {@code --input}. Every (Text key, VectorWritable)
+ * record is given the next consecutive integer id, starting at 0, and is written
+ * to two sequence files under {@code --output}:
+ * <ul>
+ *   <li>{@code docIndex}: IntWritable id -> original Text key</li>
+ *   <li>{@code matrix}: IntWritable id -> VectorWritable</li>
+ * </ul>
+ */
+public class RowIdJob extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(RowIdJob.class);
+
+  /**
+   * Runs the conversion.
+   *
+   * @param strings command-line arguments; {@code --input} and {@code --output} are read
+   * @return 0 on success, -1 if the arguments could not be parsed
+   * @throws Exception if reading the input or writing either output file fails
+   */
+  @Override
+  public int run(String[] strings) throws Exception {
+    Map<String,String> parsedArgs = parseArguments(strings);
+    // NOTE(review): parseArguments is presumed to return null when the arguments
+    // cannot be parsed (e.g. help requested) -- confirm against AbstractJob.
+    if (parsedArgs == null) {
+      return -1;
+    }
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(conf);
+    Path inputPath = fs.makeQualified(new Path(parsedArgs.get("--input")));
+    Path outputPath = fs.makeQualified(new Path(parsedArgs.get("--output")));
+    Path indexPath = new Path(outputPath, "docIndex");
+    Path matrixPath = new Path(outputPath, "matrix");
+
+    int rowCount = 0;
+    int numCols = 0;
+
+    // Nested try/finally so that every writer/reader is closed even when an
+    // append or read fails part-way through. Close order on the success path
+    // is the same as before: matrix writer first, then index writer.
+    SequenceFile.Writer indexWriter =
+        SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class, Text.class);
+    try {
+      SequenceFile.Writer matrixWriter =
+          SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class, VectorWritable.class);
+      try {
+        // Reusable holders; each reader.next() call overwrites their contents.
+        IntWritable docId = new IntWritable();
+        Text inputKey = new Text();
+        VectorWritable v = new VectorWritable();
+
+        for (FileStatus status : fs.listStatus(inputPath)) {
+          SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
+          try {
+            while (reader.next(inputKey, v)) {
+              docId.set(rowCount);
+              indexWriter.append(docId, inputKey);
+              matrixWriter.append(docId, v);
+              rowCount++;
+            }
+          } finally {
+            reader.close();
+          }
+        }
+
+        // The column count is taken from the last vector read. With no rows at
+        // all, v was never populated, so report 0 columns instead of
+        // dereferencing it.
+        if (rowCount > 0) {
+          numCols = v.get().size();
+        }
+      } finally {
+        matrixWriter.close();
+      }
+    } finally {
+      indexWriter.close();
+    }
+
+    log.info("Wrote out matrix with {} rows and {} columns to " + matrixPath, rowCount, numCols);
+    return 0;
+  }
+
+  /**
+   * Command-line entry point; hands the arguments to {@link #run(String[])} through ToolRunner.
+   *
+   * @param args command-line arguments
+   * @throws Exception if the job fails
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new RowIdJob(), args);
+  }
+
+}