Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 21:27:30 UTC
svn commit: r909900 [3/4] - in
/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df: ./ builder/
callback/ data/ data/conditions/ mapred/ mapred/inmem/ mapred/partial/
mapreduce/ mapreduce/inmem/ mapreduce/partial/ node/ ref/ split/ tools/
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java Sat Feb 13 20:27:25 2010
@@ -26,38 +26,47 @@
/**
* Stores/Loads the intermediate results of step1 needed by step2.<br>
- * This class should not be needed outside of the partial package, so all its
- * methods are protected.<br>
+ * This class should not be needed outside of the partial package, so all its methods are protected.<br>
*/
public class InterResults {
- private InterResults() {
- }
-
+ private InterResults() {}
+
/**
* Load the trees and the keys returned from the first step
*
- * @param fs forest path file system
- * @param forestPath file path to the (key,tree) file
- * @param numMaps number of map tasks
- * @param numTrees total number of trees in the forest
- * @param partition current partition
- * @param keys array of size numTrees, will contain the loaded keys
- * @param trees array of size numTrees, will contain the loaded trees
+ * @param fs
+ * forest path file system
+ * @param forestPath
+ * file path to the (key,tree) file
+ * @param numMaps
+ * number of map tasks
+ * @param numTrees
+ * total number of trees in the forest
+ * @param partition
+ * current partition
+ * @param keys
+ * array of size numTrees, will contain the loaded keys
+ * @param trees
+ * array of size numTrees, will contain the loaded trees
* @return number of instances in the current partition
* @throws IOException
*/
- public static int load(FileSystem fs, Path forestPath, int numMaps,
- int numTrees, int partition, TreeID[] keys, Node[] trees)
- throws IOException {
+ public static int load(FileSystem fs,
+ Path forestPath,
+ int numMaps,
+ int numTrees,
+ int partition,
+ TreeID[] keys,
+ Node[] trees) throws IOException {
if (keys.length != trees.length) {
throw new IllegalArgumentException("keys.length != trees.length");
}
-
+
FSDataInputStream in = fs.open(forestPath);
-
+
TreeID key = new TreeID();
int numInstances = -1;
-
+
try {
// get current partition's size
for (int p = 0; p < numMaps; p++) {
@@ -67,12 +76,12 @@
in.readInt();
}
}
-
+
// load (key, tree)
int current = 0;
for (int index = 0; index < numTrees; index++) {
key.readFields(in);
-
+
if (key.partition() == partition) {
// skip the trees of the current partition
Node.read(in);
@@ -82,49 +91,52 @@
current++;
}
}
-
+
if (current != keys.length) {
throw new IllegalStateException("loaded less keys/trees than expected");
}
} finally {
in.close();
}
-
+
return numInstances;
}
-
+
/**
* Write the forest trees into a file
*
- * @param fs File System
- * @param keys keys returned by the first step
- * @param trees trees returned by the first step
- * @param sizes partitions' sizes in hadoop order
+ * @param fs
+ * File System
+ * @param keys
+ * keys returned by the first step
+ * @param trees
+ * trees returned by the first step
+ * @param sizes
+ * partitions' sizes in hadoop order
* @throws IOException
*/
- public static void store(FileSystem fs, Path forestPath,
- TreeID[] keys, Node[] trees, int[] sizes) throws IOException {
+ public static void store(FileSystem fs, Path forestPath, TreeID[] keys, Node[] trees, int[] sizes) throws IOException {
if (keys.length != trees.length) {
throw new IllegalArgumentException("keys.length != trees.length");
}
-
+
int numTrees = keys.length;
int numMaps = sizes.length;
-
+
FSDataOutputStream out = fs.create(forestPath);
-
+
// write partitions' sizes
for (int p = 0; p < numMaps; p++) {
out.writeInt(sizes[p]);
}
-
+
// write the data
for (int index = 0; index < numTrees; index++) {
keys[index].write(out);
trees[index].write(out);
}
-
+
out.close();
}
-
+
}
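
For reference, a minimal standalone sketch of the file framing that store() writes and load() reads: one int per map task holding that partition's size, followed by the (key, tree) records in tree order. The demo class and its values are hypothetical; plain longs and ints stand in for the serialized TreeID and Node.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class InterResultsLayoutDemo {
  public static void main(String[] args) throws IOException {
    int[] sizes = {100, 80};          // partition sizes, Hadoop order
    long[] keys = {0L, 1L, 100001L};  // stand-ins for serialized TreeIDs

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(buffer);
    for (int size : sizes) {
      out.writeInt(size);             // header: partitions' sizes
    }
    for (long key : keys) {
      out.writeLong(key);             // each record: key first...
      out.writeInt(42);               // ...then a stand-in for the Node bytes
    }
    out.close();

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
    for (int p = 0; p < sizes.length; p++) {
      System.out.println("partition " + p + " size = " + in.readInt());
    }
    for (int t = 0; t < keys.length; t++) {
      System.out.println("key = " + in.readLong() + ", tree = " + in.readInt());
    }
    in.close();
  }
}
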
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java Sat Feb 13 20:27:25 2010
@@ -35,20 +35,19 @@
import org.apache.mahout.df.builder.TreeBuilder;
import org.apache.mahout.df.callback.PredictionCallback;
import org.apache.mahout.df.mapreduce.Builder;
-import org.apache.mahout.df.mapreduce.partial.Step0Job.Step0Output;
import org.apache.mahout.df.mapreduce.MapredOutput;
+import org.apache.mahout.df.mapreduce.partial.Step0Job.Step0Output;
import org.apache.mahout.df.node.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Builds a random forest using partial data. Each mapper uses only the data
- * given by its InputSplit
+ * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
*/
public class PartialBuilder extends Builder {
-
+
private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
-
+
/**
* Indicates if we should run the second step of the builder.<br>
* This parameter is only meant for debugging, so we keep it protected.
@@ -59,111 +58,121 @@
protected static boolean isStep2(Configuration conf) {
return conf.getBoolean("debug.mahout.rf.partial.step2", true);
}
-
+
/**
* Should the second step of the builder run?
*
* @param conf
- * @param value true to indicate that the second step will be launched
+ * @param value
+ * true to indicate that the second step will be launched
*
*/
protected static void setStep2(Configuration conf, boolean value) {
conf.setBoolean("debug.mahout.rf.partial.step2", value);
}
-
- public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
- Path datasetPath, Long seed) {
+
+ public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
}
-
- public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
- Path datasetPath, Long seed, Configuration conf) {
+
+ public PartialBuilder(TreeBuilder treeBuilder,
+ Path dataPath,
+ Path datasetPath,
+ Long seed,
+ Configuration conf) {
super(treeBuilder, dataPath, datasetPath, seed, conf);
}
-
@Override
- protected void configureJob(Job job, int nbTrees, boolean oobEstimate)
- throws IOException {
+ protected void configureJob(Job job, int nbTrees, boolean oobEstimate) throws IOException {
Configuration conf = job.getConfiguration();
job.setJarByClass(PartialBuilder.class);
FileInputFormat.setInputPaths(job, getDataPath());
FileOutputFormat.setOutputPath(job, getOutputPath(conf));
-
+
job.setOutputKeyClass(TreeID.class);
job.setOutputValueClass(MapredOutput.class);
-
+
job.setMapperClass(Step1Mapper.class);
job.setNumReduceTasks(0); // no reducers
-
+
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
-
+
@Override
- protected DecisionForest parseOutput(Job job, PredictionCallback callback)
- throws IOException, ClassNotFoundException, InterruptedException {
+ protected DecisionForest parseOutput(Job job, PredictionCallback callback) throws IOException,
+ ClassNotFoundException,
+ InterruptedException {
Configuration conf = job.getConfiguration();
- int numTrees = getNbTrees(conf);
-
+ int numTrees = Builder.getNbTrees(conf);
+
Path outputPath = getOutputPath(conf);
-
- log.info("Computing partitions' first ids...");
+
+ PartialBuilder.log.info("Computing partitions' first ids...");
Step0Job step0 = new Step0Job(getOutputPath(conf), getDataPath(), getDatasetPath());
Step0Output[] partitions = step0.run(new Configuration(conf));
-
- log.info("Processing the output...");
+
+ PartialBuilder.log.info("Processing the output...");
TreeID[] keys = new TreeID[numTrees];
Node[] trees = new Node[numTrees];
int[] firstIds = Step0Output.extractFirstIds(partitions);
- processOutput(job, outputPath, firstIds, keys, trees, callback);
-
+ PartialBuilder.processOutput(job, outputPath, firstIds, keys, trees, callback);
+
// JobClient should have updated numMaps to the correct number of maps
int numMaps = partitions.length;
-
+
// call the second step in order to complete the oob predictions
- if (callback != null && numMaps > 1 && isStep2(conf)) {
- log.info("*****************************");
- log.info("Second Step");
- log.info("*****************************");
+ if ((callback != null) && (numMaps > 1) && PartialBuilder.isStep2(conf)) {
+ PartialBuilder.log.info("*****************************");
+ PartialBuilder.log.info("Second Step");
+ PartialBuilder.log.info("*****************************");
Step2Job step2 = new Step2Job(getOutputPath(conf), getDataPath(), getDatasetPath(), partitions);
-
+
step2.run(new Configuration(conf), keys, trees, callback);
}
-
+
return new DecisionForest(Arrays.asList(trees));
}
-
+
/**
* Processes the output from the output path.<br>
*
* @param job
- * @param outputPath directory that contains the output of the job
- * @param firstIds partitions' first ids in hadoop's order
- * @param keys can be null
- * @param trees can be null
- * @param callback can be null
+ * @param outputPath
+ * directory that contains the output of the job
+ * @param firstIds
+ * partitions' first ids in hadoop's order
+ * @param keys
+ * can be null
+ * @param trees
+ * can be null
+ * @param callback
+ * can be null
* @throws IOException
*/
- protected static void processOutput(JobContext job, Path outputPath,
- int[] firstIds, TreeID[] keys, Node[] trees, PredictionCallback callback)
- throws IOException {
- if ((keys != null && trees == null)||(keys == null && trees != null)) {
+ protected static void processOutput(JobContext job,
+ Path outputPath,
+ int[] firstIds,
+ TreeID[] keys,
+ Node[] trees,
+ PredictionCallback callback) throws IOException {
+ if (((keys != null) && (trees == null)) || ((keys == null) && (trees != null))) {
throw new IllegalArgumentException("if keys is null, trees should also be null");
}
- if (keys != null && keys.length != trees.length) {
+ if ((keys != null) && (keys.length != trees.length)) {
throw new IllegalArgumentException("keys.length != trees.length");
}
Configuration conf = job.getConfiguration();
FileSystem fs = outputPath.getFileSystem(conf);
-
+
Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+
// read all the outputs
TreeID key = new TreeID();
MapredOutput value = new MapredOutput();
@@ -171,7 +180,7 @@
int index = 0;
for (Path path : outfiles) {
Reader reader = new Reader(fs, path, conf);
-
+
try {
while (reader.next(key, value)) {
if (keys != null) {
@@ -182,7 +191,7 @@
trees[index] = value.getTree();
}
- processOutput(firstIds, key, value, callback);
+ PartialBuilder.processOutput(firstIds, key, value, callback);
index++;
}
@@ -190,30 +199,31 @@
reader.close();
}
}
-
+
// make sure we got all the keys/values
- if (keys != null && index != keys.length) {
+ if ((keys != null) && (index != keys.length)) {
throw new IllegalStateException("Some key/values are missing from the output");
}
}
-
+
/**
- * Process the output, extracting the trees and passing the predictions to the
- * callback
+ * Process the output, extracting the trees and passing the predictions to the callback
*
- * @param firstIds partitions' first ids in hadoop's order
+ * @param firstIds
+ * partitions' first ids in hadoop's order
* @param callback
* @return
*/
- private static void processOutput(int[] firstIds, TreeID key,
- MapredOutput value, PredictionCallback callback) {
-
+ private static void processOutput(int[] firstIds,
+ TreeID key,
+ MapredOutput value,
+ PredictionCallback callback) {
+
if (callback != null) {
int[] predictions = value.getPredictions();
-
+
for (int instanceId = 0; instanceId < predictions.length; instanceId++) {
- callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId,
- predictions[instanceId]);
+ callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, predictions[instanceId]);
}
}
}
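
The prediction bookkeeping above hinges on one line: a mapper-local instance index becomes a forest-wide instance id by adding its partition's first id, as computed by Step0Job. A hypothetical standalone illustration:

public class GlobalInstanceIdDemo {
  public static void main(String[] args) {
    int[] firstIds = {0, 100, 180}; // partitions' first ids, Hadoop order
    int partition = 1;              // taken from the record's TreeID
    int instanceId = 7;             // index local to the mapper's split
    System.out.println(firstIds[partition] + instanceId); // prints 107
  }
}
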
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java Sat Feb 13 20:27:25 2010
@@ -47,25 +47,26 @@
import org.slf4j.LoggerFactory;
/**
- * preparation step of the partial mapreduce builder. Computes some stats that
- * will be used by the builder.
+ * Preparation step of the partial mapreduce builder. Computes some stats that will be used by the builder.
*/
public class Step0Job {
-
+
private static final Logger log = LoggerFactory.getLogger(Step0Job.class);
/** directory that will hold this job's output */
private final Path outputPath;
-
+
/** file that contains the serialized dataset */
private final Path datasetPath;
-
+
/** directory that contains the data used in the first step */
private final Path dataPath;
-
+
/**
- * @param base base directory
- * @param dataPath data used in the first step
+ * @param base
+ * base directory
+ * @param dataPath
+ * data used in the first step
* @param datasetPath
*/
public Step0Job(Path base, Path dataPath, Path datasetPath) {
@@ -73,70 +74,74 @@
this.dataPath = dataPath;
this.datasetPath = datasetPath;
}
-
+
/**
* Computes the partitions' info in Hadoop's order
*
- * @param conf configuration
+ * @param conf
+ * configuration
* @return partitions' info in Hadoop's order
*/
- public Step0Output[] run(Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
+ public Step0Output[] run(Configuration conf) throws IOException,
+ ClassNotFoundException,
+ InterruptedException {
// check the output
- if (outputPath.getFileSystem(conf).exists(outputPath))
+ if (outputPath.getFileSystem(conf).exists(outputPath)) {
throw new IOException("Output path already exists : " + outputPath);
-
+ }
+
// put the dataset into the DistributedCache
// use setCacheFiles() to overwrite the first-step cache files
- URI[] files = { datasetPath.toUri() };
+ URI[] files = {datasetPath.toUri()};
DistributedCache.setCacheFiles(files, conf);
-
+
Job job = new Job(conf);
job.setJarByClass(Step0Job.class);
-
+
FileInputFormat.setInputPaths(job, dataPath);
FileOutputFormat.setOutputPath(job, outputPath);
-
+
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Step0Output.class);
-
+
job.setMapperClass(Step0Mapper.class);
job.setNumReduceTasks(0); // no reducers
-
+
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
-
+
// run the job
job.waitForCompletion(true);
-
+
return parseOutput(job);
}
-
+
/**
* Extracts the output and processes it
- *
+ *
* @return info for each partition in Hadoop's order
* @throws IOException
*/
protected Step0Output[] parseOutput(JobContext job) throws IOException {
Configuration conf = job.getConfiguration();
- log.info("mapred.map.tasks = {}", conf.getInt("mapred.map.tasks", -1));
+ Step0Job.log.info("mapred.map.tasks = {}", conf.getInt("mapred.map.tasks", -1));
FileSystem fs = outputPath.getFileSystem(conf);
-
+
Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+
List<Integer> keys = new ArrayList<Integer>();
List<Step0Output> values = new ArrayList<Step0Output>();
-
+
// read all the outputs
IntWritable key = new IntWritable();
Step0Output value = new Step0Output(0L, 0);
-
+
for (Path path : outfiles) {
Reader reader = new Reader(fs, path, conf);
-
+
try {
while (reader.next(key, value)) {
keys.add(key.get());
@@ -146,10 +151,10 @@
reader.close();
}
}
-
- return processOutput(keys, values);
+
+ return Step0Job.processOutput(keys, values);
}
-
+
/**
* Replaces the first id for each partition in Hadoop's order
*
@@ -164,14 +169,14 @@
Step0Output[] sorted = new Step0Output[numMaps];
values.toArray(sorted);
Arrays.sort(sorted);
-
+
// compute the partitions firstIds (file order)
int[] orderedIds = new int[numMaps];
orderedIds[0] = 0;
for (int p = 1; p < numMaps; p++) {
orderedIds[p] = orderedIds[p - 1] + sorted[p - 1].size;
}
-
+
// update the values' first ids
for (int p = 0; p < numMaps; p++) {
int order = ArrayUtils.indexOf(sorted, values.get(p));
@@ -183,28 +188,27 @@
for (int p = 0; p < numMaps; p++) {
reordered[keys.get(p)] = values.get(p);
}
-
+
return reordered;
}
-
+
/**
* Outputs the first key and the size of the partition
*
*/
- protected static class Step0Mapper extends
- Mapper<LongWritable, Text, IntWritable, Step0Output> {
-
+ protected static class Step0Mapper extends Mapper<LongWritable,Text,IntWritable,Step0Output> {
+
private int partition;
-
+
private int size;
-
+
private Long firstId;
-
+
@Override
protected void setup(Context context) throws IOException, InterruptedException {
configure(context.getConfiguration().getInt("mapred.task.partition", -1));
}
-
+
/**
* Useful when testing
*
@@ -216,64 +220,64 @@
throw new IllegalArgumentException("Wrong partition id : " + partition);
}
}
-
+
@Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+ protected void map(LongWritable key, Text value, Context context) throws IOException,
+ InterruptedException {
if (firstId == null) {
firstId = key.get();
}
-
+
size++;
}
-
+
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new IntWritable(partition), new Step0Output(firstId, size));
}
-
+
}
-
+
/**
* Output of the step0's mappers
*
*/
- public static class Step0Output implements Writable,
- Comparable<Step0Output>, Cloneable {
-
+ public static class Step0Output implements Writable, Comparable<Step0Output>, Cloneable {
+
/**
* first key of the partition<br>
* used to sort the partition
*/
private long firstId;
-
+
/** number of instances in the partition */
private int size;
-
+
protected Step0Output(long firstId, int size) {
this.firstId = firstId;
this.size = size;
}
-
+
protected long getFirstId() {
return firstId;
}
-
+
protected int getSize() {
return size;
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
firstId = in.readLong();
size = in.readInt();
}
-
+
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(firstId);
out.writeInt(size);
}
-
+
@Override
public boolean equals(Object other) {
if (!(other instanceof Step0Output)) {
@@ -281,44 +285,45 @@
}
return firstId == ((Step0Output) other).firstId;
}
-
+
@Override
public int hashCode() {
return (int) firstId;
}
-
+
@Override
protected Step0Output clone() {
return new Step0Output(firstId, size);
}
-
+
@Override
public int compareTo(Step0Output obj) {
- if (firstId < obj.firstId)
+ if (firstId < obj.firstId) {
return -1;
- else if (firstId > obj.firstId)
+ } else if (firstId > obj.firstId) {
return 1;
- else
+ } else {
return 0;
+ }
}
-
+
public static int[] extractFirstIds(Step0Output[] partitions) {
int[] ids = new int[partitions.length];
for (int p = 0; p < partitions.length; p++) {
ids[p] = (int) partitions[p].firstId;
}
-
+
return ids;
}
-
+
public static int[] extractSizes(Step0Output[] partitions) {
int[] sizes = new int[partitions.length];
for (int p = 0; p < partitions.length; p++) {
sizes[p] = partitions[p].size;
}
-
+
return sizes;
}
}
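
processOutput() above orders partitions by their first key (their byte offset, i.e. file order) and assigns each one a first instance id equal to the cumulative size of the partitions that precede it in the file. A hypothetical standalone sketch of that computation, with made-up offsets and sizes:

import java.util.Arrays;
import java.util.Map;
import java.util.TreeMap;

public class FirstIdsDemo {
  public static void main(String[] args) {
    long[] firstKeys = {2048L, 0L, 5000L}; // first byte offsets, Hadoop task order
    int[] sizes = {50, 60, 40};            // instances per partition

    // order partitions by first key, i.e. by position in the input file
    Map<Long, Integer> byOffset = new TreeMap<Long, Integer>();
    for (int p = 0; p < firstKeys.length; p++) {
      byOffset.put(firstKeys[p], p);
    }

    int[] firstIds = new int[sizes.length];
    int cumulative = 0;
    for (Map.Entry<Long, Integer> e : byOffset.entrySet()) {
      firstIds[e.getValue()] = cumulative; // first instance id of that partition
      cumulative += sizes[e.getValue()];
    }
    System.out.println(Arrays.toString(firstIds)); // prints [60, 0, 110]
  }
}
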
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java Sat Feb 13 20:27:25 2010
@@ -39,92 +39,95 @@
import org.slf4j.LoggerFactory;
/**
- * First step of the Partial Data Builder. Builds the trees using the data
- * available in the InputSplit. Predict the oob classes for each tree in its
- * growing partition (input split).
+ * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit.
+ * Predicts the oob classes for each tree in its growing partition (input split).
*/
-public class Step1Mapper extends
- MapredMapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step1Mapper extends MapredMapper<LongWritable,Text,TreeID,MapredOutput> {
+
private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class);
-
+
/** used to convert input values to data instances */
private DataConverter converter;
-
+
private Random rng;
-
+
/** number of trees to be built by this mapper */
private int nbTrees;
-
+
/** id of the first tree */
private int firstTreeId;
-
+
/** mapper's partition */
private int partition;
-
+
/** will contain all instances of this mapper's split */
private final List<Instance> instances = new ArrayList<Instance>();
public int getFirstTreeId() {
return firstTreeId;
}
-
+
@Override
- protected void setup(Context context) throws IOException,
- InterruptedException {
+ protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
-
- configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition",
- -1), Builder.getNumMaps(conf), Builder.getNbTrees(conf));
+
+ configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition", -1),
+ Builder.getNumMaps(conf), Builder.getNbTrees(conf));
}
-
+
/**
* Useful when testing
*
* @param seed
- * @param partition current mapper inputSplit partition
- * @param numMapTasks number of running map tasks
- * @param numTrees total number of trees in the forest
+ * @param partition
+ * current mapper inputSplit partition
+ * @param numMapTasks
+ * number of running map tasks
+ * @param numTrees
+ * total number of trees in the forest
*/
- protected void configure(Long seed, int partition, int numMapTasks,
- int numTrees) {
+ protected void configure(Long seed, int partition, int numMapTasks, int numTrees) {
converter = new DataConverter(getDataset());
-
+
// prepare random-numbers generator
- log.debug("seed : {}", seed);
- if (seed == null)
+ Step1Mapper.log.debug("seed : {}", seed);
+ if (seed == null) {
rng = RandomUtils.getRandom();
- else
+ } else {
rng = RandomUtils.getRandom(seed);
-
+ }
+
// mapper's partition
if (partition < 0) {
throw new IllegalArgumentException("Wrong partition ID");
}
this.partition = partition;
-
+
// compute number of trees to build
- nbTrees = nbTrees(numMapTasks, numTrees, partition);
-
+ nbTrees = Step1Mapper.nbTrees(numMapTasks, numTrees, partition);
+
// compute first tree id
firstTreeId = 0;
for (int p = 0; p < partition; p++) {
- firstTreeId += nbTrees(numMapTasks, numTrees, p);
+ firstTreeId += Step1Mapper.nbTrees(numMapTasks, numTrees, p);
}
-
- log.debug("partition : {}", partition);
- log.debug("nbTrees : {}", nbTrees);
- log.debug("firstTreeId : {}", firstTreeId);
+
+ Step1Mapper.log.debug("partition : {}", partition);
+ Step1Mapper.log.debug("nbTrees : {}", nbTrees);
+ Step1Mapper.log.debug("firstTreeId : {}", firstTreeId);
}
-
+
/**
- * Compute the number of trees for a given partition. The first partition (0)
- * may be longer than the rest of partition because of the remainder.
+ * Compute the number of trees for a given partition. The first partition (0) may build more trees than the
+ * others because of the remainder.
*
- * @param numMaps total number of maps (partitions)
- * @param numTrees total number of trees to build
- * @param partition partition to compute the number of trees for
+ * @param numMaps
+ * total number of maps (partitions)
+ * @param numTrees
+ * total number of trees to build
+ * @param partition
+ * partition to compute the number of trees for
* @return
*/
public static int nbTrees(int numMaps, int numTrees, int partition) {
@@ -132,46 +135,44 @@
if (partition == 0) {
nbTrees += numTrees - nbTrees * numMaps;
}
-
+
return nbTrees;
}
-
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
instances.add(converter.convert((int) key.get(), value.toString()));
}
-
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// prepare the data
- log.debug("partition: {} numInstances: {}", partition, instances.size());
-
+ Step1Mapper.log.debug("partition: {} numInstances: {}", partition, instances.size());
+
Data data = new Data(getDataset(), instances);
Bagging bagging = new Bagging(getTreeBuilder(), data);
-
+
TreeID key = new TreeID();
-
- log.debug("Building {} trees", nbTrees);
+
+ Step1Mapper.log.debug("Building {} trees", nbTrees);
SingleTreePredictions callback = null;
int[] predictions = null;
for (int treeId = 0; treeId < nbTrees; treeId++) {
- log.debug("Building tree number : {}", treeId);
+ Step1Mapper.log.debug("Building tree number : {}", treeId);
if (isOobEstimate() && !isNoOutput()) {
callback = new SingleTreePredictions(data.size());
predictions = callback.getPredictions();
}
-
+
Node tree = bagging.build(treeId, rng, callback);
-
+
key.set(partition, firstTreeId + treeId);
-
+
if (!isNoOutput()) {
MapredOutput emOut = new MapredOutput(tree, predictions);
context.write(key, emOut);
}
}
}
-
+
}
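
The allocation implemented by nbTrees() above is plain integer division with the remainder going to partition 0, and a partition's first tree id is the sum of the allocations before it. A hypothetical standalone sketch:

public class TreeAllocationDemo {

  static int nbTrees(int numMaps, int numTrees, int partition) {
    int trees = numTrees / numMaps;
    if (partition == 0) {
      trees += numTrees - trees * numMaps; // remainder goes to partition 0
    }
    return trees;
  }

  public static void main(String[] args) {
    int numMaps = 3;
    int numTrees = 10;
    int firstTreeId = 0;
    for (int p = 0; p < numMaps; p++) {
      int n = nbTrees(numMaps, numTrees, p);
      System.out.println("partition " + p + ": " + n
          + " trees, first tree id " + firstTreeId);
      firstTreeId += n;
    }
    // prints: partition 0 builds 4 trees (3 + remainder 1), partitions 1 and 2 build 3 each
  }
}
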
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java Sat Feb 13 20:27:25 2010
@@ -35,31 +35,33 @@
import org.apache.mahout.df.node.Node;
/**
- * 2nd step of the partial mapreduce builder. Computes the oob predictions using
- * all the trees of the forest
+ * 2nd step of the partial mapreduce builder. Computes the oob predictions using all the trees of the forest
*/
public class Step2Job {
/** directory that will hold this job's output */
private final Path outputPath;
-
+
/** file that will contain the forest, passed to the maps */
private final Path forestPath;
-
+
/** file that contains the serialized dataset */
private final Path datasetPath;
-
+
/** directory that contains the data used in the first step */
private final Path dataPath;
-
+
/** partitions info in Hadoop's order */
private final Step0Output[] partitions;
/**
- * @param base base directory
- * @param dataPath data used in the first step
+ * @param base
+ * base directory
+ * @param dataPath
+ * data used in the first step
* @param datasetPath
- * @param partitions partitions' infos in hadoop's order
+ * @param partitions
+ * partitions' infos in hadoop's order
*/
public Step2Job(Path base, Path dataPath, Path datasetPath, Step0Output[] partitions) {
this.outputPath = new Path(base, "step2.output");
@@ -68,52 +70,57 @@
this.datasetPath = datasetPath;
this.partitions = partitions;
}
-
+
/**
* Run the second step.
*
- * @param conf configuration
- * @param keys keys returned by the first step
- * @param trees trees returned by the first step
+ * @param conf
+ * configuration
+ * @param keys
+ * keys returned by the first step
+ * @param trees
+ * trees returned by the first step
* @param callback
*/
- public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback)
- throws IOException, ClassNotFoundException, InterruptedException {
+ public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback) throws IOException,
+ ClassNotFoundException,
+ InterruptedException {
if (callback == null) {
// no need to launch the job
return;
}
-
+
int numTrees = keys.length;
// check the output
- if (outputPath.getFileSystem(conf).exists(outputPath))
+ if (outputPath.getFileSystem(conf).exists(outputPath)) {
throw new IOException("Output path already exists : " + outputPath);
-
+ }
+
int[] sizes = Step0Output.extractSizes(partitions);
InterResults.store(forestPath.getFileSystem(conf), forestPath, keys, trees, sizes);
-
+
// needed by the mapper
Builder.setNbTrees(conf, numTrees);
-
+
// put the dataset and the forest into the DistributedCache
// use setCacheFiles() to overwrite the first-step cache files
- URI[] files = { datasetPath.toUri(), forestPath.toUri() };
+ URI[] files = {datasetPath.toUri(), forestPath.toUri()};
DistributedCache.setCacheFiles(files, conf);
-
+
Job job = new Job(conf);
job.setJarByClass(Step2Job.class);
-
+
FileInputFormat.setInputPaths(job, dataPath);
FileOutputFormat.setOutputPath(job, outputPath);
-
+
job.setOutputKeyClass(TreeID.class);
job.setOutputValueClass(MapredOutput.class);
-
+
job.setMapperClass(Step2Mapper.class);
job.setNumReduceTasks(0); // no reducers
-
+
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
@@ -122,7 +129,7 @@
parseOutput(job, callback);
}
}
-
+
/**
* Extracts the output and processes it
*
@@ -130,19 +137,18 @@
* @param callback
* @throws IOException
*/
- protected void parseOutput(Job job, PredictionCallback callback)
- throws IOException {
+ protected void parseOutput(Job job, PredictionCallback callback) throws IOException {
Configuration conf = job.getConfiguration();
int numMaps = Builder.getNumMaps(conf);
int numTrees = Builder.getNbTrees(conf);
-
+
// compute the total number of output values
int total = 0;
for (int partition = 0; partition < numMaps; partition++) {
total += Step2Mapper.nbConcerned(numMaps, numTrees, partition);
}
-
+
int[] firstIds = Step0Output.extractFirstIds(partitions);
PartialBuilder.processOutput(job, outputPath, firstIds, null, null, callback);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java Sat Feb 13 20:27:25 2010
@@ -39,27 +39,25 @@
import org.slf4j.LoggerFactory;
/**
- * Second step of PartialBuilder. Using the trees of the first step, computes
- * the oob predictions for each tree, except those of its own partition, on all
- * instancesof the partition.
+ * Second step of PartialBuilder. Using the trees of the first step, computes the oob predictions for each
+ * tree, except those of its own partition, on all instances of the partition.
*/
-public class Step2Mapper extends Mapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step2Mapper extends Mapper<LongWritable,Text,TreeID,MapredOutput> {
+
private static final Logger log = LoggerFactory.getLogger(Step2Mapper.class);
-
+
private TreeID[] keys;
-
+
private Node[] trees;
-
+
private SingleTreePredictions[] callbacks;
-
+
private DataConverter converter;
-
+
private int partition = -1;
-
+
/** number of instances treated so far */
private int instanceId;
-
@Override
protected void setup(Context context) throws IOException, InterruptedException {
@@ -67,45 +65,46 @@
// get the cached files' paths
URI[] files = DistributedCache.getCacheFiles(conf);
-
- log.info("DistributedCache.getCacheFiles(): {}", ArrayUtils.toString(files));
- if (files == null || files.length < 2) {
+ Step2Mapper.log.info("DistributedCache.getCacheFiles(): {}", ArrayUtils.toString(files));
+
+ if ((files == null) || (files.length < 2)) {
throw new IllegalArgumentException("missing paths from the DistributedCache");
}
-
+
Path datasetPath = new Path(files[0].getPath());
Dataset dataset = Dataset.load(conf, datasetPath);
-
+
int numMaps = Builder.getNumMaps(conf);
int p = conf.getInt("mapred.task.partition", -1);
-
+
// total number of trees in the forest
int numTrees = Builder.getNbTrees(conf);
if (numTrees == -1) {
throw new IllegalArgumentException("numTrees not found !");
}
-
- int nbConcerned = nbConcerned(numMaps, numTrees, p);
+
+ int nbConcerned = Step2Mapper.nbConcerned(numMaps, numTrees, p);
keys = new TreeID[nbConcerned];
trees = new Node[nbConcerned];
-
+
Path forestPath = new Path(files[1].getPath());
FileSystem fs = forestPath.getFileSystem(conf);
- int numInstances = InterResults.load(fs, forestPath, numMaps, numTrees,
- p, keys, trees);
-
- log.debug("partition: {} numInstances: {}", p, numInstances);
+ int numInstances = InterResults.load(fs, forestPath, numMaps, numTrees, p, keys, trees);
+
+ Step2Mapper.log.debug("partition: {} numInstances: {}", p, numInstances);
configure(p, dataset, keys, trees, numInstances);
}
-
+
/**
- * Compute the number of trees that need to classify the instances of this
- * mapper's partition
+ * Compute the number of trees that need to classify the instances of this mapper's partition
*
- * @param numMaps total number of map tasks
- * @param numTrees total number of trees in the forest
- * @param partition mapper's partition
+ * @param numMaps
+ * total number of map tasks
+ * @param numTrees
+ * total number of trees in the forest
+ * @param partition
+ * mapper's partition
* @return
*/
public static int nbConcerned(int numMaps, int numTrees, int partition) {
@@ -115,63 +114,64 @@
// the trees of the mapper's partition are not concerned
return numTrees - Step1Mapper.nbTrees(numMaps, numTrees, partition);
}
-
+
/**
* Useful for testing. Configures the mapper without using a JobConf<br>
* TODO we don't need the keys' partitions, the tree ids should suffice
*
- * @param partition mapper's partition
+ * @param partition
+ * mapper's partition
* @param dataset
- * @param keys keys returned by the first step
- * @param trees trees returned by the first step
- * @param numInstances number of instances in the mapper's partition
+ * @param keys
+ * keys returned by the first step
+ * @param trees
+ * trees returned by the first step
+ * @param numInstances
+ * number of instances in the mapper's partition
*/
- public void configure(int partition, Dataset dataset, TreeID[] keys,
- Node[] trees, int numInstances) {
+ public void configure(int partition, Dataset dataset, TreeID[] keys, Node[] trees, int numInstances) {
this.partition = partition;
if (partition < 0) {
throw new IllegalArgumentException("Wrong partition id : " + partition);
}
-
+
converter = new DataConverter(dataset);
-
+
if (keys.length != trees.length) {
throw new IllegalArgumentException("keys.length != trees.length");
}
int nbConcerned = keys.length;
-
+
this.keys = keys;
this.trees = trees;
-
+
// make sure the trees are not from this partition
for (TreeID key : keys) {
if (key.partition() == partition) {
throw new IllegalArgumentException("a tree from this partition was found !");
}
}
-
+
// init the callbacks
callbacks = new SingleTreePredictions[nbConcerned];
for (int index = 0; index < nbConcerned; index++) {
callbacks[index] = new SingleTreePredictions(numInstances);
}
-
+
}
-
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
-
+
Instance instance = converter.convert(instanceId, value.toString());
-
+
for (int index = 0; index < keys.length; index++) {
int prediction = trees[index].classify(instance);
callbacks[index].prediction(index, instanceId, prediction);
}
-
+
instanceId++;
}
-
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
@@ -180,5 +180,5 @@
context.write(key, new MapredOutput(callbacks[index].getPredictions()));
}
}
-
+
}
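
nbConcerned() above is simply the complement of Step1Mapper.nbTrees(): each second-step mapper classifies its split with every tree except those grown on its own partition. A hypothetical standalone sketch with 3 maps and 10 trees:

public class NbConcernedDemo {

  static int nbTrees(int numMaps, int numTrees, int partition) {
    int trees = numTrees / numMaps;
    if (partition == 0) {
      trees += numTrees - trees * numMaps; // remainder goes to partition 0
    }
    return trees;
  }

  public static void main(String[] args) {
    int numMaps = 3;
    int numTrees = 10;
    for (int p = 0; p < numMaps; p++) {
      System.out.println("partition " + p + " classifies with "
          + (numTrees - nbTrees(numMaps, numTrees, p)) + " foreign trees");
    }
    // partition 0 uses 6 trees, partitions 1 and 2 use 7 each
  }
}
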
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java Sat Feb 13 20:27:25 2010
@@ -23,11 +23,11 @@
* Indicates both the tree and the data partition used to grow the tree
*/
public class TreeID extends LongWritable implements Cloneable {
-
+
public static final int MAX_TREEID = 100000;
- public TreeID() {
- }
-
+
+ public TreeID() {}
+
public TreeID(int partition, int treeId) {
if (partition < 0) {
throw new IllegalArgumentException("partition < 0");
@@ -37,21 +37,22 @@
}
set(partition, treeId);
}
-
+
public void set(int partition, int treeId) {
- super.set((long) partition * MAX_TREEID + treeId);
+ super.set((long) partition * TreeID.MAX_TREEID + treeId);
}
/**
* Data partition (InputSplit's index) that was used to grow the tree
+ *
* @return
*/
public int partition() {
- return (int) (get() / MAX_TREEID);
+ return (int) (get() / TreeID.MAX_TREEID);
}
public int treeId() {
- return (int) (get() % MAX_TREEID);
+ return (int) (get() % TreeID.MAX_TREEID);
}
@Override
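
The packing above keeps TreeID a plain LongWritable by encoding (partition, treeId) as partition * MAX_TREEID + treeId, which works as long as treeId stays below MAX_TREEID. A hypothetical standalone round-trip:

public class TreeIdPackingDemo {

  static final int MAX_TREEID = 100000;

  public static void main(String[] args) {
    int partition = 3;
    int treeId = 42;
    long packed = (long) partition * MAX_TREEID + treeId;
    System.out.println("packed    = " + packed);              // 300042
    System.out.println("partition = " + packed / MAX_TREEID); // 3
    System.out.println("treeId    = " + packed % MAX_TREEID); // 42
  }
}
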
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java Sat Feb 13 20:27:25 2010
@@ -28,20 +28,19 @@
public class CategoricalNode extends Node {
private int attr;
-
+
private double[] values;
-
+
private Node[] childs;
-
- public CategoricalNode() {
- }
+
+ public CategoricalNode() {}
public CategoricalNode(int attr, double[] values, Node[] childs) {
this.attr = attr;
this.values = values;
this.childs = childs;
}
-
+
@Override
public int classify(Instance instance) {
int index = ArrayUtils.indexOf(values, instance.get(attr));
@@ -51,49 +50,51 @@
}
return childs[index].classify(instance);
}
-
+
@Override
public long maxDepth() {
long max = 0;
-
+
for (Node child : childs) {
long depth = child.maxDepth();
- if (depth > max)
+ if (depth > max) {
max = depth;
+ }
}
-
+
return 1 + max;
}
-
+
@Override
public long nbNodes() {
long nbNodes = 1;
-
+
for (Node child : childs) {
nbNodes += child.nbNodes();
}
-
+
return nbNodes;
}
-
+
@Override
protected Type getType() {
return Type.CATEGORICAL;
}
-
+
@Override
public boolean equals(Object obj) {
- if (this == obj)
+ if (this == obj) {
return true;
- if (obj == null || !(obj instanceof CategoricalNode))
+ }
+ if ((obj == null) || !(obj instanceof CategoricalNode)) {
return false;
-
+ }
+
CategoricalNode node = (CategoricalNode) obj;
-
- return attr == node.attr && Arrays.equals(values, node.values)
- && Arrays.equals(childs, node.childs);
+
+ return (attr == node.attr) && Arrays.equals(values, node.values) && Arrays.equals(childs, node.childs);
}
-
+
@Override
public int hashCode() {
int hashCode = attr;
@@ -110,20 +111,20 @@
protected String getString() {
StringBuilder buffer = new StringBuilder();
- for (Node child:childs) {
+ for (Node child : childs) {
buffer.append(child).append(',');
}
return buffer.toString();
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
attr = in.readInt();
values = DFUtils.readDoubleArray(in);
childs = DFUtils.readNodeArray(in);
}
-
+
@Override
protected void writeNode(DataOutput out) throws IOException {
out.writeInt(attr);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java Sat Feb 13 20:27:25 2010
@@ -29,29 +29,28 @@
*/
public class Leaf extends Node {
private int label;
-
- protected Leaf() {
- }
-
+
+ protected Leaf() {}
+
public Leaf(int label) {
this.label = label;
}
-
+
@Override
public int classify(Instance instance) {
return label;
}
-
+
@Override
public long maxDepth() {
return 1;
}
-
+
@Override
public long nbNodes() {
return 1;
}
-
+
/**
* Extract a Leaf Node
*
@@ -60,42 +59,44 @@
*/
static Leaf parse(StringTokenizer tokenizer) {
int label = Integer.parseInt(tokenizer.nextToken());
-
+
return new Leaf(label);
}
-
+
@Override
protected Type getType() {
return Type.LEAF;
}
-
+
@Override
public boolean equals(Object obj) {
- if (this == obj)
+ if (this == obj) {
return true;
- if (obj == null || !(obj instanceof Leaf))
+ }
+ if ((obj == null) || !(obj instanceof Leaf)) {
return false;
-
+ }
+
Leaf leaf = (Leaf) obj;
-
+
return label == leaf.label;
}
-
+
@Override
public int hashCode() {
return label;
}
-
+
@Override
protected String getString() {
return "";
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
label = in.readInt();
}
-
+
@Override
protected void writeNode(DataOutput out) throws IOException {
out.writeInt(label);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java Sat Feb 13 20:27:25 2010
@@ -19,22 +19,21 @@
import org.apache.mahout.df.data.Instance;
/**
- * Custom Leaf node that returns for each instance its own label. Used mainly
- * for testing purposes
+ * Custom Leaf node that returns for each instance its own label. Used mainly for testing purposes
*
*/
public class MockLeaf extends Leaf {
-
+
@Override
public int classify(Instance instance) {
return instance.label;
}
-
+
@Override
protected Type getType() {
return Type.MOCKLEAF;
}
-
+
@Override
protected String getString() {
return "[MockLeaf]";
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java Sat Feb 13 20:27:25 2010
@@ -28,11 +28,14 @@
* Represents an abstract node of a decision tree
*/
public abstract class Node implements Writable {
-
+
protected enum Type {
- MOCKLEAF, LEAF, NUMERICAL, CATEGORICAL
+ MOCKLEAF,
+ LEAF,
+ NUMERICAL,
+ CATEGORICAL
}
-
+
/**
* predicts the label for the instance
*
@@ -40,27 +43,27 @@
* @return -1 if the label cannot be predicted
*/
public abstract int classify(Instance instance);
-
+
/**
* returns the total number of nodes of the tree
*
* @return
*/
public abstract long nbNodes();
-
+
/**
* returns the maximum depth of the tree
*
* @return
*/
public abstract long maxDepth();
-
+
protected abstract Type getType();
-
+
public static Node read(DataInput in) throws IOException {
Type type = Type.values()[in.readInt()];
Node node;
-
+
switch (type) {
case MOCKLEAF:
node = new MockLeaf();
@@ -75,28 +78,27 @@
node = new CategoricalNode();
break;
default:
- throw new IllegalStateException(
- "This implementation is not currently supported");
+ throw new IllegalStateException("This implementation is not currently supported");
}
-
+
node.readFields(in);
-
+
return node;
}
-
+
@Override
public final String toString() {
return getType() + ":" + getString() + ';';
}
-
+
protected abstract String getString();
-
+
@Override
public final void write(DataOutput out) throws IOException {
out.writeInt(getType().ordinal());
writeNode(out);
}
-
+
protected abstract void writeNode(DataOutput out) throws IOException;
-
+
}
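
Node.read() and write() above use a tag-dispatch scheme: the Type ordinal is written before the node body, and deserialization switches on it to instantiate the right subclass before reading the rest. A hypothetical, self-contained miniature of the same pattern:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TagDispatchDemo {
  enum Type { LEAF, NUMERICAL }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(buffer);
    out.writeInt(Type.LEAF.ordinal()); // tag first...
    out.writeInt(7);                   // ...then the node body (here, a label)
    out.close();

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
    Type type = Type.values()[in.readInt()];
    switch (type) {
      case LEAF:
        System.out.println("Leaf with label " + in.readInt());
        break;
      default:
        throw new IllegalStateException("unsupported type " + type);
    }
    in.close();
  }
}
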
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java Sat Feb 13 20:27:25 2010
@@ -29,62 +29,64 @@
public class NumericalNode extends Node {
/** numerical attribute to split for */
private int attr;
-
+
/** split value */
private double split;
-
+
/** child node when attribute's value < split value */
private Node loChild;
-
+
/** child node when attribute's value >= split value */
private Node hiChild;
-
- public NumericalNode() {
- }
-
+
+ public NumericalNode() {}
+
public NumericalNode(int attr, double split, Node loChild, Node hiChild) {
this.attr = attr;
this.split = split;
this.loChild = loChild;
this.hiChild = hiChild;
}
-
+
@Override
public int classify(Instance instance) {
- if (instance.get(attr) < split)
+ if (instance.get(attr) < split) {
return loChild.classify(instance);
- else
+ } else {
return hiChild.classify(instance);
+ }
}
-
+
@Override
public long maxDepth() {
return 1 + Math.max(loChild.maxDepth(), hiChild.maxDepth());
}
-
+
@Override
public long nbNodes() {
return 1 + loChild.nbNodes() + hiChild.nbNodes();
}
-
+
@Override
protected Type getType() {
return Type.NUMERICAL;
- }
-
+ }
+
@Override
public boolean equals(Object obj) {
- if (this == obj)
+ if (this == obj) {
return true;
- if (obj == null || !(obj instanceof NumericalNode))
+ }
+ if ((obj == null) || !(obj instanceof NumericalNode)) {
return false;
-
+ }
+
NumericalNode node = (NumericalNode) obj;
-
- return attr == node.attr && split == node.split
- && loChild.equals(node.loChild) && hiChild.equals(node.hiChild);
+
+ return (attr == node.attr) && (split == node.split) && loChild.equals(node.loChild)
+ && hiChild.equals(node.hiChild);
}
-
+
@Override
public int hashCode() {
return attr + (int) Double.doubleToLongBits(split) + loChild.hashCode() + hiChild.hashCode();
@@ -94,7 +96,7 @@
protected String getString() {
return loChild.toString() + ',' + hiChild.toString();
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
attr = in.readInt();
@@ -102,7 +104,7 @@
loChild = Node.read(in);
hiChild = Node.read(in);
}
-
+
@Override
protected void writeNode(DataOutput out) throws IOException {
out.writeInt(attr);
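
Classification in these node classes is a recursive descent: each NumericalNode routes the instance to its lo or hi child until a Leaf returns its label. A hypothetical standalone miniature (the Simple* types below are stand-ins, not Mahout classes):

public class TreeClassifyDemo {

  interface SimpleNode {
    int classify(double[] attrs);
  }

  static final class SimpleLeaf implements SimpleNode {
    final int label;
    SimpleLeaf(int label) { this.label = label; }
    public int classify(double[] attrs) { return label; }
  }

  static final class SimpleNumericalNode implements SimpleNode {
    final int attr;
    final double split;
    final SimpleNode loChild;
    final SimpleNode hiChild;
    SimpleNumericalNode(int attr, double split, SimpleNode lo, SimpleNode hi) {
      this.attr = attr; this.split = split; this.loChild = lo; this.hiChild = hi;
    }
    public int classify(double[] attrs) {
      return attrs[attr] < split ? loChild.classify(attrs)
                                 : hiChild.classify(attrs);
    }
  }

  public static void main(String[] args) {
    // (attr 0 < 5.0) ? label 0 : label 1
    SimpleNode tree = new SimpleNumericalNode(0, 5.0,
        new SimpleLeaf(0), new SimpleLeaf(1));
    System.out.println(tree.classify(new double[] {3.0})); // 0
    System.out.println(tree.classify(new double[] {7.0})); // 1
  }
}
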
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java Sat Feb 13 20:27:25 2010
@@ -34,41 +34,45 @@
* Builds a Random Decision Forest using a given TreeBuilder to grow the trees
*/
public class SequentialBuilder {
-
+
private static final Logger log = LoggerFactory.getLogger(SequentialBuilder.class);
-
+
private final Random rng;
private final Bagging bagging;
-
+
/**
* Constructor
*
- * @param rng random-numbers generator
- * @param treeBuilder tree builder
- * @param data training data
+ * @param rng
+ * random-numbers generator
+ * @param treeBuilder
+ * tree builder
+ * @param data
+ * training data
*/
public SequentialBuilder(Random rng, TreeBuilder treeBuilder, Data data) {
this.rng = rng;
bagging = new Bagging(treeBuilder, data);
}
-
+
public DecisionForest build(int nbTrees, PredictionCallback callback) {
List<Node> trees = new ArrayList<Node>();
for (int treeId = 0; treeId < nbTrees; treeId++) {
trees.add(bagging.build(treeId, rng, callback));
- logProgress(((float) treeId + 1) / nbTrees);
+ SequentialBuilder.logProgress(((float) treeId + 1) / nbTrees);
}
-
+
return new DecisionForest(trees);
}
private static void logProgress(float progress) {
int percent = (int) (progress * 100);
- if (percent % 10 == 0)
- log.info(String.format("Building %2d%%", percent));
-
+ if (percent % 10 == 0) {
+ SequentialBuilder.log.info(String.format("Building %2d%%", percent));
+ }
+
}
-
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java Sat Feb 13 20:27:25 2010
@@ -26,17 +26,17 @@
* Default, not optimized, implementation of IgSplit
*/
public class DefaultIgSplit extends IgSplit {
-
+
/** used by entropy() */
private int[] counts;
-
+
@Override
public Split computeSplit(Data data, int attr) {
if (data.getDataset().isNumerical(attr)) {
double[] values = data.values(attr);
double bestIg = -1;
double bestSplit = 0.0;
-
+
for (double value : values) {
double ig = numericalIg(data, attr, value);
if (ig > bestIg) {
@@ -44,15 +44,15 @@
bestSplit = value;
}
}
-
+
return new Split(attr, bestIg, bestSplit);
} else {
double ig = categoricalIg(data, attr);
-
+
return new Split(attr, ig);
}
}
-
+
/**
* Computes the Information Gain for a CATEGORICAL attribute
*
@@ -65,18 +65,17 @@
double hy = entropy(data); // H(Y)
double hyx = 0.0; // H(Y|X)
double invDataSize = 1.0 / data.size();
-
+
for (double value : values) {
Data subset = data.subset(Condition.equals(attr, value));
hyx += subset.size() * invDataSize * entropy(subset);
}
-
+
return hy - hyx;
}
-
+
/**
- * Computes the Information Gain for a NUMERICAL attribute given a splitting
- * value
+ * Computes the Information Gain for a NUMERICAL attribute given a splitting value
*
* @param data
* @param attr
@@ -86,18 +85,18 @@
protected double numericalIg(Data data, int attr, double split) {
double hy = entropy(data);
double invDataSize = 1.0 / data.size();
-
+
// LO subset
Data subset = data.subset(Condition.lesser(attr, split));
hy -= subset.size() * invDataSize * entropy(subset);
-
+
// HI subset
subset = data.subset(Condition.greaterOrEquals(attr, split));
hy -= subset.size() * invDataSize * entropy(subset);
-
+
return hy;
}
-
+
/**
* Computes the Entropy
*
@@ -106,23 +105,25 @@
*/
protected double entropy(Data data) {
double invDataSize = 1.0 / data.size();
-
- if (counts == null)
+
+ if (counts == null) {
counts = new int[data.getDataset().nblabels()];
-
+ }
+
Arrays.fill(counts, 0);
data.countLabels(counts);
-
+
double entropy = 0.0;
for (int label = 0; label < data.getDataset().nblabels(); label++) {
int count = counts[label];
- if (count == 0)
+ if (count == 0) {
continue; // otherwise we get a NaN
+ }
double p = count * invDataSize;
- entropy += -p * Math.log(p) / LOG2;
+ entropy += -p * Math.log(p) / IgSplit.LOG2;
}
-
+
return entropy;
}
-
+
}
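
entropy() above computes H(Y) = -sum_i p_i * log2(p_i) over the label counts, skipping zero counts (which would otherwise yield NaN) and obtaining log2 as ln(x) / ln(2). A hypothetical standalone check:

public class EntropyDemo {

  static final double LOG2 = Math.log(2.0);

  static double entropy(int[] counts, int dataSize) {
    double invDataSize = 1.0 / dataSize;
    double entropy = 0.0;
    for (int count : counts) {
      if (count == 0) {
        continue; // otherwise we get a NaN
      }
      double p = count * invDataSize;
      entropy += -p * Math.log(p) / LOG2;
    }
    return entropy;
  }

  public static void main(String[] args) {
    // 8 instances: 4 of label 0, 4 of label 1 -> entropy 1.0 bit
    System.out.println(entropy(new int[] {4, 4}, 8)); // 1.0
    // a pure subset has zero entropy
    System.out.println(entropy(new int[] {8, 0}, 8)); // 0.0
  }
}
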
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java Sat Feb 13 20:27:25 2010
@@ -23,9 +23,9 @@
* Computes the best split using the Information Gain measure
*/
public abstract class IgSplit {
-
+
protected static final double LOG2 = Math.log(2.0);
-
+
/**
* Computes the best split for the given attribute
*
@@ -34,5 +34,5 @@
* @return
*/
public abstract Split computeSplit(Data data, int attr);
-
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java Sat Feb 13 20:27:25 2010
@@ -28,22 +28,22 @@
* Optimized implementation of IgSplit
*/
public class OptIgSplit extends IgSplit {
-
+
private int[][] counts;
-
+
private int[] countAll;
-
+
private int[] countLess;
-
+
@Override
public Split computeSplit(Data data, int attr) {
if (data.getDataset().isNumerical(attr)) {
return numericalSplit(data, attr);
} else {
- return categoricalSplit(data, attr);
+ return OptIgSplit.categoricalSplit(data, attr);
}
}
-
+
/**
* Computes the split for a CATEGORICAL attribute
*
@@ -55,28 +55,28 @@
double[] values = data.values(attr);
int[][] counts = new int[values.length][data.getDataset().nblabels()];
int[] countAll = new int[data.getDataset().nblabels()];
-
+
// compute frequencies
for (int index = 0; index < data.size(); index++) {
Instance instance = data.get(index);
counts[ArrayUtils.indexOf(values, instance.get(attr))][instance.label]++;
countAll[instance.label]++;
}
-
+
int size = data.size();
- double hy = entropy(countAll, size); // H(Y)
+ double hy = OptIgSplit.entropy(countAll, size); // H(Y)
double hyx = 0.0; // H(Y|X)
double invDataSize = 1.0 / size;
-
+
for (int index = 0; index < values.length; index++) {
size = DataUtils.sum(counts[index]);
- hyx += size * invDataSize * entropy(counts[index], size);
+ hyx += size * invDataSize * OptIgSplit.entropy(counts[index], size);
}
-
+
double ig = hy - hyx;
return new Split(attr, ig);
}
-
+
/**
* Return the sorted list of distinct values for the given attribute
*
@@ -87,10 +87,10 @@
private static double[] sortedValues(Data data, int attr) {
double[] values = data.values(attr);
Arrays.sort(values);
-
+
return values;
}
-
+
/**
* Instantiates the counting arrays
*
@@ -102,7 +102,7 @@
countAll = new int[data.getDataset().nblabels()];
countLess = new int[data.getDataset().nblabels()];
}
-
+
protected void computeFrequencies(Data data, int attr, double[] values) {
for (int index = 0; index < data.size(); index++) {
Instance instance = data.get(index);
@@ -119,68 +119,72 @@
* @return
*/
protected Split numericalSplit(Data data, int attr) {
- double[] values = sortedValues(data, attr);
-
+ double[] values = OptIgSplit.sortedValues(data, attr);
+
initCounts(data, values);
-
+
computeFrequencies(data, attr, values);
-
+
int size = data.size();
- double hy = entropy(countAll, size);
+ double hy = OptIgSplit.entropy(countAll, size);
double invDataSize = 1.0 / size;
-
+
int best = -1;
double bestIg = -1.0;
-
+
// try each possible split value
for (int index = 0; index < values.length; index++) {
double ig = hy;
-
+
// instance with attribute value < values[index]
size = DataUtils.sum(countLess);
- ig -= size * invDataSize * entropy(countLess, size);
-
+ ig -= size * invDataSize * OptIgSplit.entropy(countLess, size);
+
      // instances with attribute value >= values[index]
size = DataUtils.sum(countAll);
- ig -= size * invDataSize * entropy(countAll, size);
-
+ ig -= size * invDataSize * OptIgSplit.entropy(countAll, size);
+
if (ig > bestIg) {
bestIg = ig;
best = index;
}
-
+
DataUtils.add(countLess, counts[index]);
DataUtils.dec(countAll, counts[index]);
}
-
+
if (best == -1) {
      throw new IllegalStateException("no best split found!");
}
return new Split(attr, bestIg, values[best]);
}
-
+
/**
* Computes the Entropy
*
- * @param counts counts[i] = numInstances with label i
- * @param dataSize numInstances
+ * @param counts
+ * counts[i] = numInstances with label i
+ * @param dataSize
+ * numInstances
   * @return the entropy, in bits
*/
private static double entropy(int[] counts, int dataSize) {
- if (dataSize == 0)
+ if (dataSize == 0) {
return 0.0;
-
+ }
+
double entropy = 0.0;
double invDataSize = 1.0 / dataSize;
-
+
for (int count : counts) {
- if (count == 0)
+ if (count == 0) {
        continue; // otherwise 0 * log(0) yields NaN
+ }
double p = count * invDataSize;
- entropy += -p * Math.log(p) / LOG2;
+ entropy += -p * Math.log(p) / IgSplit.LOG2;
}
-
+
return entropy;
}
-
+
}
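
On a concrete input, the arithmetic in categoricalSplit() and entropy() above
works out as follows: with two attribute values whose per-label counts are
{3,1} and {1,3}, the label totals are {4,4}, so H(Y) = 1 bit, H(Y|X) ~= 0.811
bits, and IG = H(Y) - H(Y|X) ~= 0.189. A self-contained sketch reproducing
that computation (class name hypothetical, not part of this commit):

public class IgDemo {

  private static final double LOG2 = Math.log(2.0);

  // entropy in bits of a label-count vector, mirroring OptIgSplit.entropy()
  static double entropy(int[] counts, int size) {
    if (size == 0) {
      return 0.0;
    }
    double h = 0.0;
    for (int count : counts) {
      if (count == 0) {
        continue; // 0 * log(0) would yield NaN
      }
      double p = count / (double) size;
      h -= p * Math.log(p) / LOG2;
    }
    return h;
  }

  public static void main(String[] args) {
    int[][] counts = { {3, 1}, {1, 3} }; // counts[value][label]
    int[] countAll = {4, 4};             // label totals: H(Y) = 1 bit
    int size = 8;

    // H(Y|X): entropy of each value's label counts, weighted by frequency
    double hyx = 0.0;
    for (int[] c : counts) {
      int s = c[0] + c[1];
      hyx += s / (double) size * entropy(c, s);
    }
    System.out.println("IG = " + (entropy(countAll, size) - hyx)); // ~0.189
  }
}
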
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java Sat Feb 13 20:27:25 2010
@@ -17,27 +17,26 @@
package org.apache.mahout.df.split;
-
/**
* Contains enough information to identify each split
*/
public class Split {
-
+
/** attribute to split for */
public final int attr;
-
+
/** Information Gain of the split */
public final double ig;
-
+
/** split value for NUMERICAL attributes */
public final double split;
-
+
public Split(int attr, double ig, double split) {
this.attr = attr;
this.ig = ig;
this.split = split;
}
-
+
public Split(int attr, double ig) {
this(attr, ig, Double.NaN);
}
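
Callers tell the two kinds of Split apart through the NaN sentinel: the
two-argument constructor is meant for categorical attributes, where no
threshold value applies. A usage sketch (variable names hypothetical, not
part of this commit):

import org.apache.mahout.df.split.Split;

public class SplitDemo {
  public static void main(String[] args) {
    Split numerical = new Split(3, 0.42, 17.5); // attr 3, IG 0.42, threshold 17.5
    Split categorical = new Split(5, 0.31);     // attr 5, IG 0.31, split == NaN

    // Double.isNaN is the only reliable NaN test, since NaN != NaN
    System.out.println(Double.isNaN(numerical.split));   // false
    System.out.println(Double.isNaN(categorical.split)); // true
  }
}
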
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java Sat Feb 13 20:27:25 2010
@@ -34,11 +34,11 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.df.data.DataLoader;
import org.apache.mahout.df.data.Dataset;
import org.apache.mahout.df.data.DescriptorException;
import org.apache.mahout.df.data.DescriptorUtils;
-import org.apache.mahout.common.CommandLineUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -46,86 +46,81 @@
* Generates a file descriptor for a given dataset
*/
public class Describe {
-
+
private static final Logger log = LoggerFactory.getLogger(Describe.class);
-
- private Describe() {
- }
-
+
+ private Describe() {}
+
public static void main(String[] args) throws IOException, DescriptorException {
-
+
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
- Option pathOpt = obuilder.withLongName("path").withShortName("p")
- .withRequired(true).withArgument(
- abuilder.withName("path").withMinimum(1).withMaximum(1).create())
- .withDescription("Data path").create();
-
- Option descriptorOpt = obuilder.withLongName("descriptor").withShortName(
- "d").withRequired(true).withArgument(
- abuilder.withName("descriptor").withMinimum(1).create())
- .withDescription("data descriptor").create();
-
- Option descPathOpt = obuilder.withLongName("file").withShortName("f")
- .withRequired(true).withArgument(
- abuilder.withName("file").withMinimum(1).withMaximum(1).create())
- .withDescription("Path to generated descriptor file").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(
- descPathOpt).withOption(descriptorOpt).withOption(helpOpt).create();
-
+
+ Option pathOpt = obuilder.withLongName("path").withShortName("p").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+ Option descriptorOpt = obuilder.withLongName("descriptor").withShortName("d").withRequired(true)
+ .withArgument(abuilder.withName("descriptor").withMinimum(1).create()).withDescription(
+ "data descriptor").create();
+
+ Option descPathOpt = obuilder.withLongName("file").withShortName("f").withRequired(true).withArgument(
+ abuilder.withName("file").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Path to generated descriptor file").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(descPathOpt).withOption(
+ descriptorOpt).withOption(helpOpt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
}
-
+
String dataPath = cmdLine.getValue(pathOpt).toString();
String descPath = cmdLine.getValue(descPathOpt).toString();
- List<String> descriptor = convert(cmdLine.getValues(descriptorOpt));
-
- log.debug("Data path : {}", dataPath);
- log.debug("Descriptor path : {}", descPath);
- log.debug("Descriptor : {}", descriptor);
-
- runTool(dataPath, descriptor, descPath);
+ List<String> descriptor = Describe.convert(cmdLine.getValues(descriptorOpt));
+
+ Describe.log.debug("Data path : {}", dataPath);
+ Describe.log.debug("Descriptor path : {}", descPath);
+ Describe.log.debug("Descriptor : {}", descriptor);
+
+ Describe.runTool(dataPath, descriptor, descPath);
} catch (OptionException e) {
- log.warn(e.toString(), e);
+ Describe.log.warn(e.toString(), e);
CommandLineUtil.printHelp(group);
}
}
-
- private static void runTool(String dataPath, List<String> description,
- String filePath) throws DescriptorException, IOException {
- log.info("Generating the descriptor...");
+
+ private static void runTool(String dataPath, List<String> description, String filePath) throws DescriptorException,
+ IOException {
+ Describe.log.info("Generating the descriptor...");
String descriptor = DescriptorUtils.generateDescriptor(description);
-
- Path fPath = validateOutput(filePath);
- log.info("generating the dataset...");
- Dataset dataset = generateDataset(descriptor, dataPath);
-
- log.info("storing the dataset description");
- storeWritable(new Configuration(), fPath, dataset);
+ Path fPath = Describe.validateOutput(filePath);
+
+ Describe.log.info("generating the dataset...");
+ Dataset dataset = Describe.generateDataset(descriptor, dataPath);
+
+ Describe.log.info("storing the dataset description");
+ Describe.storeWritable(new Configuration(), fPath, dataset);
}
-
- private static Dataset generateDataset(String descriptor, String dataPath)
- throws IOException, DescriptorException {
+
+ private static Dataset generateDataset(String descriptor, String dataPath) throws IOException,
+ DescriptorException {
Path path = new Path(dataPath);
FileSystem fs = path.getFileSystem(new Configuration());
-
+
return DataLoader.generateDataset(descriptor, fs, path);
}
-
+
private static Path validateOutput(String filePath) throws IOException {
Path path = new Path(filePath);
FileSystem fs = path.getFileSystem(new Configuration());
@@ -143,11 +138,10 @@
}
return list;
}
-
- private static void storeWritable(Configuration conf, Path path, Writable dataset)
- throws IOException {
- FileSystem fs = path.getFileSystem(conf);
+ private static void storeWritable(Configuration conf, Path path, Writable dataset) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+
FSDataOutputStream out = fs.create(path);
dataset.write(out);
out.close();
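
storeWritable() above serializes the dataset with Writable.write(DataOutput);
reading it back is the mirror image, since FSDataInputStream implements
DataInput. A minimal sketch (class and method names hypothetical, not part of
this commit):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;

public class LoadWritable {

  // inverse of storeWritable(): populate an existing Writable from a file
  static void readWritable(Configuration conf, Path path, Writable dataset) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream in = fs.open(path);
    try {
      dataset.readFields(in); // mirror of dataset.write(out)
    } finally {
      in.close();
    }
  }
}
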
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java Sat Feb 13 20:27:25 2010
@@ -17,6 +17,9 @@
package org.apache.mahout.df.tools;
+import java.io.IOException;
+import java.util.Arrays;
+
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -35,97 +38,90 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.util.Arrays;
-
/**
 * Computes the frequency distribution of the class labels
*/
-public class Frequencies extends Configured implements Tool {
-
+public class Frequencies extends Configured implements Tool {
+
private static final Logger log = LoggerFactory.getLogger(Frequencies.class);
-
- private Frequencies() {
- }
-
+
+ private Frequencies() {}
+
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
-
+
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
- Option dataOpt = obuilder.withLongName("data").withShortName("d")
- .withRequired(true).withArgument(
- abuilder.withName("path").withMinimum(1).withMaximum(1).create())
- .withDescription("Data path").create();
-
- Option datasetOpt = obuilder.withLongName("dataset").withShortName(
- "ds").withRequired(true).withArgument(
- abuilder.withName("path").withMinimum(1).create())
- .withDescription("dataset path").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(
- datasetOpt).withOption(helpOpt).create();
-
+
+ Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+ Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).create()).withDescription("dataset path").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(helpOpt)
+ .create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return 0;
}
-
+
String dataPath = cmdLine.getValue(dataOpt).toString();
String datasetPath = cmdLine.getValue(datasetOpt).toString();
-
- log.debug("Data path : {}", dataPath);
- log.debug("Dataset path : {}", datasetPath);
-
+
+ Frequencies.log.debug("Data path : {}", dataPath);
+ Frequencies.log.debug("Dataset path : {}", datasetPath);
+
runTool(dataPath, datasetPath);
} catch (OptionException e) {
- log.warn(e.toString(), e);
+ Frequencies.log.warn(e.toString(), e);
CommandLineUtil.printHelp(group);
}
-
+
return 0;
}
-
- private void runTool(String data, String dataset)
- throws IOException, ClassNotFoundException, InterruptedException {
-
+
+ private void runTool(String data, String dataset) throws IOException,
+ ClassNotFoundException,
+ InterruptedException {
+
FileSystem fs = FileSystem.get(getConf());
Path workingDir = fs.getWorkingDirectory();
-
+
Path dataPath = new Path(data);
Path datasetPath = new Path(dataset);
-
- log.info("Computing the frequencies...");
+
+ Frequencies.log.info("Computing the frequencies...");
FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);
-
+
int[][] counts = job.run(getConf());
-
+
// compute the partitions' sizes
int numPartitions = counts.length;
-// int[] sizes = new int[numPartitions]; // TODO this isn't used?
-// for (int p = 0; p < numPartitions; p++) {
-// sizes[p] = DataUtils.sum(counts[p]);
-// }
-
+ // int[] sizes = new int[numPartitions]; // TODO this isn't used?
+ // for (int p = 0; p < numPartitions; p++) {
+ // sizes[p] = DataUtils.sum(counts[p]);
+ // }
+
     // outputting the frequencies
- log.info("counts[partition][class]");
+ Frequencies.log.info("counts[partition][class]");
for (int p = 0; p < numPartitions; p++) {
- log.info(Arrays.toString(counts[p]));
+ Frequencies.log.info(Arrays.toString(counts[p]));
}
}
-
+
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new Frequencies(), args);
}
-
+
}
\ No newline at end of file
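
For reference, the commented-out block in runTool() would reduce the
counts[partition][label] matrix to per-partition sizes by summing each row.
A standalone equivalent (class name hypothetical, DataUtils.sum replaced by
an inline loop, not part of this commit):

public class PartitionSizes {

  // sizes[p] = total number of instances in partition p
  static int[] sizes(int[][] counts) {
    int[] sizes = new int[counts.length];
    for (int p = 0; p < counts.length; p++) {
      int sum = 0;
      for (int c : counts[p]) {
        sum += c; // equivalent to DataUtils.sum(counts[p])
      }
      sizes[p] = sum;
    }
    return sizes;
  }
}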