Posted to commits@mahout.apache.org by sr...@apache.org on 2011/07/05 15:54:40 UTC

svn commit: r1143063 - in /mahout/trunk/core/src: main/java/org/apache/mahout/math/stats/entropy/ test/java/org/apache/mahout/math/stats/entropy/

Author: srowen
Date: Tue Jul  5 13:54:39 2011
New Revision: 1143063

URL: http://svn.apache.org/viewvc?rev=1143063&view=rev
Log:
MAHOUT-747 entropy calculations on Hadoop

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/
    mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Emits the partial entropy term x * log(x) for each count x.
+ */
+public final class CalculateEntropyMapper extends Mapper<Text, VarIntWritable, NullWritable, DoubleWritable> {
+
+  private final DoubleWritable result = new DoubleWritable();
+
+  @Override
+  protected void map(Text key, VarIntWritable value, Context context) throws IOException, InterruptedException {
+    result.set(value.get() * Math.log(value.get()));
+    context.write(NullWritable.get(), result);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Sums the partial entropy terms and computes the final entropy H(X) = (log(n) - sum / n) / log(2).
+ */
+public final class CalculateEntropyReducer
+    extends Reducer<NullWritable, DoubleWritable, NullWritable, DoubleWritable> {
+
+  private static final double LOG_2 = Math.log(2.0);
+
+  private final DoubleWritable result = new DoubleWritable();
+  private long numberItems;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    numberItems = Long.parseLong(context.getConfiguration().get(Entropy.NUMBER_ITEMS_PARAM));
+  }
+
+  @Override
+  protected void reduce(NullWritable key, Iterable<DoubleWritable> values, Context context)
+      throws IOException, InterruptedException {
+    double entropy = 0.0;
+    for (DoubleWritable value : values) {
+      entropy += value.get();
+    }
+    result.set((Math.log(numberItems) - entropy / numberItems) / LOG_2);
+    context.write(key, result);
+  }
+
+}
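
For intuition, here is a minimal non-Hadoop sketch (not part of this commit; the class name is illustrative) of the
arithmetic that CalculateEntropyMapper and CalculateEntropyReducer perform together, using the letter counts from
EntropyTest further below (A=5, B=2, C=1, D=1, E=1, n = 10 records):

    public class EntropySketch {
      public static void main(String[] args) {
        long[] counts = {5, 2, 1, 1, 1};   // grouped counts, the mapper's input values
        long n = 10;                       // total records; the real job reads this from MAP_INPUT_RECORDS
        double partial = 0.0;              // sum_i(x_i * log(x_i)), the terms the mapper emits
        for (long x : counts) {
          partial += x * Math.log(x);
        }
        // the reducer's final step: H(X) = (log(n) - partial / n) / log(2)
        double entropy = (Math.log(n) - partial / n) / Math.log(2.0);
        System.out.println(entropy);       // ~1.96096405, the value asserted in EntropyTest
      }
    }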

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+
+/**
+ * Drops the key.
+ */
+public final class CalculateSpecificConditionalEntropyMapper
+    extends Mapper<Text, DoubleWritable, NullWritable, DoubleWritable> {
+
+  @Override
+  protected void map(Text key, DoubleWritable value, Context context) throws IOException, InterruptedException {
+    context.write(NullWritable.get(), value);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * A Hadoop job to compute the conditional entropy H(Value|Key) for a sequence file.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * <li>-o The output sequence file</li>
+ * </ul>
+ */
+public final class ConditionalEntropy extends AbstractJob {
+
+  private long numberItems;
+
+  private Path keyValueCountPath;
+  private Path specificConditionalEntropyPath;
+
+  private static final String KEY_VALUE_COUNT_FILE = "key_value_count";
+  private static final String SPECIFIC_CONDITIONAL_ENTROPY_FILE = "specific_conditional_entropy";
+  static final String NUMBER_ITEMS_PARAM = "items.number";
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ConditionalEntropy(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+    prepareArguments(args);
+    groupAndCountByKeyAndValue();
+    calculateSpecificConditionalEntropy();
+    calculateConditionalEntropy();
+    return 0;
+  }
+
+  /**
+   * Prepares and sets the arguments.
+   */
+  private void prepareArguments(String[] args) throws IOException {
+    addInputOption();
+    addOutputOption();
+    parseArguments(args);
+    keyValueCountPath = new Path(getTempPath(), KEY_VALUE_COUNT_FILE + '-' + System.currentTimeMillis());
+    specificConditionalEntropyPath =
+        new Path(getTempPath(), SPECIFIC_CONDITIONAL_ENTROPY_FILE + '_' + System.currentTimeMillis());
+  }
+
+  /**
+   * Groups and counts by key and value.
+   * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
+   */
+  private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {
+
+    Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class,
+        GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class, VarIntSumReducer.class,
+        StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class);
+    job.setCombinerClass(VarIntSumReducer.class);
+    job.waitForCompletion(true);
+
+    numberItems =
+        job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
+
+  }
+
+  /**
+   * Calculates the specific conditional entropy H(Y|X=x) for each key x.
+   * Needs the total number of items for normalizing.
+   */
+  private void calculateSpecificConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+
+    Job job = prepareJob(keyValueCountPath, specificConditionalEntropyPath, SequenceFileInputFormat.class,
+        SpecificConditionalEntropyMapper.class, Text.class, VarIntWritable.class,
+        SpecificConditionalEntropyReducer.class, Text.class, DoubleWritable.class,
+        SequenceFileOutputFormat.class);
+    job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
+    job.waitForCompletion(true);
+
+  }
+
+  /**
+   * Sums the specific conditional entropies to the overall H(Value|Key). The result is written as the value.
+   */
+  private void calculateConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+
+    Job job = prepareJob(specificConditionalEntropyPath, getOutputPath(), SequenceFileInputFormat.class,
+        CalculateSpecificConditionalEntropyMapper.class, NullWritable.class, DoubleWritable.class,
+        DoubleSumReducer.class, NullWritable.class, DoubleWritable.class,
+        SequenceFileOutputFormat.class);
+    job.setCombinerClass(DoubleSumReducer.class);
+    job.waitForCompletion(true);
+
+  }
+
+}
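
For intuition, a worked pass over the input used in ConditionalEntropyTest further below (8 records; keys
Math/History/CS, values Yes/No): groupAndCountByKeyAndValue emits (Math,Yes)=2, (Math,No)=2, (History,No)=2 and
(CS,Yes)=2; calculateSpecificConditionalEntropy turns these into the per-key contributions 0.5 for Math and 0.0 for
History and CS; calculateConditionalEntropy sums them to H(Value|Key) = 0.5, the value the test asserts.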

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Analog of {@link org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer} which sums the double values.
+ */
+public final class DoubleSumReducer extends Reducer<Writable, DoubleWritable, Writable, DoubleWritable> {
+
+  private final DoubleWritable result = new DoubleWritable();
+
+  @Override
+  protected void reduce(Writable key, Iterable<DoubleWritable> values, Context context)
+      throws IOException, InterruptedException {
+    double sum = 0.0;
+    for (DoubleWritable value : values) {
+      sum += value.get();
+    }
+    result.set(sum);
+    context.write(key, result);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * A Hadoop job to compute the entropy of the keys or values in a {@link SequenceFile}. The selected source (key or
+ * value) must be of type {@link Text}.
+ * <p/>
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * <li>-o The output sequence file</li>
+ * <li>-s The source. Can be 'key' or 'value'. Default is 'key'</li>
+ * </ul>
+ */
+public final class Entropy extends AbstractJob {
+
+  private Path tempPath;
+  private long numberItems;
+  private String source;
+
+  private static final String TEMP_FILE = "temp";
+  static final String NUMBER_ITEMS_PARAM = "number.items";
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Entropy(), args);
+  }
+
+  /**
+   * Returns the number of elements in the input. Only valid after {@link #run(String[])} has completed.
+   *
+   * @return The number of processed items
+   */
+  public long getNumberItems() {
+    return numberItems;
+  }
+
+  @Override
+  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+
+    prepareArguments(args);
+    groupAndCount();
+    calculateEntropy();
+
+    return 0;
+  }
+
+  /**
+   * Prepares and sets the arguments.
+   *
+   * @param args
+   * @throws IOException
+   */
+  private void prepareArguments(String[] args) throws IOException {
+
+    addInputOption();
+    addOutputOption();
+    addOption("source", "s",
+        "Whether the entropy is computed for the keys or the values. Can be 'key' or 'value'", "key");
+
+    Map<String, String> arguments = parseArguments(args);
+    source = arguments.get("--source");
+    tempPath = new Path(getTempPath(), TEMP_FILE + '-' + System.currentTimeMillis());
+
+  }
+
+
+  /**
+   * Groups the items and counts the occurrences of each.
+   * SQL-like: SELECT item, COUNT(*) FROM x GROUP BY item
+   *
+   * @throws IOException
+   * @throws ClassNotFoundException
+   * @throws InterruptedException
+   */
+  private void groupAndCount() throws IOException, ClassNotFoundException, InterruptedException {
+
+    Class<? extends Mapper> mapper = "key".equals(source) ? KeyCounterMapper.class : ValueCounterMapper.class;
+
+    Job job = prepareJob(getInputPath(), tempPath, SequenceFileInputFormat.class, mapper, Text.class,
+        VarIntWritable.class, VarIntSumReducer.class, Text.class, VarIntWritable.class,
+        SequenceFileOutputFormat.class);
+    job.setCombinerClass(VarIntSumReducer.class);
+    job.waitForCompletion(true);
+
+    numberItems =
+        job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
+
+  }
+
+  /**
+   * Calculates the entropy with
+   * <p/>
+   * H(X) = -sum_i(x_i/n * log_2(x_i/n))  WITH n = sum_i(x_i)
+   * = -sum_i(x_i/n * (log_2(x_i) - log_2(n)))
+   * = -sum_i(x_i/n * log_2(x_i)) + sum_i(x_i/n * log_2(n))
+   * = (n * log_2(n) - sum_i(x_i * log_2(x_i))) / n
+   * = log_2(n) - sum_i(x_i * log_2(x_i)) / n
+   * = (log(n) - sum_i(x_i * log(x_i)) / n) / log(2)
+   */
+  private void calculateEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+
+    Job job = prepareJob(tempPath, getOutputPath(), SequenceFileInputFormat.class, CalculateEntropyMapper.class,
+        NullWritable.class, DoubleWritable.class, CalculateEntropyReducer.class, NullWritable.class,
+        DoubleWritable.class, SequenceFileOutputFormat.class);
+    job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
+    job.setCombinerClass(DoubleSumReducer.class);
+    job.waitForCompletion(true);
+
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Groups the input by key and value: merges both into a single key of type {@link StringTuple} and emits
+ * {@link VarIntWritable}(1) as the value.
+ */
+public final class GroupAndCountByKeyAndValueMapper extends Mapper<Text, Text, StringTuple, VarIntWritable> {
+
+  private static final VarIntWritable ONE = new VarIntWritable(1);
+
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+    StringTuple tuple = new StringTuple(key.toString());
+    tuple.add(value.toString());
+    context.write(tuple, ONE);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Calculates the information gain for a {@link SequenceFile}.
+ * Computes how 'useful' the keys are for predicting the values.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * </ul>
+ */
+public final class InformationGain extends AbstractJob {
+
+  private static final String ENTROPY_FILE = "entropy";
+  private static final String CONDITIONAL_ENTROPY_FILE = "conditional_entropy";
+
+  private Path entropyPath;
+  private Path conditionalEntropyPath;
+  private double entropy;
+  private double conditionalEntropy;
+  private double informationGain;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new InformationGain(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    prepareArguments(args);
+    calculateEntropy();
+    calculateConditionalEntropy();
+    calculateInformationGain();
+    return 0;
+  }
+
+  public double getEntropy() {
+    return entropy;
+  }
+
+  public double getConditionalEntropy() {
+    return conditionalEntropy;
+  }
+
+  public double getInformationGain() {
+    return informationGain;
+  }
+
+  /**
+   * Prepares and sets the arguments.
+   */
+  private void prepareArguments(String[] args) throws IOException {
+    addInputOption();
+    parseArguments(args);
+    entropyPath = new Path(getTempPath(), ENTROPY_FILE + '-' + System.currentTimeMillis());
+    conditionalEntropyPath = new Path(getTempPath(), CONDITIONAL_ENTROPY_FILE + '-' + System.currentTimeMillis());
+  }
+
+  private void calculateEntropy() throws Exception {
+    String[] args = { "-i", getInputPath().toString(), "-o", entropyPath.toString(), "-s", "value" };
+    ToolRunner.run(new Entropy(), args);
+    entropy = readDoubleFromPath(entropyPath);
+  }
+
+  private void calculateConditionalEntropy() throws Exception {
+    String[] args = { "-i", getInputPath().toString(), "-o", conditionalEntropyPath.toString() };
+    ToolRunner.run(new ConditionalEntropy(), args);
+    conditionalEntropy = readDoubleFromPath(conditionalEntropyPath);
+  }
+
+  private void calculateInformationGain() {
+    informationGain = entropy - conditionalEntropy;
+  }
+
+  private static double readDoubleFromPath(Path path) throws IOException {
+    Iterator<DoubleWritable> iteratorNodes =
+        new SequenceFileDirValueIterator<DoubleWritable>(path,
+                                                         PathType.LIST,
+                                                         PathFilters.logsCRCFilter(),
+                                                         null,
+                                                         false,
+                                                         new Configuration());
+    if (!iteratorNodes.hasNext()) {
+      throw new IllegalArgumentException("Can't read double value from " + path.toString());
+    }
+    return iteratorNodes.next().get();
+  }
+
+}
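
On the same test data (see InformationGainTest below), the value entropy H(Value) is 1.0 (four Yes, four No) and the
conditional entropy H(Value|Key) is 0.5, so the information gain is 1.0 - 0.5 = 0.5, exactly what the test asserts.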

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+
+/**
+ * A job to calculate the normalized information gain.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * </ul>
+ */
+public final class InformationGainRatio extends AbstractJob {
+
+  private double entropy;
+  private double informationGain;
+  private double informationGainRatio;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new InformationGainRatio(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    InformationGain job = new InformationGain();
+    ToolRunner.run(job, args);
+    informationGain = job.getInformationGain();
+    entropy = job.getEntropy();
+    informationGainRatio = informationGain / entropy;
+    return 0;
+  }
+
+  public double getEntropy() {
+    return entropy;
+  }
+
+  public double getInformationGain() {
+    return informationGain;
+  }
+
+  public double getInformationGainRatio() {
+    return informationGainRatio;
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Emits the key and the count of 1 as {@link VarIntWritable}.
+ */
+public final class KeyCounterMapper extends Mapper<Writable, Object, Writable, VarIntWritable> {
+
+  private static final VarIntWritable ONE = new VarIntWritable(1);
+
+  @Override
+  protected void map(Writable key, Object value, Context context) throws IOException, InterruptedException {
+    context.write(key, ONE);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Extracts the original key from the {@link StringTuple} [key, value] and emits it as {@link Text}, keeping the count as value.
+ */
+public class SpecificConditionalEntropyMapper extends Mapper<StringTuple, VarIntWritable, Text, VarIntWritable> {
+
+  private final Text resultKey = new Text();
+
+  @Override
+  protected void map(StringTuple key, VarIntWritable value, Context context)
+      throws IOException, InterruptedException {
+    resultKey.set(key.stringAt(0));
+    context.write(resultKey, value);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Does the weighted conditional entropy calculation with
+ * <p/>
+ * H(values|key) = -p(key) * sum_i(p(values_i|key) * log_2(p(values_i|key)))
+ * = p(key) * (log_2(sum) - sum_i(values_i * log_2(values_i)) / sum)   WITH sum = sum_i(values_i), p(key) = sum / n
+ * = (sum * log_2(sum) - sum_i(values_i * log_2(values_i))) / n
+ * = (sum * log(sum) - sum_i(values_i * log(values_i))) / (n * log(2))
+ */
+public final class SpecificConditionalEntropyReducer extends Reducer<Text, VarIntWritable, Text, DoubleWritable> {
+
+  private final DoubleWritable result = new DoubleWritable();
+  private double numberItemsLog2;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    numberItemsLog2 =
+        Math.log(2) * Integer.parseInt(context.getConfiguration().get(ConditionalEntropy.NUMBER_ITEMS_PARAM));
+  }
+
+  @Override
+  protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
+      throws IOException, InterruptedException {
+    double sum = 0.0;
+    double entropy = 0.0;
+    for (VarIntWritable value : values) {
+      int valueInt = value.get();
+      sum += valueInt;
+      entropy += valueInt * Math.log(valueInt);
+    }
+    result.set((sum * Math.log(sum) - entropy) / numberItemsLog2);
+    context.write(key, result);
+  }
+
+}
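
A minimal sketch (not part of this commit; the class name is illustrative) of the reducer's per-key term, checked
against the "Math" key of ConditionalEntropyTest (value counts Yes=2, No=2; n = 8 records overall):

    public class SpecificConditionalEntropySketch {
      public static void main(String[] args) {
        int[] valueCounts = {2, 2};        // counts of the distinct values under one key
        long n = 8;                        // total records, passed via NUMBER_ITEMS_PARAM in the real job
        double sum = 0.0;
        double entropy = 0.0;
        for (int v : valueCounts) {
          sum += v;
          entropy += v * Math.log(v);
        }
        double weighted = (sum * Math.log(sum) - entropy) / (n * Math.log(2));
        System.out.println(weighted);      // 0.5; the History and CS keys each contribute 0.0
      }
    }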

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Emits the value and the count of 1 as {@link VarIntWritable}.
+ */
+public final class ValueCounterMapper extends Mapper<Object, Writable, Writable, VarIntWritable> {
+
+  private static final VarIntWritable ONE = new VarIntWritable(1);
+
+  @Override
+  public void map(Object key, Writable value, Context context) throws IOException, InterruptedException {
+    context.write(value, ONE);
+  }
+
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * The analog of {@link org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer} which uses {@link VarIntWritable}.
+ */
+public final class VarIntSumReducer extends Reducer<Writable, VarIntWritable, Writable, VarIntWritable> {
+
+  private final VarIntWritable result = new VarIntWritable();
+
+  @Override
+  protected void reduce(Writable key, Iterable<VarIntWritable> values, Context context)
+      throws IOException, InterruptedException {
+    int sum = 0;
+    for (VarIntWritable value : values) {
+      sum += value.get();
+    }
+    result.set(sum);
+    context.write(key, result);
+  }
+
+}

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+public final class ConditionalEntropyTest extends MahoutTestCase {
+
+  @Test
+  public void testConditionalEntropy() throws Exception {
+
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+    Path output = getTestTempFilePath("output");
+
+    // create input
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int i = 0; i < keys.length; i++) {
+        writer.append(new Text(keys[i]), new Text(values[i]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // run the job
+    Tool job = new ConditionalEntropy();
+    String[] args = { "-i", input.toString(), "-o", output.toString() };
+    ToolRunner.run(job, args);
+
+    // check the output
+    Iterator<DoubleWritable> iteratorNodes =
+        new SequenceFileDirValueIterator<DoubleWritable>(output,
+                                                         PathType.LIST,
+                                                         PathFilters.logsCRCFilter(),
+                                                         null,
+                                                         false,
+                                                         new Configuration());
+    while (iteratorNodes.hasNext()) {
+      assertEquals(0.5, iteratorNodes.next().get(), EPSILON);
+    }
+
+  }
+
+}

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+public final class EntropyTest extends MahoutTestCase {
+
+  @Test
+  public void testLetters() throws Exception {
+    String[] content = { "A", "A", "A", "A", "A", "B", "B", "C", "D", "E" };
+    calculateEntropy(content, 1.96096405, "key");
+  }
+
+  @Test
+  public void testYN() throws Exception {
+    String[] content = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    calculateEntropy(content, 1.0, "value");
+  }
+
+  private void calculateEntropy(String[] content, double expected, String source) throws Exception {
+
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+    Path output = getTestTempFilePath("output");
+
+    // write content into test text file
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    Writable empty = new Text();
+    try {
+      for (String item : content) {
+        if ("key".equals(source)) {
+          writer.append(new Text(item), empty);
+        } else {
+          writer.append(empty, new Text(item));
+        }
+
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // run the job
+    String[] args = { "-i", input.toString(), "-o", output.toString(), "-s", source };
+    Entropy job = new Entropy();
+    ToolRunner.run(job, args);
+
+    assertEquals(content.length, job.getNumberItems());
+
+    // check output
+    Iterator<DoubleWritable> iteratorNodes =
+        new SequenceFileDirValueIterator<DoubleWritable>(output,
+                                                         PathType.LIST,
+                                                         PathFilters.logsCRCFilter(),
+                                                         null,
+                                                         false,
+                                                         new Configuration());
+    assertTrue(iteratorNodes.hasNext());
+    assertEquals(expected, iteratorNodes.next().get(), EPSILON);
+  }
+
+}

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class InformationGainRatioTest extends MahoutTestCase {
+
+  @Test
+  public void testInformationGain() throws Exception {
+
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+
+    // create input
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int i = 0; i < keys.length; i++) {
+        writer.append(new Text(keys[i]), new Text(values[i]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // run the job
+    InformationGainRatio job = new InformationGainRatio();
+    String[] args = { "-i", input.toString() };
+    ToolRunner.run(job, args);
+
+    // check the output
+    assertEquals(1.0, job.getEntropy(), EPSILON);
+    assertEquals(0.5, job.getInformationGain(), EPSILON);
+    assertEquals(0.5, job.getInformationGainRatio(), EPSILON);
+  }
+
+}

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java Tue Jul  5 13:54:39 2011
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class InformationGainTest extends MahoutTestCase {
+
+  @Test
+  public void testInformationGain() throws Exception {
+
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+
+    // create input
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int i = 0; i < keys.length; i++) {
+        writer.append(new Text(keys[i]), new Text(values[i]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // run the job
+    InformationGain job = new InformationGain();
+    String[] args = { "-i", input.toString() };
+    ToolRunner.run(job, args);
+
+    // check the output
+    assertEquals(1.0, job.getEntropy(), EPSILON);
+    assertEquals(0.5, job.getConditionalEntropy(), EPSILON);
+    assertEquals(0.5, job.getInformationGain(), EPSILON);
+
+  }
+
+}