You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/07/05 15:54:40 UTC
svn commit: r1143063 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/math/stats/entropy/
test/java/org/apache/mahout/math/stats/entropy/
Author: srowen
Date: Tue Jul 5 13:54:39 2011
New Revision: 1143063
URL: http://svn.apache.org/viewvc?rev=1143063&view=rev
Log:
MAHOUT-747 entropy calculations on Hadoop
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/
mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Calculates the entropy for the value with H(x) = x * log(x)
+ */
+public final class CalculateEntropyMapper extends Mapper<Text, VarIntWritable, NullWritable, DoubleWritable> {
+
+ private final DoubleWritable result = new DoubleWritable();
+
+ @Override
+ protected void map(Text key, VarIntWritable value, Context context) throws IOException, InterruptedException {
+ result.set(value.get() * Math.log(value.get()));
+ context.write(NullWritable.get(), result);
+ }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Sums the partial terms {@code x_i * ln(x_i)} and emits the entropy in bits, {@code (ln(n) - sum / n) / ln(2)}.
+ */
+public final class CalculateEntropyReducer
+    extends Reducer<NullWritable, DoubleWritable, NullWritable, DoubleWritable> {
+
+  private static final double LOG_2 = Math.log(2.0);
+  private final DoubleWritable result = new DoubleWritable();
+  private long numberItems;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    String configured = context.getConfiguration().get(Entropy.NUMBER_ITEMS_PARAM);
+    numberItems = Long.parseLong(configured);
+  }
+
+  @Override
+  protected void reduce(NullWritable key, Iterable<DoubleWritable> values, Context context)
+      throws IOException, InterruptedException {
+    double partialSum = 0.0;
+    for (DoubleWritable value : values) {
+      partialSum += value.get();
+    }
+    double nats = Math.log(numberItems) - partialSum / numberItems;
+    result.set(nats / LOG_2);
+    context.write(key, result);
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateSpecificConditionalEntropyMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+
+/**
+ * Drops the incoming key: forwards each value unchanged under the {@link NullWritable} key.
+ */
+public final class CalculateSpecificConditionalEntropyMapper
+    extends Mapper<Text, DoubleWritable, NullWritable, DoubleWritable> {
+
+  @Override
+  protected void map(Text key, DoubleWritable value, Context context) throws IOException, InterruptedException {
+    NullWritable droppedKey = NullWritable.get();
+    context.write(droppedKey, value);
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ConditionalEntropy.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * A Hadoop job to compute the conditional entropy H(Value|Key) for a sequence file.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * <li>-o The output sequence file</li>
+ * </ul>
+ */
+public final class ConditionalEntropy extends AbstractJob {
+
+  private long numberItems;
+
+  private Path keyValueCountPath;
+  private Path specificConditionalEntropyPath;
+
+  private static final String KEY_VALUE_COUNT_FILE = "key_value_count";
+  private static final String SPECIFIC_CONDITIONAL_ENTROPY_FILE = "specific_conditional_entropy";
+  static final String NUMBER_ITEMS_PARAM = "items.number";
+
+  /**
+   * Command-line entry point. Runs this job through {@link ToolRunner}.
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ConditionalEntropy(), args);
+  }
+
+  /** Parses the arguments, then runs the three chained jobs in order; returns 0 once all calls have returned. */
+  @Override
+  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+    prepareArguments(args);
+    groupAndCountByKeyAndValue();
+    calculateSpecificConditionalEntropy();
+    calculateConditionalEntropy();
+    return 0;
+  }
+
+  /**
+   * Registers the input and output options, parses {@code args} and derives the two time-stamped temp paths.
+   */
+  private void prepareArguments(String[] args) throws IOException {
+    addInputOption();
+    addOutputOption();
+    parseArguments(args);
+    keyValueCountPath = new Path(getTempPath(), KEY_VALUE_COUNT_FILE + '-' + System.currentTimeMillis());
+    specificConditionalEntropyPath =
+        new Path(getTempPath(), SPECIFIC_CONDITIONAL_ENTROPY_FILE + '_' + System.currentTimeMillis());
+  }
+
+  /**
+   * Groups and counts by key and value, and records the number of map input records in {@code numberItems}.
+   * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
+   */
+  private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {
+    Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class,
+        GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class, VarIntSumReducer.class,
+        StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class);
+    job.setCombinerClass(VarIntSumReducer.class);
+    job.waitForCompletion(true);
+
+    // The map input record counter of this first pass is used as the total number of items.
+    numberItems =
+        job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
+  }
+
+  /**
+   * Runs the specific-conditional-entropy mapper and reducer over the key/value counts.
+   * The total number of items is handed to the job through {@link #NUMBER_ITEMS_PARAM} for normalizing.
+   */
+  private void calculateSpecificConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+    Job job = prepareJob(keyValueCountPath, specificConditionalEntropyPath, SequenceFileInputFormat.class,
+        SpecificConditionalEntropyMapper.class, Text.class, VarIntWritable.class,
+        SpecificConditionalEntropyReducer.class, Text.class, DoubleWritable.class,
+        SequenceFileOutputFormat.class);
+    job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
+    job.waitForCompletion(true);
+  }
+
+  /**
+   * Sums the calculated specific conditional entropy values, using {@link DoubleSumReducer} as both
+   * combiner and reducer. Output is in the value.
+   */
+  private void calculateConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+    Job job = prepareJob(specificConditionalEntropyPath, getOutputPath(), SequenceFileInputFormat.class,
+        CalculateSpecificConditionalEntropyMapper.class, NullWritable.class, DoubleWritable.class,
+        DoubleSumReducer.class, NullWritable.class, DoubleWritable.class,
+        SequenceFileOutputFormat.class);
+    job.setCombinerClass(DoubleSumReducer.class);
+    job.waitForCompletion(true);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Analog of {@link org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer} for doubles: emits, per key, the sum of
+ * all {@link DoubleWritable} values received for that key.
+ */
+public final class DoubleSumReducer extends Reducer<Writable, DoubleWritable, Writable, DoubleWritable> {
+
+  private final DoubleWritable result = new DoubleWritable();
+
+  @Override
+  protected void reduce(Writable key, Iterable<DoubleWritable> values, Context context)
+      throws IOException, InterruptedException {
+    double total = 0.0;
+    for (DoubleWritable addend : values) {
+      total += addend.get();
+    }
+    result.set(total);
+    context.write(key, result);
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * A Hadoop job to compute the entropy of keys or values in a {@link SequenceFile}. Format has to be {@link Text} for
+ * key or value.
+ * <p/>
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * <li>-o The output sequence file</li>
+ * <li>-s The source. Can be {@code key} or {@code value}. Default is {@code key}</li>
+ * </ul>
+ */
+public final class Entropy extends AbstractJob {
+
+  /** Output of the counting pass and input of the entropy pass. */
+  private Path tempPath;
+  private long numberItems;
+  /** Value of the {@code --source} option; {@code "key"} selects the keys, anything else the values. */
+  private String source;
+
+  private static final String TEMP_FILE = "temp";
+  static final String NUMBER_ITEMS_PARAM = "number.items";
+
+  /**
+   * Command-line entry point. Runs this job through {@link ToolRunner}.
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Entropy(), args);
+  }
+
+  /**
+   * Returns the number of elements in the file. Only works after run.
+   *
+   * @return The number of processed items
+   */
+  public long getNumberItems() {
+    return numberItems;
+  }
+
+  /**
+   * Parses the arguments, counts the items and computes their entropy.
+   *
+   * @param args the command-line arguments ({@code -i}, {@code -o} and optionally {@code -s})
+   * @return 0, the conventional success code, as the other jobs in this package do
+   */
+  @Override
+  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+    prepareArguments(args);
+    groupAndCount();
+    calculateEntropy();
+    return 0;
+  }
+
+  /**
+   * Registers the input, output and source options, parses the arguments and derives the temp path.
+   *
+   * @param args the command-line arguments to parse
+   */
+  private void prepareArguments(String[] args) throws IOException {
+    addInputOption();
+    addOutputOption();
+    addOption("source", "s", "Sets, if the entropy is calculated for the keys or the values. Can be <key> or <value>"
+        , "key");
+
+    Map<String, String> arguments = parseArguments(args);
+    source = arguments.get("--source");
+    tempPath = new Path(getTempPath(), TEMP_FILE + '-' + System.currentTimeMillis());
+  }
+
+  /**
+   * Groups the items and counts the occurrences of each of them. The number of map input records of this
+   * pass is stored as the total number of items.
+   * SQL-like: SELECT item, COUNT(*) FROM x GROUP BY item
+   */
+  private void groupAndCount() throws IOException, ClassNotFoundException, InterruptedException {
+    // Only the exact string "key" selects the key counter; every other source counts the values.
+    Class<? extends Mapper> mapper = "key".equals(source) ? KeyCounterMapper.class : ValueCounterMapper.class;
+
+    Job job = prepareJob(getInputPath(), tempPath, SequenceFileInputFormat.class, mapper, Text.class,
+        VarIntWritable.class, VarIntSumReducer.class, Text.class, VarIntWritable.class,
+        SequenceFileOutputFormat.class);
+    job.setCombinerClass(VarIntSumReducer.class);
+    job.waitForCompletion(true);
+
+    numberItems =
+        job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
+  }
+
+  /**
+   * Calculates the entropy with
+   * <p/>
+   * H(X) = -sum_i(x_i/n * log_2(x_i/n)) WITH n = sum_i(x_i)
+   * = -sum_i(x_i/n * (log_2(x_i) - log_2(n)))
+   * = -sum_i(x_i/n * log_2(x_i)) + sum_i(x_i/n * log_2(n))
+   * = (n * log_2(n) - sum_i(x_i * log_2(x_i))) / n
+   * = log_2(n) - sum_i(x_i * log_2(x_i)) / n
+   * = (log(n) - sum_i(x_i * log(x_i)) / n) / log(2)
+   */
+  private void calculateEntropy() throws IOException, ClassNotFoundException, InterruptedException {
+    Job job = prepareJob(tempPath, getOutputPath(), SequenceFileInputFormat.class, CalculateEntropyMapper.class,
+        NullWritable.class, DoubleWritable.class, CalculateEntropyReducer.class, NullWritable.class,
+        DoubleWritable.class, SequenceFileOutputFormat.class);
+    // The reducer reads n, the total number of items, from the job configuration.
+    job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
+    job.setCombinerClass(DoubleSumReducer.class);
+    job.waitForCompletion(true);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/GroupAndCountByKeyAndValueMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Merges each input key and value into one {@link StringTuple} (key first, value second) and emits that tuple
+ * with a {@link VarIntWritable} count of 1.
+ */
+public final class GroupAndCountByKeyAndValueMapper extends Mapper<Text, Text, StringTuple, VarIntWritable> {
+
+  private static final VarIntWritable ONE = new VarIntWritable(1);
+
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+    String keyText = key.toString();
+    StringTuple keyAndValue = new StringTuple(keyText);
+    keyAndValue.add(value.toString());
+    context.write(keyAndValue, ONE);
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Calculates the information gain for a {@link SequenceFile}.
+ * Computes, how 'useful' are the keys when predicting the values.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * </ul>
+ */
+public final class InformationGain extends AbstractJob {
+
+  private static final String ENTROPY_FILE = "entropy";
+  private static final String CONDITIONAL_ENTROPY_FILE = "conditional_entropy";
+
+  private Path entropyPath;
+  private Path conditionalEntropyPath;
+  private double entropy;
+  private double conditionalEntropy;
+  private double informationGain;
+
+  /** Command-line entry point. Runs this job through {@link ToolRunner}. */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new InformationGain(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    prepareArguments(args);
+    calculateEntropy();
+    calculateConditionalEntropy();
+    calculateInformationGain();
+    return 0;
+  }
+
+  /** Returns the entropy of the values as read from the {@link Entropy} job output; set by {@code run}. */
+  public double getEntropy() {
+    return entropy;
+  }
+
+  /** Returns the value read from the {@link ConditionalEntropy} job output; set by {@code run}. */
+  public double getConditionalEntropy() {
+    return conditionalEntropy;
+  }
+
+  /** Returns the entropy minus the conditional entropy; set by {@code run}. */
+  public double getInformationGain() {
+    return informationGain;
+  }
+
+  /**
+   * Prepares and sets the arguments.
+   */
+  private void prepareArguments(String[] args) throws IOException {
+    addInputOption();
+    parseArguments(args);
+    entropyPath = new Path(getTempPath(), ENTROPY_FILE + '-' + System.currentTimeMillis());
+    conditionalEntropyPath = new Path(getTempPath(), CONDITIONAL_ENTROPY_FILE + '-' + System.currentTimeMillis());
+  }
+
+  private void calculateEntropy() throws Exception {
+    String[] args = { "-i", getInputPath().toString(), "-o", entropyPath.toString(), "-s", "value" };
+    ToolRunner.run(new Entropy(), args);
+    entropy = readDoubleFromPath(entropyPath);
+  }
+
+  private void calculateConditionalEntropy() throws Exception {
+    String[] args = { "-i", getInputPath().toString(), "-o", conditionalEntropyPath.toString() };
+    ToolRunner.run(new ConditionalEntropy(), args);
+    conditionalEntropy = readDoubleFromPath(conditionalEntropyPath);
+  }
+
+  private void calculateInformationGain() {
+    informationGain = entropy - conditionalEntropy;
+  }
+
+  /** Returns the first value the iterator yields for {@code path}; fails if there is none. */
+  private static double readDoubleFromPath(Path path) throws IOException {
+    Iterator<DoubleWritable> iteratorNodes =
+        new SequenceFileDirValueIterator<DoubleWritable>(path, PathType.LIST, PathFilters.logsCRCFilter(),
+            null, false, new Configuration());
+    if (!iteratorNodes.hasNext()) {
+      throw new IllegalArgumentException("Can't read double value from " + path.toString());
+    }
+    return iteratorNodes.next().get();
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGainRatio.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+
+/**
+ * A job to calculate the normalized information gain, i.e. the information gain divided by the entropy.
+ * <ul>
+ * <li>-i The input sequence file</li>
+ * </ul>
+ */
+public final class InformationGainRatio extends AbstractJob {
+
+  private double entropy;
+  private double informationGain;
+  private double informationGainRatio;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new InformationGainRatio(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    InformationGain gainJob = new InformationGain();
+    ToolRunner.run(gainJob, args);
+    // Both figures are taken from the delegate job after it has been run.
+    entropy = gainJob.getEntropy();
+    informationGain = gainJob.getInformationGain();
+    informationGainRatio = informationGain / entropy;
+    return 0;
+  }
+
+  public double getEntropy() {
+    return entropy;
+  }
+
+  public double getInformationGain() {
+    return informationGain;
+  }
+
+  public double getInformationGainRatio() {
+    return informationGainRatio;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/KeyCounterMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Counts keys: every input record is mapped to (key, 1), the 1 being a {@link VarIntWritable}; values are ignored.
+ */
+public final class KeyCounterMapper extends Mapper<Writable, Object, Writable, VarIntWritable> {
+
+  private static final VarIntWritable SINGLE_OCCURRENCE = new VarIntWritable(1);
+
+  @Override
+  protected void map(Writable key, Object value, Context context) throws IOException, InterruptedException {
+    context.write(key, SINGLE_OCCURRENCE);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Re-keys the input: the {@link StringTuple} [key, value] is replaced by a {@link Text} holding only the key.
+ */
+public class SpecificConditionalEntropyMapper extends Mapper<StringTuple, VarIntWritable, Text, VarIntWritable> {
+
+  private final Text keyOnly = new Text();
+
+  @Override
+  protected void map(StringTuple key, VarIntWritable value, Context context) throws IOException, InterruptedException {
+    String firstEntry = key.stringAt(0);
+    keyOnly.set(firstEntry);
+    context.write(keyOnly, value);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/SpecificConditionalEntropyReducer.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Does the weighted conditional entropy calculation for one key, n being the configured number of items, with
+ * <p/>
+ * H(values|key) = - p(key) * sum_i(p(values_i|key) * log_2(p(values_i|key)))
+ *               = p(key) * (log_2(|key|) - sum_i(values_i * log_2(values_i)) / |key|)
+ *               = (sum * log_2(sum) - sum_i(values_i * log_2(values_i))) / n WITH sum = sum_i(values_i) = |key|
+ *               = (sum * log(sum) - sum_i(values_i * log(values_i))) / (n * log(2))
+ */
+public final class SpecificConditionalEntropyReducer extends Reducer<Text, VarIntWritable, Text, DoubleWritable> {
+
+  private final DoubleWritable result = new DoubleWritable();
+  private double numberItemsLog2; // n * log(2): the divisor which also converts natural logarithms to base 2
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    numberItemsLog2 =
+      Math.log(2) * Integer.parseInt(context.getConfiguration().get(ConditionalEntropy.NUMBER_ITEMS_PARAM));
+  }
+
+  @Override
+  protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
+    throws IOException, InterruptedException {
+    double sum = 0.0;
+    double entropy = 0.0;
+    for (VarIntWritable value : values) {
+      int valueInt = value.get();
+      sum += valueInt;
+      entropy += valueInt == 0 ? 0.0 : valueInt * Math.log(valueInt); // 0 * log(0) counts as 0, not NaN
+    }
+    result.set(sum == 0.0 ? 0.0 : (sum * Math.log(sum) - entropy) / numberItemsLog2); // same for the sum
+    context.write(key, result);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/ValueCounterMapper.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Counts values: every input record is mapped to (value, 1), the 1 being a {@link VarIntWritable}; keys are ignored.
+ */
+public final class ValueCounterMapper extends Mapper<Object, Writable, Writable, VarIntWritable> {
+
+  private static final VarIntWritable SINGLE_OCCURRENCE = new VarIntWritable(1);
+
+  @Override
+  public void map(Object key, Writable value, Context context) throws IOException, InterruptedException {
+    context.write(value, SINGLE_OCCURRENCE);
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/VarIntSumReducer.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Sums the {@link VarIntWritable} values per key, the way Hadoop's IntSumReducer does for IntWritable values.
+ */
+public final class VarIntSumReducer extends Reducer<Writable, VarIntWritable, Writable, VarIntWritable> {
+
+  private final VarIntWritable total = new VarIntWritable();
+
+  @Override
+  protected void reduce(Writable key, Iterable<VarIntWritable> values, Context context)
+    throws IOException, InterruptedException {
+    int accumulated = 0; // plain int arithmetic: wraps silently on overflow
+    for (VarIntWritable count : values) {
+      accumulated = accumulated + count.get();
+    }
+    total.set(accumulated);
+    context.write(key, total);
+  }
+
+}
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/ConditionalEntropyTest.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+public final class ConditionalEntropyTest extends MahoutTestCase {
+
+  /** Runs {@link ConditionalEntropy} on eight (key, value) records and expects 0.5 for every output value. */
+  @Test
+  public void testConditionalEntropy() throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+    Path output = getTestTempFilePath("output");
+
+    // create input
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int i = 0; i < keys.length; i++) {
+        writer.append(new Text(keys[i]), new Text(values[i]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // run the job
+    Tool job = new ConditionalEntropy();
+    String[] args = { "-i", input.toString(), "-o", output.toString() };
+    ToolRunner.run(job, args);
+
+    // check the output; it must not be empty, otherwise the loop below would pass without checking anything
+    Iterator<DoubleWritable> iteratorNodes =
+        new SequenceFileDirValueIterator<DoubleWritable>(output,
+                                                         PathType.LIST,
+                                                         PathFilters.logsCRCFilter(),
+                                                         null,
+                                                         false,
+                                                         new Configuration());
+    assertTrue(iteratorNodes.hasNext());
+    while (iteratorNodes.hasNext()) {
+      assertEquals(0.5, iteratorNodes.next().get(), EPSILON);
+    }
+  }
+
+}
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/EntropyTest.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+public final class EntropyTest extends MahoutTestCase {
+
+  /** Five distinct letters with the frequencies 5, 2, 1, 1 and 1, handed to the job as keys. */
+  @Test
+  public void testLetters() throws Exception {
+    String[] content = { "A", "A", "A", "A", "A", "B", "B", "C", "D", "E" };
+    calculateEntropy(content, 1.96096405, "key");
+  }
+
+  /** Four times "Yes" and four times "No", handed to the job as values. */
+  @Test
+  public void testYN() throws Exception {
+    String[] content = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    calculateEntropy(content, 1.0, "value");
+  }
+
+  /**
+   * Writes content to a sequence file (as keys if source is "key", as values otherwise), runs {@link Entropy}
+   * with that source and compares the item count and the first output value with the expectation.
+   */
+  private void calculateEntropy(String[] content, double expected, String source) throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+    Path output = getTestTempFilePath("output");
+
+    // each item fills either the key or the value slot of a record; the other slot holds an empty Text
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    Writable empty = new Text();
+    boolean itemsAreKeys = "key".equals(source);
+    try {
+      for (String item : content) {
+        Writable wrapped = new Text(item);
+        writer.append(itemsAreKeys ? wrapped : empty, itemsAreKeys ? empty : wrapped);
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // launch the entropy job on the file just written
+    Entropy job = new Entropy();
+    ToolRunner.run(job, new String[] { "-i", input.toString(), "-o", output.toString(), "-s", source });
+    assertEquals(content.length, job.getNumberItems());
+
+    // the first value found in the output directory has to be the expected entropy
+    Iterator<DoubleWritable> entropies =
+        new SequenceFileDirValueIterator<DoubleWritable>(output,
+                                                         PathType.LIST,
+                                                         PathFilters.logsCRCFilter(),
+                                                         null,
+                                                         false,
+                                                         new Configuration());
+    assertTrue(entropies.hasNext());
+    assertEquals(expected, entropies.next().get(), EPSILON);
+  }
+
+}
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainRatioTest.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class InformationGainRatioTest extends MahoutTestCase {
+
+  /** Eight (key, value) records; expects entropy 1.0, information gain 0.5 and an information gain ratio of 0.5. */
+  @Test
+  public void testInformationGain() throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+
+    // store the records in the input sequence file
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int record = 0; record < keys.length; record++) {
+        Text recordKey = new Text(keys[record]);
+        writer.append(recordKey, new Text(values[record]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // the job is given nothing but the input path
+    InformationGainRatio job = new InformationGainRatio();
+    ToolRunner.run(job, new String[] { "-i", input.toString() });
+
+    // verify the three figures offered by the job's getters
+    assertEquals(1.0, job.getEntropy(), EPSILON);
+    assertEquals(0.5, job.getInformationGain(), EPSILON);
+    assertEquals(0.5, job.getInformationGainRatio(), EPSILON);
+  }
+
+}
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java?rev=1143063&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/stats/entropy/InformationGainTest.java Tue Jul 5 13:54:39 2011
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats.entropy;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+public final class InformationGainTest extends MahoutTestCase {
+
+  /** Eight (key, value) records; expects entropy 1.0, conditional entropy 0.5 and information gain 0.5. */
+  @Test
+  public void testInformationGain() throws Exception {
+    Configuration configuration = new Configuration();
+    FileSystem fileSystem = FileSystem.get(configuration);
+    Path input = getTestTempFilePath("input");
+
+    // store the records in the input sequence file
+    String[] keys = { "Math", "History", "CS", "Math", "Math", "CS", "History", "Math" };
+    String[] values = { "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes" };
+    SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, configuration, input, Text.class, Text.class);
+    try {
+      for (int record = 0; record < keys.length; record++) {
+        Text recordKey = new Text(keys[record]);
+        writer.append(recordKey, new Text(values[record]));
+      }
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
+
+    // the job is given nothing but the input path
+    InformationGain job = new InformationGain();
+    String[] jobArguments = { "-i", input.toString() };
+    ToolRunner.run(job, jobArguments);
+
+    // verify the three figures offered by the job's getters
+    assertEquals(1.0, job.getEntropy(), EPSILON);
+    assertEquals(0.5, job.getConditionalEntropy(), EPSILON);
+    assertEquals(0.5, job.getInformationGain(), EPSILON);
+  }
+
+}