You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/09/17 23:32:49 UTC
svn commit: r998335 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java
Author: tdunning
Date: Fri Sep 17 21:32:49 2010
New Revision: 998335
URL: http://svn.apache.org/viewvc?rev=998335&view=rev
Log:
Reverted to having the weight dictionary map String->weight instead of byte[]->weight
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=998335&r1=998334&r2=998335&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Fri Sep 17 21:32:49 2010
@@ -17,20 +17,22 @@
package org.apache.mahout.vectors;
+import com.google.common.base.Charsets;
+
import java.util.Collections;
import java.util.Map;
/**
- * Encodes a categorical values with an unbounded vocabulary. Values are encoding by incrementing
- * a few locations in the output vector with a weight that is either defaulted to 1 or that is
- * looked up in a weight dictionary. By default, only one probe is used which should be fine
- * but could cause a decrease in the speed of learning because more features will be non-zero.
- * If a large feature vector is used so that the probability of feature collisions is suitably
- * small, then this can be decreased to 1. If a very small feature vector is used, the number
- * of probes should probably be increased to 3.
+ * Encodes categorical values with an unbounded vocabulary. Values are encoded by incrementing a
+ * few locations in the output vector with a weight that either defaults to 1 or is looked up in
+ * a weight dictionary. By default, only one probe is used which should be fine but could
+ * cause a decrease in the speed of learning because more features will be non-zero. If a large
+ * feature vector is used so that the probability of feature collisions is suitably small, then this
+ * can be decreased to 1. If a very small feature vector is used, the number of probes should
+ * probably be increased to 3.
*/
public class StaticWordValueEncoder extends WordValueEncoder {
- private Map<byte[], Double> dictionary;
+ private Map<String, Double> dictionary;
private double missingValueWeight = 1;
private byte[] nameBytes;
@@ -44,19 +46,21 @@ public class StaticWordValueEncoder exte
return hash(nameBytes, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
}
- /**
- * Sets the weighting dictionary to be used by this encoder. Also sets
- * the missing value weight to be half the smallest weight in the dictionary.
- * @param dictionary The dictionary to use to look up weights.
+ /**
+ * Sets the weighting dictionary to be used by this encoder. Also sets the missing value weight
+ * to be half the smallest weight in the dictionary.
+ *
+ * @param dictionary The dictionary to use to look up weights.
*/
- public void setDictionary(Map<byte[], Double> dictionary) {
+ public void setDictionary(Map<String, Double> dictionary) {
this.dictionary = dictionary;
missingValueWeight = Collections.min(dictionary.values()) / 2;
}
/**
* Sets the weight that is to be used for values that do not appear in the dictionary.
- * @param missingValueWeight The default weight for missing values.
+ *
+ * @param missingValueWeight The default weight for missing values.
*/
public void setMissingValueWeight(double missingValueWeight) {
this.missingValueWeight = missingValueWeight;
@@ -65,8 +69,11 @@ public class StaticWordValueEncoder exte
@Override
protected double weight(byte[] originalForm) {
double weight = missingValueWeight;
- if (dictionary != null && dictionary.containsKey(originalForm)) {
- weight = dictionary.get(originalForm);
+ if (dictionary != null) {
+ String s = new String(originalForm, Charsets.UTF_8);
+ if (dictionary.containsKey(s)) {
+ weight = dictionary.get(s);
+ }
}
return weight;
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java?rev=998335&r1=998334&r2=998335&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java Fri Sep 17 21:32:49 2010
@@ -56,12 +56,11 @@ public final class WordLikeValueEncoderT
@Test
public void testStaticWeights() {
StaticWordValueEncoder enc = new StaticWordValueEncoder("word");
- enc.setDictionary(ImmutableMap.<byte[], Double>of("word1".getBytes(Charsets.UTF_8), 3.0, "word2".getBytes(Charsets.UTF_8), 1.5));
+ enc.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
Vector v = new DenseVector(200);
enc.addToVector("word1", v);
enc.addToVector("word2", v);
enc.addToVector("word3", v);
- enc.flush(1, v);
Iterator<Vector.Element> i = v.iterateNonZero();
Iterator<Integer> j = ImmutableList.of(7, 101, 118, 119, 152, 199).iterator();
Iterator<Double> k = ImmutableList.of(3.0, 0.75, 1.5, 1.5, 0.75, 3.0).iterator();
@@ -72,7 +71,7 @@ public final class WordLikeValueEncoderT
i = v.iterateNonZero();
while (i.hasNext()) {
Vector.Element element = i.next();
- assertEquals(k.next(), element.get(), 0);
+ assertEquals(String.format("checking v[%d]", element.index()), k.next(), element.get(), 0);
}
assertFalse(j.hasNext());
}