You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/09/17 23:32:49 UTC

svn commit: r998335 - in /mahout/trunk/core/src: main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java

Author: tdunning
Date: Fri Sep 17 21:32:49 2010
New Revision: 998335

URL: http://svn.apache.org/viewvc?rev=998335&view=rev
Log:
Reverted to having weight dictionary be String->weight instead of byte[]

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
    mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=998335&r1=998334&r2=998335&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Fri Sep 17 21:32:49 2010
@@ -17,20 +17,22 @@
 
 package org.apache.mahout.vectors;
 
+import com.google.common.base.Charsets;
+
 import java.util.Collections;
 import java.util.Map;
 
 /**
- * Encodes a categorical values with an unbounded vocabulary.  Values are encoding by incrementing
- * a few locations in the output vector with a weight that is either defaulted to 1 or that is
- * looked up in a weight dictionary.  By default, only one probe is used which should be fine
- * but could cause a decrease in the speed of learning because more features will be non-zero.
- * If a large feature vector is used so that the probability of feature collisions is suitably
- * small, then this can be decreased to 1.  If a very small feature vector is used, the number
- * of probes should probably be increased to 3.
+ * Encodes categorical values with an unbounded vocabulary.  Values are encoded by incrementing a
+ * few locations in the output vector with a weight that is either defaulted to 1 or that is looked
+ * up in a weight dictionary.  By default, only one probe is used which should be fine but could
+ * cause a decrease in the speed of learning because more features will be non-zero. If a large
+ * feature vector is used so that the probability of feature collisions is suitably small, then this
+ * can be decreased to 1.  If a very small feature vector is used, the number of probes should
+ * probably be increased to 3.
  */
 public class StaticWordValueEncoder extends WordValueEncoder {
-  private Map<byte[], Double> dictionary;
+  private Map<String, Double> dictionary;
   private double missingValueWeight = 1;
   private byte[] nameBytes;
 
@@ -44,19 +46,21 @@ public class StaticWordValueEncoder exte
     return hash(nameBytes, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
   }
 
-   /**
-   * Sets the weighting dictionary to be used by this encoder.  Also sets
-   * the missing value weight to be half the smallest weight in the dictionary.
-   * @param dictionary  The dictionary to use to look up weights.
+  /**
+   * Sets the weighting dictionary to be used by this encoder.  Also sets the missing value weight
+   * to be half the smallest weight in the dictionary.
+   *
+   * @param dictionary The dictionary to use to look up weights.
    */
-  public void setDictionary(Map<byte[], Double> dictionary) {
+  public void setDictionary(Map<String, Double> dictionary) {
     this.dictionary = dictionary;
     missingValueWeight = Collections.min(dictionary.values()) / 2;
   }
 
   /**
    * Sets the weight that is to be used for values that do not appear in the dictionary.
-   * @param missingValueWeight  The default weight for missing values.
+   *
+   * @param missingValueWeight The default weight for missing values.
    */
   public void setMissingValueWeight(double missingValueWeight) {
     this.missingValueWeight = missingValueWeight;
@@ -65,8 +69,11 @@ public class StaticWordValueEncoder exte
   @Override
   protected double weight(byte[] originalForm) {
     double weight = missingValueWeight;
-    if (dictionary != null && dictionary.containsKey(originalForm)) {
-      weight = dictionary.get(originalForm);
+    if (dictionary != null) {
+      String s = new String(originalForm, Charsets.UTF_8);
+      if (dictionary.containsKey(s)) {
+        weight = dictionary.get(s);
+      }
     }
     return weight;
   }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java?rev=998335&r1=998334&r2=998335&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectors/WordLikeValueEncoderTest.java Fri Sep 17 21:32:49 2010
@@ -56,12 +56,11 @@ public final class WordLikeValueEncoderT
   @Test
   public void testStaticWeights() {
     StaticWordValueEncoder enc = new StaticWordValueEncoder("word");
-    enc.setDictionary(ImmutableMap.<byte[], Double>of("word1".getBytes(Charsets.UTF_8), 3.0, "word2".getBytes(Charsets.UTF_8), 1.5));
+    enc.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
     Vector v = new DenseVector(200);
     enc.addToVector("word1", v);
     enc.addToVector("word2", v);
     enc.addToVector("word3", v);
-    enc.flush(1, v);
     Iterator<Vector.Element> i = v.iterateNonZero();
     Iterator<Integer> j = ImmutableList.of(7, 101, 118, 119, 152, 199).iterator();
     Iterator<Double> k = ImmutableList.of(3.0, 0.75, 1.5, 1.5, 0.75, 3.0).iterator();
@@ -72,7 +71,7 @@ public final class WordLikeValueEncoderT
     i = v.iterateNonZero();
     while (i.hasNext()) {
       Vector.Element element = i.next();
-      assertEquals(k.next(), element.get(), 0);
+      assertEquals(String.format("checking v[%d]", element.index()), k.next(), element.get(), 0);
     }
     assertFalse(j.hasNext());
   }