You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/08/30 21:34:10 UTC

svn commit: r990913 - in /mahout/trunk/core/src: main/java/org/apache/mahout/vectors/ test/java/org/apache/mahout/vectors/

Author: tdunning
Date: Mon Aug 30 19:34:10 2010
New Revision: 990913

URL: http://svn.apache.org/viewvc?rev=990913&view=rev
Log:
MAHOUT-492 - modified InteractionValueEncoder to be more generic, to allow interactions of any two FeatureVectorEncoders and also to allow specifying weight to apply to interaction

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
    mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -44,7 +44,17 @@ public class AdaptiveWordValueEncoder ex
   @Override
   public void addToVector(String originalForm, double weight, Vector data) {
     dictionary.add(originalForm);
-    super.addToVector(originalForm, weight, data);
+    super.addToVector(originalForm, getWeight(originalForm, weight), data);
+  }
+
+  @Override
+  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+    return super.hashForProbe(originalForm, data, name, i);
+  }
+
+  @Override
+  protected double getWeight(String originalForm, double w) {
+    return w*weight(originalForm);
   }
 
   @Override

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -32,14 +32,24 @@ public class ConstantValueEncoder extend
     int probes = getProbes();
     String name = getName();
     for (int i = 0; i < probes; i++) {
-      int n = hash(name, i, data.size());
-      trace(null, n);
-      data.set(n, data.get(n) + weight);
+        int n = hashForProbe(originalForm, data, name, i);
+        trace(null, n);
+      data.set(n, data.get(n) + getWeight(originalForm,weight));
     }
   }
 
   @Override
+  protected double getWeight(String originalForm, double w) {
+    return w;
+  }
+
+  @Override
   public String asString(String originalForm) {
     return getName();
   }
+
+  protected int hashForProbe(String originalForm, Vector data, String name, int i){
+    return hash(name, i, data.size());
+  }
+
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -39,12 +39,22 @@ public class ContinuousValueEncoder exte
     int probes = getProbes();
     String name = getName();
     for (int i = 0; i < probes; i++) {
-      int n = hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+      int n = hashForProbe(originalForm, data, name, i);
       trace(null, n);
-      data.set(n, data.get(n) + weight * Double.parseDouble(originalForm));
+      data.set(n, data.get(n) + getWeight(originalForm,weight));
     }
   }
 
+  @Override
+  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+    return hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+  }
+
+  @Override
+  protected double getWeight(String originalForm, double w) {
+    return w * Double.parseDouble(originalForm);
+  }
+
   /**
    * Converts a value into a form that would help a human understand the internals of how the value
    * is being interpreted.  For text-like things, this is likely to be a list of the terms found with

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Mon Aug 30 19:34:10 2010
@@ -21,6 +21,8 @@ import com.google.common.collect.Sets;
 import org.apache.mahout.math.Vector;
 
 import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -71,6 +73,19 @@ public abstract class FeatureVectorEncod
    */
   public abstract void addToVector(String originalForm, double weight, Vector data);
 
+  protected abstract int hashForProbe(String originalForm, Vector data, String name, int i);
+
+  protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+    List<Integer> hashes = new ArrayList<Integer>();
+    hashes.add(hashForProbe(originalForm,data,name,i));
+  return hashes;
+  }
+
+  
+  protected double getWeight(String originalFor, double w){
+    return 1.0;
+  }
+
   // ******* Utility functions used by most implementations
 
   /**

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -25,63 +25,77 @@ public class InteractionValueEncoder ext
 
   protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
   protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
+  protected FeatureVectorEncoder firstEncoder;   // instance field: a static here would be overwritten by every constructor call; supplies the weight for originalForm1
+  protected FeatureVectorEncoder secondEncoder;  // instance field, set once in the constructor; supplies the weight for originalForm2
 
-    public InteractionValueEncoder(String name) {
-       super(name, 2);
-     }
+  public InteractionValueEncoder(String name, FeatureVectorEncoder encoderOne, FeatureVectorEncoder encoderTwo) {
+    super(name, 2);
+    firstEncoder = encoderOne;
+    secondEncoder = encoderTwo;
+  }
 
   /**
    * Adds a value to a vector.
    *
    * @param originalForm The original form of the first value as a string.
-   * @param data         The vector to which the value should be added.
+   * @param data          The vector to which the value should be added.
    */
   @Override
   public void addToVector(String originalForm, double w, Vector data) {
   }
 
-     /**
-      * Adds a value to a vector.
-      *
-      * @param originalForm1 The original form of the first value as a string.
-      * @param originalForm2 The original form of the second value as a string.
-      * @param data          The vector to which the value should be added.
-      */
-     public void addInteractionToVector(String originalForm1, String originalForm2, Vector data) {
-       int probes = getProbes();
-       String name = getName();
-       for (int i = 0; i < probes; i++) {
-         int h1 = hash1(name, originalForm1, i, data.size());
-         int h2 = hash2(name, originalForm1, i, data.size());
-         int j =  hash1(name, originalForm2, i, data.size());
-         int n = (h1 + (j+1)*h2) % data.size();
-         if(n < 0){
-             n = n+data.size();
-         }
-         trace(String.format("%s:%s", originalForm1, originalForm2), n);
-         data.set(n, data.get(n) + 1);
-       }
-     }
+  /**
+   * Adds a value to a vector.
+   *
+   * @param originalForm1 The original form of the first value as a string.
+   * @param originalForm2 The original form of the second value as a string.
+   * @param data          The vector to which the value should be added.
+   */
+  public void addInteractionToVector(String originalForm1, String originalForm2, double weight, Vector data) {
+    int probes = getProbes();
+    String name = getName();
+    double w = getWeight(originalForm1, originalForm2, weight);
+    for (int i = 0; i < probes; i++) {
+      int h1 = firstEncoder.hashForProbe(originalForm1, data, name, i);
+      int h2 = secondEncoder.hashForProbe(originalForm1, data, name, i);
+      int j =  firstEncoder.hashForProbe(originalForm2, data, name, i);
+      int n = (h1 + (j+1)*h2) % data.size();
+      if(n < 0){
+        n = n+data.size();
+      }
+      trace(String.format("%s:%s", originalForm1, originalForm2), n);
+      data.set(n, data.get(n) + w);
+    }
+  }
+
+  protected double getWeight(String originalForm1, String originalForm2, double w) {
+    return firstEncoder.getWeight(originalForm1, 1.0) * secondEncoder.getWeight(originalForm2,1.0) * w;
+  }
 
   /**
-   * Converts a value into a form that would help a human understand the internals of how the
-   * value is being interpreted.  For text-like things, this is likely to be a list of the terms
-   * found with associated weights (if any).
+   * Converts a value into a form that would help a human understand the internals of how the value
+   * is being interpreted.  For text-like things, this is likely to be a list of the terms found with
+   * associated weights (if any).
    *
    * @param originalForm The original form of the value as a string.
    * @return A string that a human can read.
    */
   @Override
   public String asString(String originalForm) {
-    return String.format(Locale.ENGLISH, "%s:%s", getName(), originalForm);
+    return String.format("%s:%s", getName(), originalForm);
+  }
+
+  @Override
+  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+    return hash(name, i, data.size());
   }
 
   protected int hash1(String term1, String term2, int probe, int numFeatures) {
-    return hash(term1, term2, probe + INTERACTION_VALUE_HASH_SEED_1, numFeatures);
+    return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_1,numFeatures);
   }
 
   protected int hash2(String term1, String term2, int probe, int numFeatures) {
-    return hash(term1, term2, probe + INTERACTION_VALUE_HASH_SEED_2, numFeatures);
+    return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_2,numFeatures);
   }
 }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.vectors;
 
+import org.apache.mahout.math.Vector;
+
 import java.util.Collections;
 import java.util.Map;
 
@@ -37,7 +39,12 @@ public class StaticWordValueEncoder exte
     super(name);
   }
 
-  /**
+  @Override
+  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+    return hash(name, i, data.size());
+  }
+
+   /**
    * Sets the weighting dictionary to be used by this encoder.  Also sets
    * the missing value weight to be half the smallest weight in the dictionary.
    * @param dictionary  The dictionary to use to look up weights.

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -20,6 +20,8 @@ package org.apache.mahout.vectors;
 import com.google.common.base.Splitter;
 import org.apache.mahout.math.Vector;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Pattern;
 
 /**
@@ -50,7 +52,21 @@ public class TextValueEncoder extends Fe
     }
   }
 
-  private Iterable<String> tokenize(CharSequence originalForm) {
+    @Override
+    protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+        return 0;
+    }
+
+    protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+        List<Integer> hashes = new ArrayList<Integer>();
+        for (String word : tokenize(originalForm)){
+            hashes.add(hashForProbe(word,data,name,i));
+        }
+        return hashes;
+    }
+
+
+    private Iterable<String> tokenize(CharSequence originalForm) {
     return onNonWord.split(originalForm);
   }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -41,15 +41,25 @@ public abstract class WordValueEncoder e
   public void addToVector(String originalForm, double w, Vector data) {
     int probes = getProbes();
     String name = getName();
-    double weight = w * weight(originalForm);
+    double weight = getWeight(originalForm,w);
     for (int i = 0; i < probes; i++) {
-      int n = hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+      int n = hashForProbe(originalForm, data, name, i);
       trace(originalForm, n);
       data.set(n, data.get(n) + weight);
     }
   }
 
-  /**
+  @Override
+  protected double getWeight(String originalForm, double w) {
+    return w*weight(originalForm);    
+  }
+
+  @Override
+  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+    return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+  }
+
+    /**
    * Converts a value into a form that would help a human understand the internals of how the value
    * is being interpreted.  For text-like things, this is likely to be a list of the terms found with
    * associated weights (if any).

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java Mon Aug 30 19:34:10 2010
@@ -30,27 +30,40 @@ import static org.junit.Assert.assertEqu
 public class InteractionValueEncoderTest {
   @Test
   public void testAddToVector() {
-    InteractionValueEncoder enc = new InteractionValueEncoder("interactions");
+    WordValueEncoder wv = new StaticWordValueEncoder("word");
+    ContinuousValueEncoder cv = new ContinuousValueEncoder("cont");
+    InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv);
     Vector v1 = new DenseVector(200);
-    enc.addInteractionToVector("a","b",v1);
+    enc.addInteractionToVector("a","1.0",1.0, v1);
     int k = enc.getProbes();
     // should set k distinct locations to 1
     Assert.assertEquals((float) k, v1.norm(1), 0);
     Assert.assertEquals(1.0, v1.maxValue(), 0);
+
     // adding same interaction again should increment weights
-    enc.addInteractionToVector("a","b",v1);
+    enc.addInteractionToVector("a","1.0",1.0,v1);
     Assert.assertEquals((float) k*2, v1.norm(1), 0);
     Assert.assertEquals(2.0, v1.maxValue(), 0);
 
     Vector v2 = new DenseVector(20000);
-    StaticWordValueEncoder wordEncoder = new StaticWordValueEncoder("test");
-    enc.addInteractionToVector("a","b",v2);
-    wordEncoder.addToVector("a", v2);
-    wordEncoder.addToVector("b", v2);
+    enc.addInteractionToVector("a","1.0",1.0,v2);
+    wv.addToVector("a", v2);
+    cv.addToVector("1.0", v2);
     k = enc.getProbes();
-    int j = wordEncoder.getProbes();
     //this assumes no hash collision
-    Assert.assertEquals((float) (k + 2*j), v2.norm(1), 0);
+    Assert.assertEquals((float) (k + wv.getProbes()+cv.getProbes()), v2.norm(1), 1e-3);
   }
 
+  @Test
+  public void testaddToVectorUsesProductOfWeights(){
+    WordValueEncoder wv = new StaticWordValueEncoder("word");
+    ContinuousValueEncoder cv = new ContinuousValueEncoder("cont");
+    InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv);
+    Vector v1 = new DenseVector(200);
+    enc.addInteractionToVector("a","0.9",0.5, v1);
+    int k = enc.getProbes();
+    // should set k distinct locations to 0.9*0.5
+    Assert.assertEquals((float) k*0.5*0.9, v1.norm(1), 0);
+    Assert.assertEquals(0.5*0.9, v1.maxValue(), 0);
+  }
 }