You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/08/30 21:34:17 UTC

svn commit: r990915 - /mahout/trunk/core/src/main/java/org/apache/mahout/vectors/

Author: tdunning
Date: Mon Aug 30 19:34:16 2010
New Revision: 990915

URL: http://svn.apache.org/viewvc?rev=990915&view=rev
Log:
MAHOUT-492 - Style update and remove quadratic scaling in text interactions

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -48,13 +48,13 @@ public class AdaptiveWordValueEncoder ex
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
-    return super.hashForProbe(originalForm, data, name, i);
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+    return super.hashForProbe(originalForm, dataSize, name, probe);
   }
 
   @Override
   protected double getWeight(String originalForm, double w) {
-    return w*weight(originalForm);
+    return w * weight(originalForm);
   }
 
   @Override

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -32,7 +32,7 @@ public class ConstantValueEncoder extend
     int probes = getProbes();
     String name = getName();
     for (int i = 0; i < probes; i++) {
-        int n = hashForProbe(originalForm, data, name, i);
+        int n = hashForProbe(originalForm, data.size(), name, i);
         trace(null, n);
       data.set(n, data.get(n) + getWeight(originalForm,weight));
     }
@@ -48,8 +48,8 @@ public class ConstantValueEncoder extend
     return getName();
   }
 
-  protected int hashForProbe(String originalForm, Vector data, String name, int i){
-    return hash(name, i, data.size());
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe){
+    return hash(name, probe, dataSize);
   }
 
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -39,15 +39,15 @@ public class ContinuousValueEncoder exte
     int probes = getProbes();
     String name = getName();
     for (int i = 0; i < probes; i++) {
-      int n = hashForProbe(originalForm, data, name, i);
+      int n = hashForProbe(originalForm, data.size(), name, i);
       trace(null, n);
       data.set(n, data.get(n) + getWeight(originalForm,weight));
     }
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
-    return hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+    return hash(name, CONTINUOUS_VALUE_HASH_SEED + probe, dataSize);
   }
 
   @Override

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Mon Aug 30 19:34:16 2010
@@ -21,7 +21,9 @@ import com.google.common.collect.Sets;
 import org.apache.mahout.math.Vector;
 
 import java.nio.charset.Charset;
-import java.util.*;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
 
 /**
  * General interface for objects that record features into a feature vector.
@@ -70,10 +72,33 @@ public abstract class FeatureVectorEncod
    */
   public abstract void addToVector(String originalForm, double weight, Vector data);
 
-  protected abstract int hashForProbe(String originalForm, Vector data, String name, int i);
-
-  protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
-    return Collections.singletonList(hashForProbe(originalForm,data,name,i));
+  /**
+   * Provides the unique hash for a particular probe.  For all encoders except text, this
+   * is all that is needed and the default implementation of hashesForProbe will do the right
+   * thing.  For text and similar values, hashesForProbe should be overridden and this method
+   * should not be used.
+   *
+   * @param originalForm  The original string value
+   * @param dataSize      The length of the vector being encoded
+   * @param name          The name of the variable being encoded
+   * @param probe             The probe number
+   * @return              The hash of the current probe
+   */
+  protected abstract int hashForProbe(String originalForm, int dataSize, String name, int probe);
+
+  /**
+   * Returns all of the hashes for this probe.  For most encoders, this is a singleton, but
+   * for text, many hashes are returned, one for each word (unique or not).  Most implementations
+   * should only implement hashForProbe for simplicity.
+   *
+   * @param originalForm The original string value.
+   * @param dataSize     The length of the vector being encoded
+   * @param name         The name of the variable being encoded
+   * @param probe        The probe number
+   * @return an Iterable of the hashes
+   */
+  protected Iterable<Integer> hashesForProbe(String originalForm, int dataSize, String name, int probe) {
+    return Collections.singletonList(hashForProbe(originalForm, dataSize, name, probe));
   }
 
   protected double getWeight(String originalForm, double w){

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -17,17 +17,11 @@
 
 package org.apache.mahout.vectors;
 
-import org.apache.mahout.common.iterator.ArrayIterator;
 import org.apache.mahout.math.Vector;
 
-import java.util.ArrayList;
-
 public class InteractionValueEncoder extends FeatureVectorEncoder {
-
-  protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
-  protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
-  protected static FeatureVectorEncoder firstEncoder;
-  protected static FeatureVectorEncoder secondEncoder;
+  private static FeatureVectorEncoder firstEncoder;
+  private static FeatureVectorEncoder secondEncoder;
 
   public InteractionValueEncoder(String name, FeatureVectorEncoder encoderOne, FeatureVectorEncoder encoderTwo) {
     super(name, 2);
@@ -50,15 +44,17 @@ public class InteractionValueEncoder ext
    *
    * @param originalForm1 The original form of the first value as a string.
    * @param originalForm2 The original form of the second value as a string.
+   * @param weight        How much to weight this interaction
    * @param data          The vector to which the value should be added.
    */
   public void addInteractionToVector(String originalForm1, String originalForm2, double weight, Vector data) {
     String name = getName();
     double w = getWeight(originalForm1, originalForm2, weight);
     for (int i = 0; i < probes(); i++) {
-      for(Integer k : firstEncoder.hashesForProbe(originalForm1, data, name, i)){
-        for(Integer j : secondEncoder.hashesForProbe(originalForm2, data, name, i)){
-          int n = linearDoubleHash(hash1(k,name,i,data),hash2(k,name,i,data),j,data.size());
+      Iterable<Integer> jValues = secondEncoder.hashesForProbe(originalForm2, data.size(), name, i);
+      for(Integer k : firstEncoder.hashesForProbe(originalForm1, data.size(), name, i)){
+        for(Integer j : jValues) {
+          int n = (k + j) % data.size();
           trace(String.format("%s:%s", originalForm1, originalForm2), n);
           data.set(n, data.get(n) + w);
         }
@@ -74,14 +70,6 @@ public class InteractionValueEncoder ext
     return firstEncoder.getWeight(originalForm1, 1.0) * secondEncoder.getWeight(originalForm2, 1.0) * w;
   }
 
-  private int linearDoubleHash(int h1, int h2, int j, int modulus){
-   int n = (h1 + (j+1)*h2) % modulus;
-   if(n < 0){
-    n = n+modulus;
-   }
-   return n;
-  }
-
   /**
    * Converts a value into a form that would help a human understand the internals of how the value
    * is being interpreted.  For text-like things, this is likely to be a list of the terms found with
@@ -96,16 +84,8 @@ public class InteractionValueEncoder ext
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
-    return hash(name, i, data.size());
-  }
-
-  protected int hash1(int value, String name, int i, Vector data){
-    return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_1, data.size());
-  }
-
-  protected int hash2(int value, String name, int i, Vector data){
-    return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_2, data.size());
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+    return hash(name, probe, dataSize);
   }
 }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -17,8 +17,6 @@
 
 package org.apache.mahout.vectors;
 
-import org.apache.mahout.math.Vector;
-
 import java.util.Collections;
 import java.util.Map;
 
@@ -40,8 +38,8 @@ public class StaticWordValueEncoder exte
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
-    return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+    return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
   }
 
    /**

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -18,6 +18,8 @@
 package org.apache.mahout.vectors;
 
 import com.google.common.base.Splitter;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
 import org.apache.mahout.math.Vector;
 
 import java.util.ArrayList;
@@ -33,6 +35,7 @@ public class TextValueEncoder extends Fe
 
   private final Splitter onNonWord = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
   private FeatureVectorEncoder wordEncoder;
+  private static final double LOG_2 = Math.log(2);
 
   public TextValueEncoder(String name) {
     super(name, 2);
@@ -47,20 +50,24 @@ public class TextValueEncoder extends Fe
    */
   @Override
   public void addToVector(String originalForm, double weight, Vector data) {
+    Multiset<String> counts = HashMultiset.create();
     for (String word : tokenize(originalForm)) {
-      wordEncoder.addToVector(word, weight, data);
+      counts.add(word);
+    }
+    for (String word : counts.elementSet()) {
+      wordEncoder.addToVector(word, weight * Math.log(1 + counts.count(word))/LOG_2, data);
     }
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
     return 0;
   }
 
-  protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+  protected Iterable<Integer> hashesForProbe(String originalForm, int dataSize, String name, int probe){
     List<Integer> hashes = new ArrayList<Integer>();
     for (String word : tokenize(originalForm)){
-      hashes.add(hashForProbe(word,data,name,i));
+      hashes.add(hashForProbe(word, dataSize, name, probe));
     }
     return hashes;
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -43,7 +43,7 @@ public abstract class WordValueEncoder e
     String name = getName();
     double weight = getWeight(originalForm,w);
     for (int i = 0; i < probes; i++) {
-      int n = hashForProbe(originalForm, data, name, i);
+      int n = hashForProbe(originalForm, data.size(), name, i);
       trace(originalForm, n);
       data.set(n, data.get(n) + weight);
     }
@@ -52,12 +52,12 @@ public abstract class WordValueEncoder e
 
   @Override
   protected double getWeight(String originalForm, double w) {
-    return w*weight(originalForm);    
+    return w * weight(originalForm);
   }
 
   @Override
-  protected int hashForProbe(String originalForm, Vector data, String name, int i) {
-    return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+  protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+    return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
   }
 
     /**