You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/08/30 21:34:10 UTC
svn commit: r990913 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/vectors/ test/java/org/apache/mahout/vectors/
Author: tdunning
Date: Mon Aug 30 19:34:10 2010
New Revision: 990913
URL: http://svn.apache.org/viewvc?rev=990913&view=rev
Log:
MAHOUT-492 - modified InteractionValueEncoder to be more generic, to allow interactions of any two FeatureVectorEncoders, and also to allow specifying a weight to apply to the interaction
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -44,7 +44,17 @@ public class AdaptiveWordValueEncoder ex
@Override
public void addToVector(String originalForm, double weight, Vector data) {
dictionary.add(originalForm);
- super.addToVector(originalForm, weight, data);
+ super.addToVector(originalForm, getWeight(originalForm, weight), data);
+ }
+
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return super.hashForProbe(originalForm, data, name, i);
+ }
+
+ @Override
+ protected double getWeight(String originalForm, double w) {
+ return w*weight(originalForm);
}
@Override
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -32,14 +32,24 @@ public class ConstantValueEncoder extend
int probes = getProbes();
String name = getName();
for (int i = 0; i < probes; i++) {
- int n = hash(name, i, data.size());
- trace(null, n);
- data.set(n, data.get(n) + weight);
+ int n = hashForProbe(originalForm, data, name, i);
+ trace(null, n);
+ data.set(n, data.get(n) + getWeight(originalForm,weight));
}
}
@Override
+ protected double getWeight(String originalForm, double w) {
+ return w;
+ }
+
+ @Override
public String asString(String originalForm) {
return getName();
}
+
+ protected int hashForProbe(String originalForm, Vector data, String name, int i){
+ return hash(name, i, data.size());
+ }
+
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -39,12 +39,22 @@ public class ContinuousValueEncoder exte
int probes = getProbes();
String name = getName();
for (int i = 0; i < probes; i++) {
- int n = hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+ int n = hashForProbe(originalForm, data, name, i);
trace(null, n);
- data.set(n, data.get(n) + weight * Double.parseDouble(originalForm));
+ data.set(n, data.get(n) + getWeight(originalForm,weight));
}
}
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+ }
+
+ @Override
+ protected double getWeight(String originalForm, double w) {
+ return w * Double.parseDouble(originalForm);
+ }
+
/**
* Converts a value into a form that would help a human understand the internals of how the value
* is being interpreted. For text-like things, this is likely to be a list of the terms found with
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Mon Aug 30 19:34:10 2010
@@ -21,6 +21,8 @@ import com.google.common.collect.Sets;
import org.apache.mahout.math.Vector;
import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -71,6 +73,19 @@ public abstract class FeatureVectorEncod
*/
public abstract void addToVector(String originalForm, double weight, Vector data);
+ protected abstract int hashForProbe(String originalForm, Vector data, String name, int i);
+
+ protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+ List<Integer> hashes = new ArrayList<Integer>();
+ hashes.add(hashForProbe(originalForm,data,name,i));
+ return hashes;
+ }
+
+
+ protected double getWeight(String originalFor, double w){
+ return 1.0;
+ }
+
// ******* Utility functions used by most implementations
/**
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -25,63 +25,77 @@ public class InteractionValueEncoder ext
protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
+ protected static FeatureVectorEncoder firstEncoder;
+ protected static FeatureVectorEncoder secondEncoder;
- public InteractionValueEncoder(String name) {
- super(name, 2);
- }
+ public InteractionValueEncoder(String name, FeatureVectorEncoder encoderOne, FeatureVectorEncoder encoderTwo) {
+ super(name, 2);
+ firstEncoder = encoderOne;
+ secondEncoder = encoderTwo;
+ }
/**
* Adds a value to a vector.
*
* @param originalForm The original form of the first value as a string.
- * @param data The vector to which the value should be added.
+ * @param data The vector to which the value should be added.
*/
@Override
public void addToVector(String originalForm, double w, Vector data) {
}
- /**
- * Adds a value to a vector.
- *
- * @param originalForm1 The original form of the first value as a string.
- * @param originalForm2 The original form of the second value as a string.
- * @param data The vector to which the value should be added.
- */
- public void addInteractionToVector(String originalForm1, String originalForm2, Vector data) {
- int probes = getProbes();
- String name = getName();
- for (int i = 0; i < probes; i++) {
- int h1 = hash1(name, originalForm1, i, data.size());
- int h2 = hash2(name, originalForm1, i, data.size());
- int j = hash1(name, originalForm2, i, data.size());
- int n = (h1 + (j+1)*h2) % data.size();
- if(n < 0){
- n = n+data.size();
- }
- trace(String.format("%s:%s", originalForm1, originalForm2), n);
- data.set(n, data.get(n) + 1);
- }
- }
+ /**
+ * Adds a value to a vector.
+ *
+ * @param originalForm1 The original form of the first value as a string.
+ * @param originalForm2 The original form of the second value as a string.
+ * @param data The vector to which the value should be added.
+ */
+ public void addInteractionToVector(String originalForm1, String originalForm2, double weight, Vector data) {
+ int probes = getProbes();
+ String name = getName();
+ double w = getWeight(originalForm1, originalForm2, weight);
+ for (int i = 0; i < probes; i++) {
+ int h1 = firstEncoder.hashForProbe(originalForm1, data, name, i);
+ int h2 = secondEncoder.hashForProbe(originalForm1, data, name, i);
+ int j = firstEncoder.hashForProbe(originalForm2, data, name, i);
+ int n = (h1 + (j+1)*h2) % data.size();
+ if(n < 0){
+ n = n+data.size();
+ }
+ trace(String.format("%s:%s", originalForm1, originalForm2), n);
+ data.set(n, data.get(n) + w);
+ }
+ }
+
+ protected double getWeight(String originalForm1, String originalForm2, double w) {
+ return firstEncoder.getWeight(originalForm1, 1.0) * secondEncoder.getWeight(originalForm2,1.0) * w;
+ }
/**
- * Converts a value into a form that would help a human understand the internals of how the
- * value is being interpreted. For text-like things, this is likely to be a list of the terms
- * found with associated weights (if any).
+ * Converts a value into a form that would help a human understand the internals of how the value
+ * is being interpreted. For text-like things, this is likely to be a list of the terms found with
+ * associated weights (if any).
*
* @param originalForm The original form of the value as a string.
* @return A string that a human can read.
*/
@Override
public String asString(String originalForm) {
- return String.format(Locale.ENGLISH, "%s:%s", getName(), originalForm);
+ return String.format("%s:%s", getName(), originalForm);
+ }
+
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return hash(name, i, data.size());
}
protected int hash1(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe + INTERACTION_VALUE_HASH_SEED_1, numFeatures);
+ return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_1,numFeatures);
}
protected int hash2(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe + INTERACTION_VALUE_HASH_SEED_2, numFeatures);
+ return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_2,numFeatures);
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -17,6 +17,8 @@
package org.apache.mahout.vectors;
+import org.apache.mahout.math.Vector;
+
import java.util.Collections;
import java.util.Map;
@@ -37,7 +39,12 @@ public class StaticWordValueEncoder exte
super(name);
}
- /**
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return hash(name, i, data.size());
+ }
+
+ /**
* Sets the weighting dictionary to be used by this encoder. Also sets
* the missing value weight to be half the smallest weight in the dictionary.
* @param dictionary The dictionary to use to look up weights.
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -20,6 +20,8 @@ package org.apache.mahout.vectors;
import com.google.common.base.Splitter;
import org.apache.mahout.math.Vector;
+import java.util.ArrayList;
+import java.util.List;
import java.util.regex.Pattern;
/**
@@ -50,7 +52,21 @@ public class TextValueEncoder extends Fe
}
}
- private Iterable<String> tokenize(CharSequence originalForm) {
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return 0;
+ }
+
+ protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+ List<Integer> hashes = new ArrayList<Integer>();
+ for (String word : tokenize(originalForm)){
+ hashes.add(hashForProbe(word,data,name,i));
+ }
+ return hashes;
+ }
+
+
+ private Iterable<String> tokenize(CharSequence originalForm) {
return onNonWord.split(originalForm);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java Mon Aug 30 19:34:10 2010
@@ -41,15 +41,25 @@ public abstract class WordValueEncoder e
public void addToVector(String originalForm, double w, Vector data) {
int probes = getProbes();
String name = getName();
- double weight = w * weight(originalForm);
+ double weight = getWeight(originalForm,w);
for (int i = 0; i < probes; i++) {
- int n = hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+ int n = hashForProbe(originalForm, data, name, i);
trace(originalForm, n);
data.set(n, data.get(n) + weight);
}
}
- /**
+ @Override
+ protected double getWeight(String originalForm, double w) {
+ return w*weight(originalForm);
+ }
+
+ @Override
+ protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+ }
+
+ /**
* Converts a value into a form that would help a human understand the internals of how the value
* is being interpreted. For text-like things, this is likely to be a list of the terms found with
* associated weights (if any).
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java?rev=990913&r1=990912&r2=990913&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectors/InteractionValueEncoderTest.java Mon Aug 30 19:34:10 2010
@@ -30,27 +30,40 @@ import static org.junit.Assert.assertEqu
public class InteractionValueEncoderTest {
@Test
public void testAddToVector() {
- InteractionValueEncoder enc = new InteractionValueEncoder("interactions");
+ WordValueEncoder wv = new StaticWordValueEncoder("word");
+ ContinuousValueEncoder cv = new ContinuousValueEncoder("cont");
+ InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv);
Vector v1 = new DenseVector(200);
- enc.addInteractionToVector("a","b",v1);
+ enc.addInteractionToVector("a","1.0",1.0, v1);
int k = enc.getProbes();
// should set k distinct locations to 1
Assert.assertEquals((float) k, v1.norm(1), 0);
Assert.assertEquals(1.0, v1.maxValue(), 0);
+
// adding same interaction again should increment weights
- enc.addInteractionToVector("a","b",v1);
+ enc.addInteractionToVector("a","1.0",1.0,v1);
Assert.assertEquals((float) k*2, v1.norm(1), 0);
Assert.assertEquals(2.0, v1.maxValue(), 0);
Vector v2 = new DenseVector(20000);
- StaticWordValueEncoder wordEncoder = new StaticWordValueEncoder("test");
- enc.addInteractionToVector("a","b",v2);
- wordEncoder.addToVector("a", v2);
- wordEncoder.addToVector("b", v2);
+ enc.addInteractionToVector("a","1.0",1.0,v2);
+ wv.addToVector("a", v2);
+ cv.addToVector("1.0", v2);
k = enc.getProbes();
- int j = wordEncoder.getProbes();
//this assumes no hash collision
- Assert.assertEquals((float) (k + 2*j), v2.norm(1), 0);
+ Assert.assertEquals((float) (k + wv.getProbes()+cv.getProbes()), v2.norm(1), 1e-3);
}
+ @Test
+ public void testaddToVectorUsesProductOfWeights(){
+ WordValueEncoder wv = new StaticWordValueEncoder("word");
+ ContinuousValueEncoder cv = new ContinuousValueEncoder("cont");
+ InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv);
+ Vector v1 = new DenseVector(200);
+ enc.addInteractionToVector("a","0.9",0.5, v1);
+ int k = enc.getProbes();
+ // should set k distinct locations to 0.9*0.5
+ Assert.assertEquals((float) k*0.5*0.9, v1.norm(1), 0);
+ Assert.assertEquals(0.5*0.9, v1.maxValue(), 0);
+ }
}