You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/08/30 21:34:17 UTC
svn commit: r990915 -
/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/
Author: tdunning
Date: Mon Aug 30 19:34:16 2010
New Revision: 990915
URL: http://svn.apache.org/viewvc?rev=990915&view=rev
Log:
MAHOUT-492 - Style update and remove quadratic scaling in text interactions
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/AdaptiveWordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -48,13 +48,13 @@ public class AdaptiveWordValueEncoder ex
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
- return super.hashForProbe(originalForm, data, name, i);
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+ return super.hashForProbe(originalForm, dataSize, name, probe);
}
@Override
protected double getWeight(String originalForm, double w) {
- return w*weight(originalForm);
+ return w * weight(originalForm);
}
@Override
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ConstantValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -32,7 +32,7 @@ public class ConstantValueEncoder extend
int probes = getProbes();
String name = getName();
for (int i = 0; i < probes; i++) {
- int n = hashForProbe(originalForm, data, name, i);
+ int n = hashForProbe(originalForm, data.size(), name, i);
trace(null, n);
data.set(n, data.get(n) + getWeight(originalForm,weight));
}
@@ -48,8 +48,8 @@ public class ConstantValueEncoder extend
return getName();
}
- protected int hashForProbe(String originalForm, Vector data, String name, int i){
- return hash(name, i, data.size());
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe){
+ return hash(name, probe, dataSize);
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/ContinuousValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -39,15 +39,15 @@ public class ContinuousValueEncoder exte
int probes = getProbes();
String name = getName();
for (int i = 0; i < probes; i++) {
- int n = hashForProbe(originalForm, data, name, i);
+ int n = hashForProbe(originalForm, data.size(), name, i);
trace(null, n);
data.set(n, data.get(n) + getWeight(originalForm,weight));
}
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
- return hash(name, CONTINUOUS_VALUE_HASH_SEED + i, data.size());
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+ return hash(name, CONTINUOUS_VALUE_HASH_SEED + probe, dataSize);
}
@Override
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Mon Aug 30 19:34:16 2010
@@ -21,7 +21,9 @@ import com.google.common.collect.Sets;
import org.apache.mahout.math.Vector;
import java.nio.charset.Charset;
-import java.util.*;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
/**
* General interface for objects that record features into a feature vector.
@@ -70,10 +72,33 @@ public abstract class FeatureVectorEncod
*/
public abstract void addToVector(String originalForm, double weight, Vector data);
- protected abstract int hashForProbe(String originalForm, Vector data, String name, int i);
-
- protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
- return Collections.singletonList(hashForProbe(originalForm,data,name,i));
+ /**
+ * Provides the unique hash for a particular probe. For all encoders except text, this
+ * is all that is needed and the default implementation of hashesForProbe will do the right
+ * thing. For text and similar values, hashesForProbe should be overridden and this method
+ * should not be used.
+ *
+ * @param originalForm The original string value
+ * @param dataSize The length of the vector being encoded
+ * @param name The name of the variable being encoded
+ * @param probe The probe number
+ * @return The hash of the current probe
+ */
+ protected abstract int hashForProbe(String originalForm, int dataSize, String name, int probe);
+
+ /**
+ * Returns all of the hashes for this probe. For most encoders, this is a singleton, but
+ * for text, many hashes are returned, one for each word (unique or not). Most implementations
+ * should only implement hashForProbe for simplicity.
+ *
+ * @param originalForm The original string value.
+ * @param dataSize The length of the vector being encoded
+ * @param name The name of the variable being encoded
+ * @param probe The probe number
+ * @return an Iterable of the hashes
+ */
+ protected Iterable<Integer> hashesForProbe(String originalForm, int dataSize, String name, int probe) {
+ return Collections.singletonList(hashForProbe(originalForm, dataSize, name, probe));
}
protected double getWeight(String originalForm, double w){
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -17,17 +17,11 @@
package org.apache.mahout.vectors;
-import org.apache.mahout.common.iterator.ArrayIterator;
import org.apache.mahout.math.Vector;
-import java.util.ArrayList;
-
public class InteractionValueEncoder extends FeatureVectorEncoder {
-
- protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
- protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
- protected static FeatureVectorEncoder firstEncoder;
- protected static FeatureVectorEncoder secondEncoder;
+ private static FeatureVectorEncoder firstEncoder;
+ private static FeatureVectorEncoder secondEncoder;
public InteractionValueEncoder(String name, FeatureVectorEncoder encoderOne, FeatureVectorEncoder encoderTwo) {
super(name, 2);
@@ -50,15 +44,17 @@ public class InteractionValueEncoder ext
*
* @param originalForm1 The original form of the first value as a string.
* @param originalForm2 The original form of the second value as a string.
+ * @param weight How much to weight this interaction
* @param data The vector to which the value should be added.
*/
public void addInteractionToVector(String originalForm1, String originalForm2, double weight, Vector data) {
String name = getName();
double w = getWeight(originalForm1, originalForm2, weight);
for (int i = 0; i < probes(); i++) {
- for(Integer k : firstEncoder.hashesForProbe(originalForm1, data, name, i)){
- for(Integer j : secondEncoder.hashesForProbe(originalForm2, data, name, i)){
- int n = linearDoubleHash(hash1(k,name,i,data),hash2(k,name,i,data),j,data.size());
+ Iterable<Integer> jValues = secondEncoder.hashesForProbe(originalForm2, data.size(), name, i);
+ for(Integer k : firstEncoder.hashesForProbe(originalForm1, data.size(), name, i)){
+ for(Integer j : jValues) {
+ int n = (k + j) % data.size();
trace(String.format("%s:%s", originalForm1, originalForm2), n);
data.set(n, data.get(n) + w);
}
@@ -74,14 +70,6 @@ public class InteractionValueEncoder ext
return firstEncoder.getWeight(originalForm1, 1.0) * secondEncoder.getWeight(originalForm2, 1.0) * w;
}
- private int linearDoubleHash(int h1, int h2, int j, int modulus){
- int n = (h1 + (j+1)*h2) % modulus;
- if(n < 0){
- n = n+modulus;
- }
- return n;
- }
-
/**
* Converts a value into a form that would help a human understand the internals of how the value
* is being interpreted. For text-like things, this is likely to be a list of the terms found with
@@ -96,16 +84,8 @@ public class InteractionValueEncoder ext
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
- return hash(name, i, data.size());
- }
-
- protected int hash1(int value, String name, int i, Vector data){
- return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_1, data.size());
- }
-
- protected int hash2(int value, String name, int i, Vector data){
- return hash(name, i+value+INTERACTION_VALUE_HASH_SEED_2, data.size());
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+ return hash(name, probe, dataSize);
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/StaticWordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -17,8 +17,6 @@
package org.apache.mahout.vectors;
-import org.apache.mahout.math.Vector;
-
import java.util.Collections;
import java.util.Map;
@@ -40,8 +38,8 @@ public class StaticWordValueEncoder exte
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
- return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+ return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
}
/**
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -18,6 +18,8 @@
package org.apache.mahout.vectors;
import com.google.common.base.Splitter;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
import org.apache.mahout.math.Vector;
import java.util.ArrayList;
@@ -33,6 +35,7 @@ public class TextValueEncoder extends Fe
private final Splitter onNonWord = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
private FeatureVectorEncoder wordEncoder;
+ private static final double LOG_2 = Math.log(2);
public TextValueEncoder(String name) {
super(name, 2);
@@ -47,20 +50,24 @@ public class TextValueEncoder extends Fe
*/
@Override
public void addToVector(String originalForm, double weight, Vector data) {
+ Multiset<String> counts = HashMultiset.create();
for (String word : tokenize(originalForm)) {
- wordEncoder.addToVector(word, weight, data);
+ counts.add(word);
+ }
+ for (String word : counts.elementSet()) {
+ wordEncoder.addToVector(word, weight * Math.log(1 + counts.count(word))/LOG_2, data);
}
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
return 0;
}
- protected Iterable<Integer> hashesForProbe(String originalForm, Vector data, String name, int i){
+ protected Iterable<Integer> hashesForProbe(String originalForm, int dataSize, String name, int probe){
List<Integer> hashes = new ArrayList<Integer>();
for (String word : tokenize(originalForm)){
- hashes.add(hashForProbe(word,data,name,i));
+ hashes.add(hashForProbe(word, dataSize, name, probe));
}
return hashes;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java?rev=990915&r1=990914&r2=990915&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/WordValueEncoder.java Mon Aug 30 19:34:16 2010
@@ -43,7 +43,7 @@ public abstract class WordValueEncoder e
String name = getName();
double weight = getWeight(originalForm,w);
for (int i = 0; i < probes; i++) {
- int n = hashForProbe(originalForm, data, name, i);
+ int n = hashForProbe(originalForm, data.size(), name, i);
trace(originalForm, n);
data.set(n, data.get(n) + weight);
}
@@ -52,12 +52,12 @@ public abstract class WordValueEncoder e
@Override
protected double getWeight(String originalForm, double w) {
- return w*weight(originalForm);
+ return w * weight(originalForm);
}
@Override
- protected int hashForProbe(String originalForm, Vector data, String name, int i) {
- return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + i, data.size());
+ protected int hashForProbe(String originalForm, int dataSize, String name, int probe) {
+ return hash(name, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
}
/**