You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2011/09/09 09:45:17 UTC

svn commit: r1167027 [2/2] - in /mahout/trunk: ./ core/src/main/java/org/apache/mahout/cf/taste/hadoop/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ core/src/main/java/org/...

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import com.google.common.primitives.Doubles;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.common.TopK;
+import org.apache.mahout.common.iterator.FixedSizeSamplingIterator;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenDoubleIntHashMap;
+import org.apache.mahout.math.map.OpenIntDoubleHashMap;
+import org.apache.mahout.math.map.OpenIntIntHashMap;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.Iterator;
+
+public class Vectors {
+
+  private Vectors() {}
+
+  /** Returns original itself if it has at most sampleSize entries, else a like() vector filled by sampling. */
+  public static Vector maybeSample(Vector original, int sampleSize) {
+    if (original.getNumNondefaultElements() > sampleSize) {
+      Vector sampled = original.like();
+      for (Iterator<Vector.Element> picked = new FixedSizeSamplingIterator<Vector.Element>(sampleSize,
+          original.iterateNonZero()); picked.hasNext();) {
+        Vector.Element entry = picked.next();
+        sampled.setQuick(entry.index(), entry.get());
+      }
+      return sampled;
+    }
+    return original;
+  }
+
+  /** Returns original if it has at most k entries, else a like() vector of what a TopK(k, BY_VALUE) queue retains. */
+  public static Vector topKElements(int k, Vector original) {
+    if (original.getNumNondefaultElements() <= k) {
+      return original;
+    }
+    TopK<Vector.Element> best = new TopK<Vector.Element>(k, BY_VALUE);
+    // every non-zero element is offered as a stand-alone TemporaryElement copy
+    for (Iterator<Vector.Element> it = original.iterateNonZero(); it.hasNext();) {
+      best.offer(new TemporaryElement(it.next()));
+    }
+    Vector result = original.like();
+    for (Vector.Element kept : best.retrieve()) {
+      result.setQuick(kept.index(), kept.get());
+    }
+    return result;
+  }
+
+  /** Copies the non-zero entries of all later vectors into the first vector, which is modified and returned. */
+  public static Vector merge(Iterable<VectorWritable> partialVectors) {
+    Iterator<VectorWritable> parts = partialVectors.iterator();
+    Vector merged = parts.next().get();
+    while (parts.hasNext()) {
+      VectorWritable part = parts.next();
+      if (part != null) {
+        for (Iterator<Vector.Element> it = part.get().iterateNonZero(); it.hasNext();) {
+          Vector.Element entry = it.next();
+          merged.setQuick(entry.index(), entry.get());
+        }
+      }
+    }
+    return merged;
+  }
+
+  static final Comparator<Vector.Element> BY_VALUE = new Comparator<Vector.Element>() {
+    @Override
+    public int compare(Vector.Element left, Vector.Element right) {
+      return Double.compare(left.get(), right.get()); // ascending by value; the index plays no role
+    }
+  };
+
+  /** Stand-alone Vector.Element that stores its own index and value instead of referring to a vector. */
+  static class TemporaryElement implements Vector.Element {
+    private int index;
+    private double value;
+
+    TemporaryElement(Vector.Element toClone) {
+      this(toClone.index(), toClone.get());
+    }
+
+    TemporaryElement(int index, double value) {
+      this.index = index;
+      this.value = value;
+    }
+
+    @Override
+    public int index() {
+      return index;
+    }
+
+    @Override
+    public double get() {
+      return value;
+    }
+
+    @Override
+    public void set(double value) {
+      this.value = value;
+    }
+  }
+
+  /** Copies each non-zero entry into a TemporaryElement; the array length is getNumNondefaultElements(). */
+  public static Vector.Element[] toArray(VectorWritable vectorWritable) {
+    Vector vector = vectorWritable.get();
+    Vector.Element[] copies = new Vector.Element[vector.getNumNondefaultElements()];
+    int next = 0;
+    for (Iterator<Vector.Element> it = vector.iterateNonZero(); it.hasNext();) {
+      copies[next++] = new TemporaryElement(it.next());
+    }
+    return copies;
+  }
+
+  public static void write(Vector vector, Path path, Configuration conf) throws IOException {
+    write(vector, path, conf, false);
+  }
+
+  public static void write(Vector vector, Path path, Configuration conf, boolean laxPrecision) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), conf);
+    FSDataOutputStream out = fs.create(path);
+    try {
+      VectorWritable vectorWritable = new VectorWritable(vector);
+      vectorWritable.setWritesLaxPrecision(laxPrecision);
+      vectorWritable.write(out);
+    } finally {
+      Closeables.closeQuietly(out);
+    }
+  }
+
+  /** Opens path on its FileSystem and decodes it via readAsIntMap(DataInput); the stream is always closed. */
+  public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException {
+    FSDataInputStream stream = FileSystem.get(path.toUri(), conf).open(path);
+    try {
+      return readAsIntMap(stream);
+    } finally {
+      Closeables.closeQuietly(stream);
+    }
+  }
+
+  /** Ugly optimization for loading sparse vectors containing ints only: decodes the serialized form into a map. */
+  public static OpenIntIntHashMap readAsIntMap(DataInput in) throws IOException {
+    int flags = in.readByte();
+    // Guava's Preconditions substitutes only %s placeholders, so the binary flag string is passed to %s
+    Preconditions.checkArgument(flags >> VectorWritable.NUM_FLAGS == 0, "Unknown flags set: %s", Integer.toString(flags, 2));
+    boolean dense = (flags & VectorWritable.FLAG_DENSE) != 0;
+    boolean sequential = (flags & VectorWritable.FLAG_SEQUENTIAL) != 0;
+    boolean laxPrecision = (flags & VectorWritable.FLAG_LAX_PRECISION) != 0;
+    Preconditions.checkState(!dense && !sequential, "Only for reading sparse vectors!");
+    // this varint is consumed and dropped (presumably the vector's size; confirm against VectorWritable)
+    Varint.readUnsignedVarInt(in);
+    OpenIntIntHashMap values = new OpenIntIntHashMap();
+    int numNonDefaultElements = Varint.readUnsignedVarInt(in);
+    for (int i = 0; i < numNonDefaultElements; i++) {
+      int index = Varint.readUnsignedVarInt(in);
+      double value = laxPrecision ? in.readFloat() : in.readDouble();
+      values.put(index, (int) value); // the cast drops any fractional part
+    }
+    return values;
+  }
+
+  /** Opens path on its FileSystem and returns VectorWritable.readVector() of it; the stream is always closed. */
+  public static Vector read(Path path, Configuration conf) throws IOException {
+    FSDataInputStream stream = FileSystem.get(path.toUri(), conf).open(path);
+    try {
+      return VectorWritable.readVector(stream);
+    } finally {
+      Closeables.closeQuietly(stream);
+    }
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/** Count-based measure whose similarity is 1 / (1 + normA + normB - 2 * dots). */
+public class CityBlockSimilarity extends CountbasedMeasure {
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true; // this measure never rules out a pair up front
+  }
+
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return 1.0 / (1.0 + normA + normB - 2 * dots);
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/** Count-based measure that reports the summed aggregations (dots) unchanged as the similarity. */
+public class CooccurrenceCountSimilarity extends CountbasedMeasure {
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return numNonZeroEntriesA >= treshold && numNonZeroEntriesB >= treshold; // both counts must reach treshold
+  }
+
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return dots;
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/** Measure that normalizes vectors via Vector.normalize(), multiplies values and returns dots as they are. */
+public class CosineSimilarity implements VectorSimilarityMeasure {
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    return valueA * nonZeroValueB;
+  }
+
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return numNonZeroEntriesB >= treshold / maxValueA;
+  }
+
+  @Override
+  public double norm(Vector vector) {
+    return NO_NORM; // constant inherited from VectorSimilarityMeasure; no norm is computed here
+  }
+
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector.normalize();
+  }
+
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return dots;
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/** Base for count-based measures: each aggregate is 1, vectors stay as they are, the norm is norm(0). */
+public abstract class CountbasedMeasure implements VectorSimilarityMeasure {
+
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    return 1; // the actual values are ignored
+  }
+
+  @Override
+  public double norm(Vector vector) {
+    return vector.norm(0); // presumably the number of non-zero entries; confirm against Vector.norm()
+  }
+
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector;
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.Iterator;
+
+/** Measure derived from the Euclidean distance sqrt(normA - 2 * dots + normB) between two vectors. */
+public class EuclideanDistanceSimilarity implements VectorSimilarityMeasure {
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector;
+  }
+
+  @Override
+  public double norm(Vector vector) {
+    double norm = 0;
+    for (Iterator<Vector.Element> nonZeroElements = vector.iterateNonZero(); nonZeroElements.hasNext();) {
+      double value = nonZeroElements.next().get();
+      norm += value * value;
+    }
+    return norm; // sum of the squared entries, i.e. the square of the Euclidean norm
+  }
+
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    return valueA * nonZeroValueB;
+  }
+
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    // rounding can push the argument slightly below zero; clamp it so that sqrt() cannot return NaN
+    double euclideanDistance = Math.sqrt(Math.max(0.0, normA - 2 * dots + normB));
+    return 1.0 - 1.0 / (1.0 + euclideanDistance); // NOTE(review): yields 0 for identical vectors; confirm intended
+  }
+
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true;
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.stats.LogLikelihood;
+
+/** Count-based measure that turns a log-likelihood ratio llr into 1 - 1 / (1 + llr). */
+public class LoglikelihoodSimilarity extends CountbasedMeasure {
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true;
+  }
+
+  @Override
+  public double similarity(double summedAggregations, double normA, double normB, int numberOfColumns) {
+    double llr = LogLikelihood.logLikelihoodRatio(
+        (long) summedAggregations, (long) (normB - summedAggregations), (long) (normA - summedAggregations),
+        (long) (numberOfColumns - normA - normB + summedAggregations));
+    return 1.0 - 1.0 / (1.0 + llr);
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.Iterator;
+
+public class PearsonCorrelationSimilarity extends CosineSimilarity {
+
+  @Override
+  public Vector normalize(Vector vector) {
+    if (vector.getNumNondefaultElements() == 0) {
+      return vector;
+    }
+    // center non-zero elements
+    final double average = vector.norm(1) / vector.getNumNondefaultElements();
+    Iterator<Vector.Element> nonZeroElements = vector.iterateNonZero();
+    while (nonZeroElements.hasNext()) {
+      Vector.Element nonZeroElement = nonZeroElements.next();
+      vector.setQuick(nonZeroElement.index(), nonZeroElement.get() - average);
+    }
+    return super.normalize(vector);
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/** Count-based measure computing dots / (normA + normB - dots). */
+public class TanimotoCoefficientSimilarity extends CountbasedMeasure {
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return numNonZeroEntriesA >= numNonZeroEntriesB * treshold;
+  }
+
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return dots / (normA + normB - dots);
+  }
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/** Contract of a vector similarity measure: normalize, norm, aggregate, similarity and consider. */
+public interface VectorSimilarityMeasure {
+  double NO_NORM = 0; // what norm() returns when a measure computes no norm (see CosineSimilarity)
+
+  Vector normalize(Vector vector);
+  double norm(Vector vector);
+  double aggregate(double nonZeroValueA, double nonZeroValueB);
+  double similarity(double summedAggregations, double normA, double normB, int numberOfColumns);
+  boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold);
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import java.util.Arrays;
+
+/** Names the available similarity measures and the class implementing each of them. */
+public enum VectorSimilarityMeasures {
+
+  SIMILARITY_COOCCURRENCE(CooccurrenceCountSimilarity.class),
+  SIMILARITY_LOGLIKELIHOOD(LoglikelihoodSimilarity.class),
+  SIMILARITY_TANIMOTO_COEFFICIENT(TanimotoCoefficientSimilarity.class),
+  SIMILARITY_CITY_BLOCK(CityBlockSimilarity.class),
+  SIMILARITY_COSINE(CosineSimilarity.class),
+  SIMILARITY_PEARSON_CORRELATION(PearsonCorrelationSimilarity.class),
+  SIMILARITY_EUCLIDEAN_DISTANCE(EuclideanDistanceSimilarity.class);
+
+  private final Class<? extends VectorSimilarityMeasure> implementingClass;
+
+  VectorSimilarityMeasures(Class<? extends VectorSimilarityMeasure> implementingClass) {
+    this.implementingClass = implementingClass;
+  }
+
+  public static String list() {
+    return Arrays.toString(values());
+  }
+
+  public String getClassname() {
+    return implementingClass.getName();
+  }
+}

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java Fri Sep  9 07:45:16 2011
@@ -251,10 +251,10 @@ public class ParallelALSFactorizationJob
     alsFactorization.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--lambda", String.valueOf(lambda),
         "--numFeatures", String.valueOf(numFeatures), "--numIterations", String.valueOf(numIterations) });
 
-    Matrix u = MathHelper.readEntries(conf, new Path(outputDir.getAbsolutePath(), "U/part-r-00000"),
+    Matrix u = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "U/part-r-00000"),
         preferences.numRows(), numFeatures);
-    Matrix m = MathHelper.readEntries(conf, new Path(outputDir.getAbsolutePath(), "M/part-r-00000"),
-      preferences.numCols(), numFeatures);
+    Matrix m = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "M/part-r-00000"),
+        preferences.numCols(), numFeatures);
 
     RunningAverage avg = new FullRunningAverage();
     sliceIterator = preferences.iterateAll();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java Fri Sep  9 07:45:16 2011
@@ -43,15 +43,15 @@ public class PredictionJobTest extends T
     Path inputPath = new Path(pairs.getAbsolutePath());
     FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
 
-    MathHelper.writeEntries(new double[][]{
-        new double[] {  1.5, -2,    0.3 },
-        new double[] { -0.7,  2,    0.6 },
-        new double[] { -1,    2.5,  3   } }, fs, conf, new Path(userFeatures.getAbsolutePath()));
-
-    MathHelper.writeEntries(new double [][] {
-        new double[] {  2.3,  0.5, 0   },
-        new double[] {  4.7, -1,   0.2 },
-        new double[] {  0.6,  2,   1.3 } }, fs, conf, new Path(itemFeatures.getAbsolutePath()));
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[]{1.5, -2, 0.3},
+        new double[]{-0.7, 2, 0.6},
+        new double[]{-1, 2.5, 3}}, fs, conf, new Path(userFeatures.getAbsolutePath()));
+
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[]{2.3, 0.5, 0},
+        new double[]{4.7, -1, 0.2},
+        new double[]{0.6, 2, 1.3}}, fs, conf, new Path(itemFeatures.getAbsolutePath()));
 
     writeLines(pairs, "0,0", "2,1", "1,0");
 

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java Fri Sep  9 07:45:16 2011
@@ -49,8 +49,8 @@ import org.apache.mahout.math.VarLongWri
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.hadoop.MathHelper;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedCooccurrenceVectorSimilarity;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CooccurrenceCountSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
 import org.apache.mahout.math.map.OpenIntLongHashMap;
 import org.easymock.IArgumentMatcher;
 import org.easymock.EasyMock;
@@ -131,7 +131,7 @@ public class RecommenderJobTest extends 
   }
 
   /**
-   * tests {@link ToUserVectorReducer}
+   * tests {@link ToUserVectorsReducer}
    */
   @Test
   public void testToUserVectorReducer() throws Exception {
@@ -139,7 +139,7 @@ public class RecommenderJobTest extends 
       EasyMock.createMock(Reducer.Context.class);
     Counter userCounters = EasyMock.createMock(Counter.class);
 
-    EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+    EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
     userCounters.increment(1);
     context.write(EasyMock.eq(new VarLongWritable(12L)), MathHelper.vectorMatches(
         MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 2.0)));
@@ -150,13 +150,13 @@ public class RecommenderJobTest extends 
     varLongWritables.add(new EntityPrefWritable(34L, 1.0f));
     varLongWritables.add(new EntityPrefWritable(56L, 2.0f));
 
-    new ToUserVectorReducer().reduce(new VarLongWritable(12L), varLongWritables, context);
+    new ToUserVectorsReducer().reduce(new VarLongWritable(12L), varLongWritables, context);
 
     EasyMock.verify(context, userCounters);
   }
 
   /**
-   * tests {@link ToUserVectorReducer} using boolean data
+   * tests {@link ToUserVectorsReducer} using boolean data
    */
   @Test
   public void testToUserVectorReducerWithBooleanData() throws Exception {
@@ -164,14 +164,14 @@ public class RecommenderJobTest extends 
       EasyMock.createMock(Reducer.Context.class);
     Counter userCounters = EasyMock.createMock(Counter.class);
 
-    EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+    EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
     userCounters.increment(1);
     context.write(EasyMock.eq(new VarLongWritable(12L)), MathHelper.vectorMatches(
         MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 1.0)));
 
     EasyMock.replay(context, userCounters);
 
-    new ToUserVectorReducer().reduce(new VarLongWritable(12L), Arrays.asList(new VarLongWritable(34L),
+    new ToUserVectorsReducer().reduce(new VarLongWritable(12L), Arrays.asList(new VarLongWritable(34L),
         new VarLongWritable(56L)), context);
 
     EasyMock.verify(context, userCounters);
@@ -728,7 +728,7 @@ public class RecommenderJobTest extends 
     recommenderJob.setConf(conf);
 
     recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-       DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--numRecommendations", "4" });
+       TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "4" });
 
     Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
 
@@ -804,7 +804,7 @@ public class RecommenderJobTest extends 
     recommenderJob.setConf(conf);
 
     recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-        DistributedCooccurrenceVectorSimilarity.class.getName(), "--booleanData", "true",
+        CooccurrenceCountSimilarity.class.getName(), "--booleanData", "true",
         "--usersFile", usersFile.getAbsolutePath() });
 
     Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
@@ -865,7 +865,7 @@ public class RecommenderJobTest extends 
      recommenderJob.setConf(conf);
 
      recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-        DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--numRecommendations", "1",
+        TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "1",
         "--usersFile", userFile.getAbsolutePath(), "--filterFile", filterFile.getAbsolutePath() });
 
      Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));

Copied: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java (from r1164967, mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java)
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java&r1=1164967&r2=1167027&rev=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java Fri Sep  9 07:45:16 2011
@@ -30,16 +30,16 @@ import org.junit.Test;
 import java.util.Arrays;
 
 /**
- * tests {@link org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer}
+ * tests {@link ToUserVectorsReducer}
  */
-public class ToUserVectorReducerTest extends TasteTestCase {
+public class ToUserVectorsReducerTest extends TasteTestCase {
 
   @Test
   public void testToUsersReducerMinPreferencesUserIgnored() throws Exception {
     Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
         EasyMock.createMock(Reducer.Context.class);
 
-    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    ToUserVectorsReducer reducer = new ToUserVectorsReducer();
     setField(reducer, "minPreferences", 2);
 
     EasyMock.replay(context);
@@ -55,10 +55,10 @@ public class ToUserVectorReducerTest ext
         EasyMock.createMock(Reducer.Context.class);
     Counter userCounters = EasyMock.createMock(Counter.class);
 
-    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    ToUserVectorsReducer reducer = new ToUserVectorsReducer();
     setField(reducer, "minPreferences", 2);
 
-    EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+    EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
     userCounters.increment(1);
     context.write(EasyMock.eq(new VarLongWritable(123)), MathHelper.vectorMatches(
         MathHelper.elem(TasteHadoopUtils.idToIndex(456L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(789L), 1.0)));

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java Fri Sep  9 07:45:16 2011
@@ -21,27 +21,22 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FilenameFilter;
 import java.util.Arrays;
-import java.util.List;
 
 import com.google.common.base.Charsets;
 import com.google.common.io.Files;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
 import org.apache.mahout.cf.taste.impl.TasteTestCase;
 import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.VarIntWritable;
-import org.apache.mahout.math.VarLongWritable;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredZeroAssumingCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CosineSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
 import org.apache.mahout.math.map.OpenIntLongHashMap;
-import org.easymock.IArgumentMatcher;
 import org.easymock.EasyMock;
 import org.junit.Test;
 
@@ -138,7 +133,7 @@ public final class ItemSimilarityJobTest
     similarityJob.setConf(conf);
 
     similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-       DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName() });
+       CosineSimilarity.class.getName() });
 
     File outPart = outputDir.listFiles(new FilenameFilter() {
       @Override
@@ -234,7 +229,7 @@ public final class ItemSimilarityJobTest
     similarityJob.setConf(conf);
 
     similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-        DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });
+        TanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });
 
     File outPart = outputDir.listFiles(new FilenameFilter() {
       @Override

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java Fri Sep  9 07:45:16 2011
@@ -100,7 +100,7 @@ public class PageRankJobTest extends Gra
         { 0.266666667, 0,   0.8, 0.4 },
         { 0.266666667, 0.4, 0,   0 } });
 
-    Matrix actualTransitionMatrix = MathHelper.readEntries(conf, new Path(tempDir.getAbsolutePath(),
+    Matrix actualTransitionMatrix = MathHelper.readMatrix(conf, new Path(tempDir.getAbsolutePath(),
         "transitionMatrix/part-r-00000"), numVertices, numVertices);
 
     assertEquals(expectedTransitionMatrix, actualTransitionMatrix);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java Fri Sep  9 07:45:16 2011
@@ -168,10 +168,10 @@ public final class MathHelper {
   /**
    * read a {@link Matrix} from a SequenceFile<IntWritable,VectorWritable>
    */
-  public static Matrix readEntries(Configuration conf, Path path, int rows, int columns) {
+  public static Matrix readMatrix(Configuration conf, Path path, int rows, int columns) {
     Matrix matrix = new DenseMatrix(rows, columns);
     for (Pair<IntWritable,VectorWritable> record :
-         new SequenceFileIterable<IntWritable,VectorWritable>(path, true, conf)) {
+        new SequenceFileIterable<IntWritable,VectorWritable>(path, true, conf)) {
       IntWritable key = record.getFirst();
       VectorWritable value = record.getSecond();
       int row = key.get();
@@ -187,7 +187,7 @@ public final class MathHelper {
   /**
    * write a two-dimensional double array to an SequenceFile<IntWritable,VectorWritable>
    */
-  public static void writeEntries(double[][] entries, FileSystem fs, Configuration conf, Path path)
+  public static void writeDistributedRowMatrix(double[][] entries, FileSystem fs, Configuration conf, Path path)
       throws IOException {
     SequenceFile.Writer writer = null;
     try {

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.hadoop.MathHelper;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
+import org.junit.Test;
+
+import java.io.File;
+
+public class RowSimilarityJobTest extends MahoutTestCase {
+
+  /**
+   * integration test with a tiny data set
+   *
+   * <pre>
+   *
+   * input matrix:
+   *
+   * 1, 0, 1, 1, 0
+   * 0, 0, 1, 1, 0
+   * 0, 0, 0, 0, 1
+   *
+   * similarity matrix (via tanimoto):
+   *
+   * 1,     0.666, 0
+   * 0.666, 1,     0
+   * 0,     0,     1
+   * </pre>
+   */
+  @Test
+  public void toyIntegration() throws Exception {
+
+    File inputFile = getTestTempFile("rows");
+    File outputDir = getTestTempDir("output");
+    outputDir.delete();
+    File tmpDir = getTestTempDir("tmp");
+
+    Configuration conf = new Configuration();
+    Path inputPath = new Path(inputFile.getAbsolutePath());
+    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+    MathHelper.writeDistributedRowMatrix(new double[][] {
+        new double[] { 1, 0, 1, 1, 0 },
+        new double[] { 0, 0, 1, 1, 0 },
+        new double[] { 0, 0, 0, 0, 1 } },
+        fs, conf, inputPath);
+
+    RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+    rowSimilarityJob.setConf(conf);
+    rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+        "--numberOfColumns", String.valueOf(5), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+        "--tempDir", tmpDir.getAbsolutePath() });
+
+    Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+    assertNotNull(similarityMatrix);
+    assertEquals(3, similarityMatrix.numCols());
+    assertEquals(3, similarityMatrix.numRows());
+
+    assertEquals(1.0, similarityMatrix.get(0, 0), EPSILON);
+    assertEquals(1.0, similarityMatrix.get(1, 1), EPSILON);
+    assertEquals(1.0, similarityMatrix.get(2, 2), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 0), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+    assertEquals(0.666666, similarityMatrix.get(0, 1), EPSILON);
+    assertEquals(0.666666, similarityMatrix.get(1, 0), EPSILON);
+  }
+
+  @Test
+  public void toyIntegrationMaxSimilaritiesPerRow() throws Exception {
+
+    File inputFile = getTestTempFile("rows");
+    File outputDir = getTestTempDir("output");
+    outputDir.delete();
+    File tmpDir = getTestTempDir("tmp");
+
+    Configuration conf = new Configuration();
+    Path inputPath = new Path(inputFile.getAbsolutePath());
+    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[] { 1, 0, 1, 1, 0, 1 },
+        new double[] { 0, 1, 1, 1, 1, 1 },
+        new double[] { 1, 1, 0, 1, 0, 0 } },
+        fs, conf, inputPath);
+
+    RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+    rowSimilarityJob.setConf(conf);
+    rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+        "--numberOfColumns", String.valueOf(6), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+        "--maxSimilaritiesPerRow", String.valueOf(1), "--excludeSelfSimilarity", String.valueOf(true),
+        "--tempDir", tmpDir.getAbsolutePath() });
+
+    Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+    assertNotNull(similarityMatrix);
+    assertEquals(3, similarityMatrix.numCols());
+    assertEquals(3, similarityMatrix.numRows());
+
+    assertEquals(0.0, similarityMatrix.get(0, 0), EPSILON);
+    assertEquals(0.5, similarityMatrix.get(0, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+
+    assertEquals(0.5, similarityMatrix.get(1, 0), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(1, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+
+    assertEquals(0.4, similarityMatrix.get(2, 0), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
+  }
+
+  @Test
+  public void toyIntegrationWithThreshold() throws Exception {
+
+
+    File inputFile = getTestTempFile("rows");
+    File outputDir = getTestTempDir("output");
+    outputDir.delete();
+    File tmpDir = getTestTempDir("tmp");
+
+    Configuration conf = new Configuration();
+    Path inputPath = new Path(inputFile.getAbsolutePath());
+    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[] { 1, 0, 1, 1, 0, 1 },
+        new double[] { 0, 1, 1, 1, 1, 1 },
+        new double[] { 1, 1, 0, 1, 0, 0 } },
+        fs, conf, inputPath);
+
+    RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+    rowSimilarityJob.setConf(conf);
+    rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+        "--numberOfColumns", String.valueOf(6), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+        "--excludeSelfSimilarity", String.valueOf(true), "--threshold", String.valueOf(0.5),
+        "--tempDir", tmpDir.getAbsolutePath() });
+
+    Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+    assertNotNull(similarityMatrix);
+    assertEquals(3, similarityMatrix.numCols());
+    assertEquals(3, similarityMatrix.numRows());
+
+    assertEquals(0.0, similarityMatrix.get(0, 0), EPSILON);
+    assertEquals(0.5, similarityMatrix.get(0, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+
+    assertEquals(0.5, similarityMatrix.get(1, 0), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(1, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+
+    assertEquals(0.0, similarityMatrix.get(2, 0), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+    assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
+  }
+
+}

Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java Fri Sep  9 07:45:16 2011
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+public class VectorSimilarityMeasuresTest extends MahoutTestCase {
+
+  double distributedSimilarity(double[] one, double[] two, Class<? extends VectorSimilarityMeasure> similarityMeasureClass) {
+    VectorSimilarityMeasure similarityMeasure = ClassUtils.instantiateAs(similarityMeasureClass,
+        VectorSimilarityMeasure.class);
+    Vector oneNormalized = similarityMeasure.normalize(asSparseVector(one));
+    Vector twoNormalized = similarityMeasure.normalize(asSparseVector(two));
+
+    double normOne = similarityMeasure.norm(oneNormalized);
+    double normTwo = similarityMeasure.norm(twoNormalized);
+
+    double dot = 0;
+    for (int n = 0; n < one.length; n++) {
+      if (oneNormalized.get(n) != 0 && twoNormalized.get(n) != 0) {
+        dot += similarityMeasure.aggregate(oneNormalized.get(n), twoNormalized.get(n));
+      }
+    }
+    return similarityMeasure.similarity(dot, normOne, normTwo, one.length);
+  }
+
+  Vector asSparseVector(double[] values) {
+    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
+    for (int dim = 0; dim < values.length; dim++) {
+      if (values[dim] != 0) {
+        vector.setQuick(dim, values[dim]);
+      }
+    }
+    return vector;
+  }
+
+  @Test
+  public void testCooccurrenceCountSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+        new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CooccurrenceCountSimilarity.class);
+
+    assertEquals(5d, similarity, 0);
+  }
+
+  @Test
+  public void testTanimotoCoefficientSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+        new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, TanimotoCoefficientSimilarity.class);
+
+    assertEquals(0.454545455, similarity, EPSILON);
+  }
+
+  @Test
+  public void testCityblockSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+        new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CityBlockSimilarity.class);
+
+    assertEquals(0.142857143, similarity, EPSILON);
+  }
+
+  @Test
+  public void testLoglikelihoodSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+        new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, LoglikelihoodSimilarity.class);
+
+    assertEquals(0.03320155369284261, similarity, EPSILON);
+  }
+
+  @Test
+  public void testCosineSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 2, 2, 0 },
+        new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 1, 1 }, CosineSimilarity.class);
+
+    assertEquals(0.769846046, similarity, EPSILON);
+  }
+
+  @Test
+  public void testPearsonCorrelationSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
+        new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 3 }, PearsonCorrelationSimilarity.class);
+
+    assertEquals(0.5303300858899108, similarity, EPSILON);
+  }
+
+  @Test
+  public void testEuclideanDistanceSimilarity() {
+    double similarity = distributedSimilarity(
+        new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
+        new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 4 }, EuclideanDistanceSimilarity.class);
+
+    assertEquals(0.887311346, similarity, EPSILON);
+  }
+}

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java Fri Sep  9 07:45:16 2011
@@ -46,15 +46,15 @@ public class ParallelFactorizationEvalua
     Path inputPath = new Path(pairs.getAbsolutePath());
     FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
 
-    MathHelper.writeEntries(new double[][] {
-        new double[] {  1.5, -2,   0.3 },
-        new double[] { -0.7,  2,   0.6 },
-        new double[] { -1,    2.5, 3   } }, fs, conf, new Path(userFeatures.getAbsolutePath()));
-
-    MathHelper.writeEntries(new double [][] {
-        new double[] {  2.3,  0.5, 0   },
-        new double[] {  4.7, -1,   0.2 },
-        new double[] {  0.6,  2,   1.3 } }, fs, conf, new Path(itemFeatures.getAbsolutePath()));
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[]{1.5, -2, 0.3},
+        new double[]{-0.7, 2, 0.6},
+        new double[]{-1, 2.5, 3}}, fs, conf, new Path(userFeatures.getAbsolutePath()));
+
+    MathHelper.writeDistributedRowMatrix(new double[][]{
+        new double[]{2.3, 0.5, 0},
+        new double[]{4.7, -1, 0.2},
+        new double[]{0.6, 2, 1.3}}, fs, conf, new Path(itemFeatures.getAbsolutePath()));
 
     writeLines(pairs, "0,0,3", "2,1,-7", "1,0,-2");
 
@@ -74,4 +74,4 @@ public class ParallelFactorizationEvalua
     }
 
   }
-}
\ No newline at end of file
+}

Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java Fri Sep  9 07:45:16 2011
@@ -20,6 +20,7 @@ package org.apache.mahout.math;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.function.DoubleDoubleFunction;
 import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
 
 import java.util.Iterator;
 
@@ -515,8 +516,18 @@ public abstract class AbstractVector imp
     if (size != other.size()) {
       throw new CardinalityException(size, other.size());
     }
-    for (int i = 0; i < size; i++) {
-      setQuick(i, function.apply(getQuick(i), other.getQuick(i)));
+
+    /* special case: adding zero is a no-op, so for PLUS only the non-zero elements of other need to be visited */
+    if (Functions.PLUS.equals(function)) {
+      Iterator<Vector.Element> nonZeroElements = other.iterateNonZero();
+      while (nonZeroElements.hasNext()) {
+        Vector.Element e = nonZeroElements.next();
+        setQuick(e.index(), function.apply(getQuick(e.index()), e.get()));
+      }
+    } else {
+      for (int i = 0; i < size; i++) {
+        setQuick(i, function.apply(getQuick(i), other.getQuick(i)));
+      }
     }
     return this;
   }

Modified: mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/pom.xml (original)
+++ mahout/trunk/pom.xml Fri Sep  9 07:45:16 2011
@@ -586,7 +586,7 @@
         <artifactId>maven-surefire-plugin</artifactId>
         <configuration>
           <forkMode>once</forkMode>
-          <argLine>-Xms256m -Xmx512m</argLine>
+          <argLine>-Xms256m -Xmx768m</argLine>
           <testFailureIgnore>false</testFailureIgnore>
           <redirectTestOutputToFile>true</redirectTestOutputToFile>
         </configuration>