You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2011/09/09 09:45:17 UTC
svn commit: r1167027 [2/2] - in /mahout/trunk: ./
core/src/main/java/org/apache/mahout/cf/taste/hadoop/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/
core/src/main/java/org/...
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/Vectors.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import com.google.common.primitives.Doubles;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.common.TopK;
+import org.apache.mahout.common.iterator.FixedSizeSamplingIterator;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenDoubleIntHashMap;
+import org.apache.mahout.math.map.OpenIntDoubleHashMap;
+import org.apache.mahout.math.map.OpenIntIntHashMap;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.Iterator;
+
+/**
+ * Static helper methods for sampling, pruning, merging and (de)serializing {@link Vector}s.
+ */
+public final class Vectors {
+
+  private Vectors() {}
+
+  /**
+   * Limits a vector to a sample of its non-zero elements.
+   *
+   * @param original the vector to sample from
+   * @param sampleSize size handed to the {@link FixedSizeSamplingIterator}
+   * @return {@code original} itself if it has at most {@code sampleSize} non-default elements, otherwise a
+   *         new vector created via {@code original.like()} that holds only the sampled elements
+   */
+  public static Vector maybeSample(Vector original, int sampleSize) {
+    if (original.getNumNondefaultElements() <= sampleSize) {
+      return original;
+    }
+    Vector sample = original.like();
+    Iterator<Vector.Element> sampledElements =
+        new FixedSizeSamplingIterator<Vector.Element>(sampleSize, original.iterateNonZero());
+    while (sampledElements.hasNext()) {
+      Vector.Element elem = sampledElements.next();
+      sample.setQuick(elem.index(), elem.get());
+    }
+    return sample;
+  }
+
+  /**
+   * Keeps only the elements selected by a {@link TopK} queue of capacity {@code k} ordered by element value.
+   *
+   * @param k capacity of the queue
+   * @param original the vector to prune
+   * @return {@code original} itself if it has at most {@code k} non-default elements, otherwise a new vector
+   *         created via {@code original.like()} that holds the retrieved elements
+   */
+  public static Vector topKElements(int k, Vector original) {
+    if (original.getNumNondefaultElements() <= k) {
+      return original;
+    }
+    TopK<Vector.Element> topKQueue = new TopK<Vector.Element>(k, BY_VALUE);
+    Iterator<Vector.Element> nonZeroElements = original.iterateNonZero();
+    while (nonZeroElements.hasNext()) {
+      Vector.Element nonZeroElement = nonZeroElements.next();
+      // queue a detached copy; presumably the iterator may reuse its element object -- TODO confirm
+      topKQueue.offer(new TemporaryElement(nonZeroElement));
+    }
+    Vector topKSimilarities = original.like();
+    for (Vector.Element topKSimilarity : topKQueue.retrieve()) {
+      topKSimilarities.setQuick(topKSimilarity.index(), topKSimilarity.get());
+    }
+    return topKSimilarities;
+  }
+
+  /**
+   * Merges partial vectors by copying the non-zero entries of every later vector into the first one.
+   * The first vector is modified in place and returned; where indexes collide, the later value overwrites
+   * the earlier one. {@code null} entries after the first are skipped.
+   *
+   * @param partialVectors the vectors to merge; the first element is fetched unconditionally, so the
+   *        iterable must yield at least one non-null element
+   * @return the vector wrapped by the first element, now holding the merged entries
+   */
+  public static Vector merge(Iterable<VectorWritable> partialVectors) {
+    Iterator<VectorWritable> vectors = partialVectors.iterator();
+    Vector accumulator = vectors.next().get();
+    while (vectors.hasNext()) {
+      VectorWritable v = vectors.next();
+      if (v != null) {
+        Iterator<Vector.Element> nonZeroElements = v.get().iterateNonZero();
+        while (nonZeroElements.hasNext()) {
+          Vector.Element nonZeroElement = nonZeroElements.next();
+          accumulator.setQuick(nonZeroElement.index(), nonZeroElement.get());
+        }
+      }
+    }
+    return accumulator;
+  }
+
+  /** Orders elements by their value, ascending, using {@link Doubles#compare(double, double)}. */
+  static final Comparator<Vector.Element> BY_VALUE = new Comparator<Vector.Element>() {
+    @Override
+    public int compare(Vector.Element elem1, Vector.Element elem2) {
+      return Doubles.compare(elem1.get(), elem2.get());
+    }
+  };
+
+  /** A standalone (index, value) pair that is not backed by any vector. */
+  static class TemporaryElement implements Vector.Element {
+
+    private final int index;
+    private double value;
+
+    TemporaryElement(int index, double value) {
+      this.index = index;
+      this.value = value;
+    }
+
+    /** Copies index and value out of {@code toClone}. */
+    TemporaryElement(Vector.Element toClone) {
+      this(toClone.index(), toClone.get());
+    }
+
+    @Override
+    public double get() {
+      return value;
+    }
+
+    @Override
+    public int index() {
+      return index;
+    }
+
+    @Override
+    public void set(double value) {
+      this.value = value;
+    }
+  }
+
+  /**
+   * Copies the non-zero elements of the wrapped vector into an array of detached elements.
+   *
+   * @param vectorWritable wrapper around the vector to copy
+   * @return an array sized by {@code getNumNondefaultElements()} and filled in iteration order
+   */
+  public static Vector.Element[] toArray(VectorWritable vectorWritable) {
+    // NOTE(review): assumes iterateNonZero() yields exactly getNumNondefaultElements() entries -- confirm
+    Vector.Element[] elements = new Vector.Element[vectorWritable.get().getNumNondefaultElements()];
+    int k = 0;
+    Iterator<Vector.Element> nonZeroElements = vectorWritable.get().iterateNonZero();
+    while (nonZeroElements.hasNext()) {
+      elements[k++] = new TemporaryElement(nonZeroElements.next());
+    }
+    return elements;
+  }
+
+  /**
+   * Writes {@code vector} to {@code path} without lax precision.
+   *
+   * @throws IOException if the file cannot be created, written or closed
+   */
+  public static void write(Vector vector, Path path, Configuration conf) throws IOException {
+    write(vector, path, conf, false);
+  }
+
+  /**
+   * Writes {@code vector} to {@code path} as a {@link VectorWritable}.
+   *
+   * @param vector the vector to store
+   * @param path target file, created via the {@link FileSystem} resolved from the path's URI
+   * @param conf configuration used to resolve the file system
+   * @param laxPrecision passed to {@link VectorWritable#setWritesLaxPrecision(boolean)}
+   * @throws IOException if the file cannot be created, written or closed
+   */
+  public static void write(Vector vector, Path path, Configuration conf, boolean laxPrecision) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), conf);
+    FSDataOutputStream out = fs.create(path);
+    boolean threw = true;
+    try {
+      VectorWritable vectorWritable = new VectorWritable(vector);
+      vectorWritable.setWritesLaxPrecision(laxPrecision);
+      vectorWritable.write(out);
+      threw = false;
+    } finally {
+      // a failing close() on an output stream can mean lost data, so it is only swallowed
+      // when another exception is already propagating
+      Closeables.close(out, threw);
+    }
+  }
+
+  /**
+   * Opens {@code path} and reads its content via {@link #readAsIntMap(DataInput)}.
+   *
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), conf);
+    FSDataInputStream in = fs.open(path);
+    try {
+      return readAsIntMap(in);
+    } finally {
+      Closeables.closeQuietly(in);
+    }
+  }
+
+  /**
+   * Ugly optimization for loading sparse vectors containing ints only: reads a serialized vector directly
+   * into an index-to-int map, truncating every value with an {@code (int)} cast.
+   *
+   * @param in source positioned at the flags byte of the serialized vector
+   * @return map from element index to truncated value
+   * @throws IllegalArgumentException if bits beyond {@code VectorWritable.NUM_FLAGS} are set
+   * @throws IllegalStateException if the flags mark the vector as dense or sequential
+   * @throws IOException if reading fails
+   */
+  public static OpenIntIntHashMap readAsIntMap(DataInput in) throws IOException {
+    int flags = in.readByte();
+    // Guava's Preconditions templates only substitute %s placeholders
+    Preconditions.checkArgument(flags >> VectorWritable.NUM_FLAGS == 0,
+        "Unknown flags set: %s", Integer.toString(flags, 2));
+    boolean dense = (flags & VectorWritable.FLAG_DENSE) != 0;
+    boolean sequential = (flags & VectorWritable.FLAG_SEQUENTIAL) != 0;
+    boolean laxPrecision = (flags & VectorWritable.FLAG_LAX_PRECISION) != 0;
+    Preconditions.checkState(!dense && !sequential, "Only for reading sparse vectors!");
+
+    // read and discard one varint; presumably the vector's size -- TODO confirm against VectorWritable
+    Varint.readUnsignedVarInt(in);
+
+    OpenIntIntHashMap values = new OpenIntIntHashMap();
+    int numNonDefaultElements = Varint.readUnsignedVarInt(in);
+    for (int i = 0; i < numNonDefaultElements; i++) {
+      int index = Varint.readUnsignedVarInt(in);
+      double value = laxPrecision ? in.readFloat() : in.readDouble();
+      values.put(index, (int) value);
+    }
+    return values;
+  }
+
+  /**
+   * Opens {@code path} and reads a vector from it via {@link VectorWritable#readVector(DataInput)}.
+   *
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static Vector read(Path path, Configuration conf) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), conf);
+    FSDataInputStream in = fs.open(path);
+    try {
+      return VectorWritable.readVector(in);
+    } finally {
+      Closeables.closeQuietly(in);
+    }
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CityBlockSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/**
+ * Count-based measure that turns {@code normA + normB - 2 * dots} into a similarity via
+ * {@code 1 / (1 + x)}.
+ */
+public class CityBlockSimilarity extends CountbasedMeasure {
+
+  /**
+   * @return {@code 1.0 / (1.0 + normA + normB - 2 * dots)}; {@code numberOfColumns} is not used
+   */
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    double denominator = 1.0 + normA + normB - 2 * dots;
+    return 1.0 / denominator;
+  }
+
+  /** Never filters anything: always returns {@code true}, whatever the arguments. */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CooccurrenceCountSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/**
+ * Count-based measure whose similarity is the summed aggregation value itself.
+ */
+public class CooccurrenceCountSimilarity extends CountbasedMeasure {
+
+  /**
+   * @return {@code dots} unchanged; the norms and {@code numberOfColumns} are not used
+   */
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return dots;
+  }
+
+  /**
+   * @return {@code true} only if both non-zero entry counts are at least {@code treshold};
+   *         {@code maxValueA} is not used
+   */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    boolean enoughEntriesInA = numNonZeroEntriesA >= treshold;
+    boolean enoughEntriesInB = numNonZeroEntriesB >= treshold;
+    return enoughEntriesInA && enoughEntriesInB;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CosineSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Measure that normalizes vectors up front, so that the summed products of their entries can be
+ * returned directly as the similarity.
+ */
+public class CosineSimilarity implements VectorSimilarityMeasure {
+
+  /** @return the result of {@code vector.normalize()} */
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector.normalize();
+  }
+
+  /** @return always {@link VectorSimilarityMeasure#NO_NORM}; the vector is not inspected */
+  @Override
+  public double norm(Vector vector) {
+    return VectorSimilarityMeasure.NO_NORM;
+  }
+
+  /** @return the product of the two values */
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    double product = valueA * nonZeroValueB;
+    return product;
+  }
+
+  /** @return {@code dots} unchanged; the norms and {@code numberOfColumns} are not used */
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    return dots;
+  }
+
+  /**
+   * @return {@code true} if {@code numNonZeroEntriesB} is at least {@code treshold / maxValueA};
+   *         {@code numNonZeroEntriesA} is not used
+   */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    double requiredEntries = treshold / maxValueA;
+    return numNonZeroEntriesB >= requiredEntries;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/CountbasedMeasure.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Base class for measures that ignore the actual entry values: vectors are left untouched and every
+ * aggregated pair contributes a constant 1. Subclasses supply {@code similarity} and {@code consider}.
+ */
+public abstract class CountbasedMeasure implements VectorSimilarityMeasure {
+
+  /** @return the given vector, unmodified */
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector;
+  }
+
+  /**
+   * @return {@code vector.norm(0)} (presumably the number of non-zero entries; verify against
+   *         {@code Vector.norm})
+   */
+  @Override
+  public double norm(Vector vector) {
+    return vector.norm(0);
+  }
+
+  /** @return always 1, regardless of the two values */
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    return 1.0;
+  }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.Iterator;
+
+/**
+ * Measure that maps the euclidean distance {@code d} between two vectors to the similarity
+ * {@code 1 / (1 + d)}, so that a distance of 0 gives 1.0 and growing distances approach 0.
+ */
+public class EuclideanDistanceSimilarity implements VectorSimilarityMeasure {
+
+  /** @return the given vector, unmodified */
+  @Override
+  public Vector normalize(Vector vector) {
+    return vector;
+  }
+
+  /**
+   * @return the sum of the squared non-zero entries, i.e. the squared euclidean length
+   *         (no square root is taken)
+   */
+  @Override
+  public double norm(Vector vector) {
+    double sumOfSquares = 0;
+    Iterator<Vector.Element> nonZeroElements = vector.iterateNonZero();
+    while (nonZeroElements.hasNext()) {
+      double value = nonZeroElements.next().get();
+      sumOfSquares += value * value;
+    }
+    return sumOfSquares;
+  }
+
+  /** @return the product of the two values */
+  @Override
+  public double aggregate(double valueA, double nonZeroValueB) {
+    return valueA * nonZeroValueB;
+  }
+
+  /**
+   * Converts dot product and squared lengths into a similarity.
+   *
+   * @param dots summed aggregations (presumably the dot product of the two vectors)
+   * @param normA presumably the value {@link #norm(Vector)} returned for vector A, i.e. its squared length
+   * @param normB presumably the value {@link #norm(Vector)} returned for vector B, i.e. its squared length
+   * @param numberOfColumns not used
+   * @return {@code 1 / (1 + distance)}
+   */
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    // |a - b|^2 = |a|^2 - 2ab + |b|^2; rounding can push this slightly below zero, and
+    // sqrt of a negative number would yield NaN, so clamp at 0
+    double squaredDistance = Math.max(0.0, normA - 2 * dots + normB);
+    double euclideanDistance = Math.sqrt(squaredDistance);
+    // a smaller distance must give a larger similarity (1 - 1/(1 + d) would grow with the distance)
+    return 1.0 / (1.0 + euclideanDistance);
+  }
+
+  /** Never filters anything: always returns {@code true}, whatever the arguments. */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/LoglikelihoodSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.stats.LogLikelihood;
+
+/**
+ * Count-based measure that feeds four counts derived from its arguments into
+ * {@link LogLikelihood#logLikelihoodRatio} and maps the ratio {@code r} to {@code 1 - 1 / (1 + r)}.
+ */
+public class LoglikelihoodSimilarity extends CountbasedMeasure {
+
+  /**
+   * @return {@code 1.0 - 1.0 / (1.0 + logLikelihood)} for the ratio computed from the four counts below
+   */
+  @Override
+  public double similarity(double summedAggregations, double normA, double normB, int numberOfColumns) {
+    // presumably the cells of a 2x2 contingency table (both / only B / only A / neither) -- TODO confirm;
+    // each count is truncated to long exactly as passed on
+    long first = (long) summedAggregations;
+    long second = (long) (normB - summedAggregations);
+    long third = (long) (normA - summedAggregations);
+    long fourth = (long) (numberOfColumns - normA - normB + summedAggregations);
+
+    double logLikelihood = LogLikelihood.logLikelihoodRatio(first, second, third, fourth);
+    return 1.0 - 1.0 / (1.0 + logLikelihood);
+  }
+
+  /** Never filters anything: always returns {@code true}, whatever the arguments. */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    return true;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/PearsonCorrelationSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.Iterator;
+
+/**
+ * Variant of {@link CosineSimilarity} that shifts the non-zero entries of a vector by their average
+ * before delegating to the inherited normalization.
+ */
+public class PearsonCorrelationSimilarity extends CosineSimilarity {
+
+  /**
+   * Subtracts the average from every non-zero entry of {@code vector} in place and then applies
+   * {@link CosineSimilarity#normalize(Vector)}. A vector without non-default elements is returned as is.
+   */
+  @Override
+  public Vector normalize(Vector vector) {
+    int numEntries = vector.getNumNondefaultElements();
+    if (numEntries == 0) {
+      return vector;
+    }
+    // center non-zero elements
+    // NOTE(review): norm(1) looks like a sum of absolute values, which would only equal the plain sum
+    // for non-negative entries -- confirm this is intended
+    double mean = vector.norm(1) / numEntries;
+    for (Iterator<Vector.Element> entries = vector.iterateNonZero(); entries.hasNext();) {
+      Vector.Element entry = entries.next();
+      vector.setQuick(entry.index(), entry.get() - mean);
+    }
+    return super.normalize(vector);
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/TanimotoCoefficientSimilarity.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+/**
+ * Count-based measure computing {@code dots / (normA + normB - dots)}.
+ */
+public class TanimotoCoefficientSimilarity extends CountbasedMeasure {
+
+  /**
+   * @return {@code dots / (normA + normB - dots)}; {@code numberOfColumns} is not used
+   */
+  @Override
+  public double similarity(double dots, double normA, double normB, int numberOfColumns) {
+    double denominator = normA + normB - dots;
+    return dots / denominator;
+  }
+
+  /**
+   * @return {@code true} if {@code numNonZeroEntriesA} is at least {@code numNonZeroEntriesB * treshold};
+   *         {@code maxValueA} is not used
+   */
+  @Override
+  public boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold) {
+    double requiredEntriesInA = numNonZeroEntriesB * treshold;
+    return numNonZeroEntriesA >= requiredEntriesInA;
+  }
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasure.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Contract for the similarity measures in this package: a per-vector preprocessing step, a per-vector
+ * norm, a pairwise aggregation of entries, the final similarity formula and a pruning predicate.
+ */
+public interface VectorSimilarityMeasure {
+
+  /** Value returned by implementations of {@link #norm(Vector)} that do not compute a norm. */
+  double NO_NORM = 0;
+
+  /** Returns the vector to use in place of {@code vector}; implementations may return it unchanged. */
+  Vector normalize(Vector vector);
+
+  /** Returns the per-vector value that is later passed to {@code similarity} as {@code normA}/{@code normB} (presumably; confirm against the calling job). */
+  double norm(Vector vector);
+
+  /** Combines one entry of each vector into a single contribution. */
+  double aggregate(double nonZeroValueA, double nonZeroValueB);
+
+  /** Computes the similarity from the summed aggregations, the two norms and the number of columns. */
+  double similarity(double summedAggregations, double normA, double normB, int numberOfColumns);
+
+  /** Decides whether a pair with the given characteristics should be considered at all. */
+  boolean consider(int numNonZeroEntriesA, int numNonZeroEntriesB, double maxValueA, double treshold);
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasures.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import java.util.Arrays;
+
+/**
+ * Names of the available similarity measures, each bound to its implementing class.
+ */
+public enum VectorSimilarityMeasures {
+
+  SIMILARITY_COOCCURRENCE(CooccurrenceCountSimilarity.class),
+  SIMILARITY_LOGLIKELIHOOD(LoglikelihoodSimilarity.class),
+  SIMILARITY_TANIMOTO_COEFFICIENT(TanimotoCoefficientSimilarity.class),
+  SIMILARITY_CITY_BLOCK(CityBlockSimilarity.class),
+  SIMILARITY_COSINE(CosineSimilarity.class),
+  SIMILARITY_PEARSON_CORRELATION(PearsonCorrelationSimilarity.class),
+  SIMILARITY_EUCLIDEAN_DISTANCE(EuclideanDistanceSimilarity.class);
+
+  /** The class that implements this measure. */
+  private final Class<? extends VectorSimilarityMeasure> implementingClass;
+
+  VectorSimilarityMeasures(Class<? extends VectorSimilarityMeasure> impl) {
+    implementingClass = impl;
+  }
+
+  /** @return the fully qualified name of the implementing class */
+  public String getClassname() {
+    Class<? extends VectorSimilarityMeasure> measureClass = implementingClass;
+    return measureClass.getName();
+  }
+
+  /** @return all constants of this enum in declaration order, formatted like {@code [A, B, C]} */
+  public static String list() {
+    return Arrays.asList(values()).toString();
+  }
+
+}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJobTest.java Fri Sep 9 07:45:16 2011
@@ -251,10 +251,10 @@ public class ParallelALSFactorizationJob
alsFactorization.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--lambda", String.valueOf(lambda),
"--numFeatures", String.valueOf(numFeatures), "--numIterations", String.valueOf(numIterations) });
- Matrix u = MathHelper.readEntries(conf, new Path(outputDir.getAbsolutePath(), "U/part-r-00000"),
+ Matrix u = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "U/part-r-00000"),
preferences.numRows(), numFeatures);
- Matrix m = MathHelper.readEntries(conf, new Path(outputDir.getAbsolutePath(), "M/part-r-00000"),
- preferences.numCols(), numFeatures);
+ Matrix m = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "M/part-r-00000"),
+ preferences.numCols(), numFeatures);
RunningAverage avg = new FullRunningAverage();
sliceIterator = preferences.iterateAll();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/als/PredictionJobTest.java Fri Sep 9 07:45:16 2011
@@ -43,15 +43,15 @@ public class PredictionJobTest extends T
Path inputPath = new Path(pairs.getAbsolutePath());
FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
- MathHelper.writeEntries(new double[][]{
- new double[] { 1.5, -2, 0.3 },
- new double[] { -0.7, 2, 0.6 },
- new double[] { -1, 2.5, 3 } }, fs, conf, new Path(userFeatures.getAbsolutePath()));
-
- MathHelper.writeEntries(new double [][] {
- new double[] { 2.3, 0.5, 0 },
- new double[] { 4.7, -1, 0.2 },
- new double[] { 0.6, 2, 1.3 } }, fs, conf, new Path(itemFeatures.getAbsolutePath()));
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[]{1.5, -2, 0.3},
+ new double[]{-0.7, 2, 0.6},
+ new double[]{-1, 2.5, 3}}, fs, conf, new Path(userFeatures.getAbsolutePath()));
+
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[]{2.3, 0.5, 0},
+ new double[]{4.7, -1, 0.2},
+ new double[]{0.6, 2, 1.3}}, fs, conf, new Path(itemFeatures.getAbsolutePath()));
writeLines(pairs, "0,0", "2,1", "1,0");
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java Fri Sep 9 07:45:16 2011
@@ -49,8 +49,8 @@ import org.apache.mahout.math.VarLongWri
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.MathHelper;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedCooccurrenceVectorSimilarity;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CooccurrenceCountSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
import org.apache.mahout.math.map.OpenIntLongHashMap;
import org.easymock.IArgumentMatcher;
import org.easymock.EasyMock;
@@ -131,7 +131,7 @@ public class RecommenderJobTest extends
}
/**
- * tests {@link ToUserVectorReducer}
+ * tests {@link ToUserVectorsReducer}
*/
@Test
public void testToUserVectorReducer() throws Exception {
@@ -139,7 +139,7 @@ public class RecommenderJobTest extends
EasyMock.createMock(Reducer.Context.class);
Counter userCounters = EasyMock.createMock(Counter.class);
- EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+ EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
userCounters.increment(1);
context.write(EasyMock.eq(new VarLongWritable(12L)), MathHelper.vectorMatches(
MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 2.0)));
@@ -150,13 +150,13 @@ public class RecommenderJobTest extends
varLongWritables.add(new EntityPrefWritable(34L, 1.0f));
varLongWritables.add(new EntityPrefWritable(56L, 2.0f));
- new ToUserVectorReducer().reduce(new VarLongWritable(12L), varLongWritables, context);
+ new ToUserVectorsReducer().reduce(new VarLongWritable(12L), varLongWritables, context);
EasyMock.verify(context, userCounters);
}
/**
- * tests {@link ToUserVectorReducer} using boolean data
+ * tests {@link ToUserVectorsReducer} using boolean data
*/
@Test
public void testToUserVectorReducerWithBooleanData() throws Exception {
@@ -164,14 +164,14 @@ public class RecommenderJobTest extends
EasyMock.createMock(Reducer.Context.class);
Counter userCounters = EasyMock.createMock(Counter.class);
- EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+ EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
userCounters.increment(1);
context.write(EasyMock.eq(new VarLongWritable(12L)), MathHelper.vectorMatches(
MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 1.0)));
EasyMock.replay(context, userCounters);
- new ToUserVectorReducer().reduce(new VarLongWritable(12L), Arrays.asList(new VarLongWritable(34L),
+ new ToUserVectorsReducer().reduce(new VarLongWritable(12L), Arrays.asList(new VarLongWritable(34L),
new VarLongWritable(56L)), context);
EasyMock.verify(context, userCounters);
@@ -728,7 +728,7 @@ public class RecommenderJobTest extends
recommenderJob.setConf(conf);
recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--numRecommendations", "4" });
+ TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "4" });
Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
@@ -804,7 +804,7 @@ public class RecommenderJobTest extends
recommenderJob.setConf(conf);
recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- DistributedCooccurrenceVectorSimilarity.class.getName(), "--booleanData", "true",
+ CooccurrenceCountSimilarity.class.getName(), "--booleanData", "true",
"--usersFile", usersFile.getAbsolutePath() });
Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
@@ -865,7 +865,7 @@ public class RecommenderJobTest extends
recommenderJob.setConf(conf);
recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--numRecommendations", "1",
+ TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "1",
"--usersFile", userFile.getAbsolutePath(), "--filterFile", filterFile.getAbsolutePath() });
Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
Copied: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java (from r1164967, mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java)
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java&r1=1164967&r2=1167027&rev=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducerTest.java Fri Sep 9 07:45:16 2011
@@ -30,16 +30,16 @@ import org.junit.Test;
import java.util.Arrays;
/**
- * tests {@link org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer}
+ * tests {@link ToUserVectorsReducer}
*/
-public class ToUserVectorReducerTest extends TasteTestCase {
+public class ToUserVectorsReducerTest extends TasteTestCase {
@Test
public void testToUsersReducerMinPreferencesUserIgnored() throws Exception {
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
EasyMock.createMock(Reducer.Context.class);
- ToUserVectorReducer reducer = new ToUserVectorReducer();
+ ToUserVectorsReducer reducer = new ToUserVectorsReducer();
setField(reducer, "minPreferences", 2);
EasyMock.replay(context);
@@ -55,10 +55,10 @@ public class ToUserVectorReducerTest ext
EasyMock.createMock(Reducer.Context.class);
Counter userCounters = EasyMock.createMock(Counter.class);
- ToUserVectorReducer reducer = new ToUserVectorReducer();
+ ToUserVectorsReducer reducer = new ToUserVectorsReducer();
setField(reducer, "minPreferences", 2);
- EasyMock.expect(context.getCounter(ToUserVectorReducer.Counters.USERS)).andReturn(userCounters);
+ EasyMock.expect(context.getCounter(ToUserVectorsReducer.Counters.USERS)).andReturn(userCounters);
userCounters.increment(1);
context.write(EasyMock.eq(new VarLongWritable(123)), MathHelper.vectorMatches(
MathHelper.elem(TasteHadoopUtils.idToIndex(456L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(789L), 1.0)));
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java Fri Sep 9 07:45:16 2011
@@ -21,27 +21,22 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
-import java.util.List;
import com.google.common.base.Charsets;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.impl.TasteTestCase;
import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.VarIntWritable;
-import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
-import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredZeroAssumingCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CosineSimilarity;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
import org.apache.mahout.math.map.OpenIntLongHashMap;
-import org.easymock.IArgumentMatcher;
import org.easymock.EasyMock;
import org.junit.Test;
@@ -138,7 +133,7 @@ public final class ItemSimilarityJobTest
similarityJob.setConf(conf);
similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName() });
+ CosineSimilarity.class.getName() });
File outPart = outputDir.listFiles(new FilenameFilter() {
@Override
@@ -234,7 +229,7 @@ public final class ItemSimilarityJobTest
similarityJob.setConf(conf);
similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
- DistributedTanimotoCoefficientVectorSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });
+ TanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });
File outPart = outputDir.listFiles(new FilenameFilter() {
@Override
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/graph/linkanalysis/PageRankJobTest.java Fri Sep 9 07:45:16 2011
@@ -100,7 +100,7 @@ public class PageRankJobTest extends Gra
{ 0.266666667, 0, 0.8, 0.4 },
{ 0.266666667, 0.4, 0, 0 } });
- Matrix actualTransitionMatrix = MathHelper.readEntries(conf, new Path(tempDir.getAbsolutePath(),
+ Matrix actualTransitionMatrix = MathHelper.readMatrix(conf, new Path(tempDir.getAbsolutePath(),
"transitionMatrix/part-r-00000"), numVertices, numVertices);
assertEquals(expectedTransitionMatrix, actualTransitionMatrix);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/MathHelper.java Fri Sep 9 07:45:16 2011
@@ -168,10 +168,10 @@ public final class MathHelper {
/**
* read a {@link Matrix} from a SequenceFile<IntWritable,VectorWritable>
*/
- public static Matrix readEntries(Configuration conf, Path path, int rows, int columns) {
+ public static Matrix readMatrix(Configuration conf, Path path, int rows, int columns) {
Matrix matrix = new DenseMatrix(rows, columns);
for (Pair<IntWritable,VectorWritable> record :
- new SequenceFileIterable<IntWritable,VectorWritable>(path, true, conf)) {
+ new SequenceFileIterable<IntWritable,VectorWritable>(path, true, conf)) {
IntWritable key = record.getFirst();
VectorWritable value = record.getSecond();
int row = key.get();
@@ -187,7 +187,7 @@ public final class MathHelper {
/**
* write a two-dimensional double array to an SequenceFile<IntWritable,VectorWritable>
*/
- public static void writeEntries(double[][] entries, FileSystem fs, Configuration conf, Path path)
+ public static void writeDistributedRowMatrix(double[][] entries, FileSystem fs, Configuration conf, Path path)
throws IOException {
SequenceFile.Writer writer = null;
try {
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.hadoop.MathHelper;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
+import org.junit.Test;
+
+import java.io.File;
+
+public class RowSimilarityJobTest extends MahoutTestCase {
+
+ /**
+ * Integration test with a tiny data set: runs {@link RowSimilarityJob} with
+ * {@link TanimotoCoefficientSimilarity} and asserts all nine entries of the 3x3 result.
+ * <pre>
+ *
+ * input matrix:
+ *
+ * 1, 0, 1, 1, 0
+ * 0, 0, 1, 1, 0
+ * 0, 0, 0, 0, 1
+ *
+ * similarity matrix (via tanimoto):
+ *
+ * 1, 0.666, 0
+ * 0.666, 1, 0
+ * 0, 0, 1
+ * </pre>
+ */
+ @Test
+ public void toyIntegration() throws Exception {
+ // the output directory is deleted again before the run (presumably the job has to create it itself - TODO confirm)
+ File inputFile = getTestTempFile("rows");
+ File outputDir = getTestTempDir("output");
+ outputDir.delete();
+ File tmpDir = getTestTempDir("tmp");
+
+ Configuration conf = new Configuration();
+ Path inputPath = new Path(inputFile.getAbsolutePath());
+ FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+ // write the toy input matrix to inputPath using the MathHelper test utility
+ MathHelper.writeDistributedRowMatrix(new double[][] {
+ new double[] { 1, 0, 1, 1, 0 },
+ new double[] { 0, 0, 1, 1, 0 },
+ new double[] { 0, 0, 0, 0, 1 } },
+ fs, conf, inputPath);
+
+ RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+ rowSimilarityJob.setConf(conf);
+ rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+ "--numberOfColumns", String.valueOf(5), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+ "--tempDir", tmpDir.getAbsolutePath() });
+
+ Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+ assertNotNull(similarityMatrix);
+ assertEquals(3, similarityMatrix.numCols());
+ assertEquals(3, similarityMatrix.numRows());
+ // compare every entry with the similarity matrix given in the Javadoc above
+ assertEquals(1.0, similarityMatrix.get(0, 0), EPSILON);
+ assertEquals(1.0, similarityMatrix.get(1, 1), EPSILON);
+ assertEquals(1.0, similarityMatrix.get(2, 2), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 0), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+ assertEquals(0.666666, similarityMatrix.get(0, 1), EPSILON);
+ assertEquals(0.666666, similarityMatrix.get(1, 0), EPSILON);
+ }
+ /** Like {@link #toyIntegration()} on a 3x6 matrix, with --maxSimilaritiesPerRow 1 and --excludeSelfSimilarity true: one non-zero entry per row is expected. */
+ @Test
+ public void toyIntegrationMaxSimilaritiesPerRow() throws Exception {
+
+ File inputFile = getTestTempFile("rows");
+ File outputDir = getTestTempDir("output");
+ outputDir.delete();
+ File tmpDir = getTestTempDir("tmp");
+
+ Configuration conf = new Configuration();
+ Path inputPath = new Path(inputFile.getAbsolutePath());
+ FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[] { 1, 0, 1, 1, 0, 1 },
+ new double[] { 0, 1, 1, 1, 1, 1 },
+ new double[] { 1, 1, 0, 1, 0, 0 } },
+ fs, conf, inputPath);
+
+ RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+ rowSimilarityJob.setConf(conf);
+ rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+ "--numberOfColumns", String.valueOf(6), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+ "--maxSimilaritiesPerRow", String.valueOf(1), "--excludeSelfSimilarity", String.valueOf(true),
+ "--tempDir", tmpDir.getAbsolutePath() });
+
+ Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+ assertNotNull(similarityMatrix);
+ assertEquals(3, similarityMatrix.numCols());
+ assertEquals(3, similarityMatrix.numRows());
+
+ assertEquals(0.0, similarityMatrix.get(0, 0), EPSILON);
+ assertEquals(0.5, similarityMatrix.get(0, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+
+ assertEquals(0.5, similarityMatrix.get(1, 0), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(1, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+
+ assertEquals(0.4, similarityMatrix.get(2, 0), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
+ }
+ /** Same 3x6 input, with --threshold 0.5 and --excludeSelfSimilarity true: only the 0.5 entries at (0,1) and (1,0) are expected to be non-zero. */
+ @Test
+ public void toyIntegrationWithThreshold() throws Exception {
+
+
+ File inputFile = getTestTempFile("rows");
+ File outputDir = getTestTempDir("output");
+ outputDir.delete();
+ File tmpDir = getTestTempDir("tmp");
+
+ Configuration conf = new Configuration();
+ Path inputPath = new Path(inputFile.getAbsolutePath());
+ FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[] { 1, 0, 1, 1, 0, 1 },
+ new double[] { 0, 1, 1, 1, 1, 1 },
+ new double[] { 1, 1, 0, 1, 0, 0 } },
+ fs, conf, inputPath);
+
+ RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+ rowSimilarityJob.setConf(conf);
+ rowSimilarityJob.run(new String[] { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
+ "--numberOfColumns", String.valueOf(6), "--similarityClassname", TanimotoCoefficientSimilarity.class.getName(),
+ "--excludeSelfSimilarity", String.valueOf(true), "--threshold", String.valueOf(0.5),
+ "--tempDir", tmpDir.getAbsolutePath() });
+
+ Matrix similarityMatrix = MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"), 3, 3);
+
+ assertNotNull(similarityMatrix);
+ assertEquals(3, similarityMatrix.numCols());
+ assertEquals(3, similarityMatrix.numRows());
+
+ assertEquals(0.0, similarityMatrix.get(0, 0), EPSILON);
+ assertEquals(0.5, similarityMatrix.get(0, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(0, 2), EPSILON);
+
+ assertEquals(0.5, similarityMatrix.get(1, 0), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(1, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(1, 2), EPSILON);
+
+ assertEquals(0.0, similarityMatrix.get(2, 0), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 1), EPSILON);
+ assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
+ }
+
+}
Added: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java?rev=1167027&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java Fri Sep 9 07:45:16 2011
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures;
+
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+public class VectorSimilarityMeasuresTest extends MahoutTestCase {
+ /** Runs normalize, norm, aggregate and similarity of the given measure class on two dense arrays and returns the result. */
+ double distributedSimilarity(double[] one, double[] two, Class<? extends VectorSimilarityMeasure> similarityMeasureClass) {
+ VectorSimilarityMeasure similarityMeasure = ClassUtils.instantiateAs(similarityMeasureClass,
+ VectorSimilarityMeasure.class);
+ Vector oneNormalized = similarityMeasure.normalize(asSparseVector(one));
+ Vector twoNormalized = similarityMeasure.normalize(asSparseVector(two));
+
+ double normOne = similarityMeasure.norm(oneNormalized);
+ double normTwo = similarityMeasure.norm(twoNormalized);
+ // only dimensions where both normalized vectors are non-zero contribute to the aggregate
+ double dot = 0;
+ for (int n = 0; n < one.length; n++) {
+ if (oneNormalized.get(n) != 0 && twoNormalized.get(n) != 0) {
+ dot += similarityMeasure.aggregate(oneNormalized.get(n), twoNormalized.get(n));
+ }
+ }
+ return similarityMeasure.similarity(dot, normOne, normTwo, one.length);
+ }
+ /** Builds a RandomAccessSparseVector holding the non-zero entries of the given array. */
+ Vector asSparseVector(double[] values) {
+ Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
+ for (int dim = 0; dim < values.length; dim++) {
+ if (values[dim] != 0) {
+ vector.setQuick(dim, values[dim]);
+ }
+ }
+ return vector;
+ }
+ // each test below runs one measure on two fixed vectors and checks against a hard-coded expected value
+ @Test
+ public void testCooccurrenceCountSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+ new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CooccurrenceCountSimilarity.class);
+
+ assertEquals(5d, similarity, 0);
+ }
+
+ @Test
+ public void testTanimotoCoefficientSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+ new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, TanimotoCoefficientSimilarity.class);
+
+ assertEquals(0.454545455, similarity, EPSILON);
+ }
+
+ @Test
+ public void testCityblockSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+ new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CityBlockSimilarity.class);
+
+ assertEquals(0.142857143, similarity, EPSILON);
+ }
+
+ @Test
+ public void testLoglikelihoodSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 },
+ new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, LoglikelihoodSimilarity.class);
+
+ assertEquals(0.03320155369284261, similarity, EPSILON);
+ }
+
+ @Test
+ public void testCosineSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 2, 2, 0 },
+ new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 1, 1 }, CosineSimilarity.class);
+
+ assertEquals(0.769846046, similarity, EPSILON);
+ }
+
+ @Test
+ public void testPearsonCorrelationSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
+ new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 3 }, PearsonCorrelationSimilarity.class);
+
+ assertEquals(0.5303300858899108, similarity, EPSILON);
+ }
+
+ @Test
+ public void testEuclideanDistanceSimilarity() {
+ double similarity = distributedSimilarity(
+ new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
+ new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 4 }, EuclideanDistanceSimilarity.class);
+
+ assertEquals(0.887311346, similarity, EPSILON);
+ }
+}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/eval/ParallelFactorizationEvaluatorTest.java Fri Sep 9 07:45:16 2011
@@ -46,15 +46,15 @@ public class ParallelFactorizationEvalua
Path inputPath = new Path(pairs.getAbsolutePath());
FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
- MathHelper.writeEntries(new double[][] {
- new double[] { 1.5, -2, 0.3 },
- new double[] { -0.7, 2, 0.6 },
- new double[] { -1, 2.5, 3 } }, fs, conf, new Path(userFeatures.getAbsolutePath()));
-
- MathHelper.writeEntries(new double [][] {
- new double[] { 2.3, 0.5, 0 },
- new double[] { 4.7, -1, 0.2 },
- new double[] { 0.6, 2, 1.3 } }, fs, conf, new Path(itemFeatures.getAbsolutePath()));
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[]{1.5, -2, 0.3},
+ new double[]{-0.7, 2, 0.6},
+ new double[]{-1, 2.5, 3}}, fs, conf, new Path(userFeatures.getAbsolutePath()));
+
+ MathHelper.writeDistributedRowMatrix(new double[][]{
+ new double[]{2.3, 0.5, 0},
+ new double[]{4.7, -1, 0.2},
+ new double[]{0.6, 2, 1.3}}, fs, conf, new Path(itemFeatures.getAbsolutePath()));
writeLines(pairs, "0,0,3", "2,1,-7", "1,0,-2");
@@ -74,4 +74,4 @@ public class ParallelFactorizationEvalua
}
}
-}
\ No newline at end of file
+}
Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java Fri Sep 9 07:45:16 2011
@@ -20,6 +20,7 @@ package org.apache.mahout.math;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.function.DoubleDoubleFunction;
import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
import java.util.Iterator;
@@ -515,8 +516,18 @@ public abstract class AbstractVector imp
if (size != other.size()) {
throw new CardinalityException(size, other.size());
}
- for (int i = 0; i < size; i++) {
- setQuick(i, function.apply(getQuick(i), other.getQuick(i)));
+
+ /* special case: we only need to iterate over the non-zero elements of the vector to add */
+ if (Functions.PLUS.equals(function)) {
+ Iterator<Vector.Element> nonZeroElements = other.iterateNonZero();
+ while (nonZeroElements.hasNext()) {
+ Vector.Element e = nonZeroElements.next();
+ setQuick(e.index(), function.apply(getQuick(e.index()), e.get()));
+ }
+ } else {
+ for (int i = 0; i < size; i++) {
+ setQuick(i, function.apply(getQuick(i), other.getQuick(i)));
+ }
}
return this;
}
Modified: mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1167027&r1=1167026&r2=1167027&view=diff
==============================================================================
--- mahout/trunk/pom.xml (original)
+++ mahout/trunk/pom.xml Fri Sep 9 07:45:16 2011
@@ -586,7 +586,7 @@
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<forkMode>once</forkMode>
- <argLine>-Xms256m -Xmx512m</argLine>
+ <argLine>-Xms256m -Xmx768m</argLine>
<testFailureIgnore>false</testFailureIgnore>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
</configuration>