You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/07/12 13:54:40 UTC
svn commit: r963249 - in /mahout/trunk: conf/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ core/src/main/java/org/apache/mahout/math/hadoop/similarity/ core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/
Author: srowen
Date: Mon Jul 12 11:54:39 2010
New Revision: 963249
URL: http://svn.apache.org/viewvc?rev=963249&view=rev
Log:
MAHOUT-440
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java
Modified:
mahout/trunk/conf/driver.classes.props
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Modified: mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/driver.classes.props?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/conf/driver.classes.props (original)
+++ mahout/trunk/conf/driver.classes.props Mon Jul 12 11:54:39 2010
@@ -20,3 +20,5 @@ org.apache.mahout.classifier.bayes.TestC
org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train Bayes Classifier
org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver = svd : Lanczos Singular Value Decomposition
org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob = cleansvd : Cleanup and verification of SVD output
+org.apache.mahout.math.hadoop.similarity.RowSimilarityJob = rowsimilarity : Compute the pairwise similarities of the rows of a matrix
+org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob = itemsimilarity : Compute the item-item-similarities for item-based collaborative filtering
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Mon Jul 12 11:54:39 2010
@@ -17,18 +17,12 @@
package org.apache.mahout.cf.taste.hadoop.similarity.item;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
@@ -47,6 +41,7 @@ import org.apache.mahout.math.VarLongWri
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.hadoop.similarity.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.SimilarityType;
public final class ItemSimilarityJob extends AbstractJob {
@@ -64,7 +59,8 @@ public final class ItemSimilarityJob ext
addInputOption();
addOutputOption();
- addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+ addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " +
+ "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " +
"(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
@@ -127,7 +123,7 @@ public final class ItemSimilarityJob ext
itemUserMatrix.waitForCompletion(true);
}
- int numberOfUsers = readNumberOfUsers(getConf(), countUsersPath);
+ int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);
/* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
* new DistributedRowMatrix(...).rowSimilarity(...) */
@@ -156,18 +152,4 @@ public final class ItemSimilarityJob ext
return 0;
}
-
- static int readNumberOfUsers(Configuration conf, Path outputDir) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- Path outputFile = fs.listStatus(outputDir, TasteHadoopUtils.PARTS_FILTER)[0].getPath();
- InputStream in = null;
- try {
- in = fs.open(outputFile);
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- IOUtils.copyBytes(in, out, conf);
- return Integer.parseInt(new String(out.toByteArray(), Charset.forName("UTF-8")).trim());
- } finally {
- IOUtils.closeStream(in);
- }
- }
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java Mon Jul 12 11:54:39 2010
@@ -91,7 +91,8 @@ public class RowSimilarityJob extends Ab
addInputOption();
addOutputOption();
addOption("numberOfColumns", "r", "Number of columns in the input matrix");
- addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+ addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " +
+ "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
+ DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
@@ -101,7 +102,15 @@ public class RowSimilarityJob extends Ab
}
int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
- String distributedSimilarityClassname = parsedArgs.get("--similarityClassname");
+ String similarityClassnameArg = parsedArgs.get("--similarityClassname");
+ String distributedSimilarityClassname;
+ try {
+ distributedSimilarityClassname =
+ SimilarityType.valueOf(similarityClassnameArg).getSimilarityImplementationClassName();
+ } catch (IllegalArgumentException iae) {
+ distributedSimilarityClassname = similarityClassnameArg;
+ }
+
int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));
Path inputPath = getInputPath();
@@ -341,4 +350,4 @@ public class RowSimilarityJob extends Ab
}
}
-}
\ No newline at end of file
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java?rev=963249&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java Mon Jul 12 11:54:39 2010
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity;
+
+import java.util.Arrays;
+
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedCooccurrenceVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedEuclideanDistanceVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedLoglikelihoodVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedPearsonCorrelationVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredZeroAssumingCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedVectorSimilarity;
+
+public enum SimilarityType {
+
+ SIMILARITY_COOCCURRENCE(DistributedCooccurrenceVectorSimilarity.class),
+ SIMILARITY_EUCLIDEAN_DISTANCE(DistributedEuclideanDistanceVectorSimilarity.class),
+ SIMILARITY_LOGLIKELIHOOD(DistributedLoglikelihoodVectorSimilarity.class),
+ SIMILARITY_PEARSON_CORRELATION(DistributedPearsonCorrelationVectorSimilarity.class),
+ SIMILARITY_TANIMOTO_COEFFICIENT(DistributedTanimotoCoefficientVectorSimilarity.class),
+ SIMILARITY_UNCENTERED_COSINE(DistributedUncenteredCosineVectorSimilarity.class),
+ SIMILARITY_UNCENTERED_ZERO_ASSUMING_COSINE(DistributedUncenteredZeroAssumingCosineVectorSimilarity.class);
+
+ private final Class<? extends DistributedVectorSimilarity> similarityImplementation;
+
+ SimilarityType(Class<? extends DistributedVectorSimilarity> similarityImplementation) {
+ this.similarityImplementation = similarityImplementation;
+ }
+
+ public String getSimilarityImplementationClassName() {
+ return similarityImplementation.getName();
+ }
+
+ public static String listEnumNames() {
+ return Arrays.toString(values());
+ }
+
+}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Mon Jul 12 11:54:39 2010
@@ -41,7 +41,6 @@ import org.apache.mahout.math.VarIntWrit
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.hadoop.MathHelper;
import org.apache.mahout.math.hadoop.DistributedRowMatrix.MatrixEntryWritable;
import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
@@ -253,8 +252,8 @@ public final class ItemSimilarityTest ex
DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName() });
File countUsersPart = new File(tmpDir, "countUsers");
- int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new Configuration(),
- new Path(countUsersPart.getAbsolutePath()));
+ int numberOfUsers = TasteHadoopUtils.readIntFromFile(new Configuration(),
+ new Path(countUsersPart.getAbsolutePath()));
assertEquals(3, numberOfUsers);