You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2011/10/19 22:23:53 UTC

svn commit: r1186452 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java examples/bin/build-reuters.sh

Author: jeastman
Date: Wed Oct 19 20:23:50 2011
New Revision: 1186452

URL: http://svn.apache.org/viewvc?rev=1186452&view=rev
Log:
MAHOUT-846: Fixed the prior of DistanceMeasureClusterDistribution (DMCD) so that its element values are Gaussian-distributed instead of all 0. Updated build-reuters.sh to use DMCD. All tests run and the Reuters example looks much better.

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java
    mahout/trunk/examples/bin/build-reuters.sh

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java?rev=1186452&r1=1186451&r2=1186452&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistanceMeasureClusterDistribution.java Wed Oct 19 20:23:50 2011
@@ -19,14 +19,16 @@ package org.apache.mahout.clustering.dir
 
 import org.apache.mahout.clustering.DistanceMeasureCluster;
 import org.apache.mahout.clustering.Model;
+import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
 /**
- * An implementation of the ModelDistribution interface suitable for testing the DirichletCluster algorithm.
- * Models use a DistanceMeasure to calculate pdf values.
+ * An implementation of the ModelDistribution interface suitable for testing the
+ * DirichletCluster algorithm. Models use a DistanceMeasure to calculate pdf
+ * values.
  */
 public class DistanceMeasureClusterDistribution extends AbstractVectorModelDistribution {
 
@@ -48,9 +50,12 @@ public class DistanceMeasureClusterDistr
   @Override
   public Model<VectorWritable>[] sampleFromPrior(int howMany) {
     Model<VectorWritable>[] result = new DistanceMeasureCluster[howMany];
+    Vector prototype = getModelPrototype().get().like();
+    for (int i = 0; i < prototype.size(); i++) {
+      prototype.setQuick(i, UncommonDistributions.rNorm(0, 1));
+    }
     for (int i = 0; i < howMany; i++) {
-      Vector prototype = getModelPrototype().get();
-      result[i] = new DistanceMeasureCluster(prototype.like(), i, measure);
+      result[i] = new DistanceMeasureCluster(prototype, i, measure);
     }
     return result;
   }

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1186452&r1=1186451&r2=1186452&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Wed Oct 19 20:23:50 2011
@@ -142,12 +142,15 @@ elif [ "x$clustertype" == "xdirichlet" ]
   && \
   $MAHOUT dirichlet \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
-    -o ${WORK_DIR}/reuters-dirichlet -k 20 -ow -x 20 \
+    -o ${WORK_DIR}/reuters-dirichlet -k 20 -ow -x 10 -a0 2 \
+    -md org.apache.mahout.clustering.dirichlet.models.DistanceMeasureClusterDistribution \
+    -mp org.apache.mahout.math.DenseVector \
+    -dm org.apache.mahout.common.distance.CosineDistanceMeasure \
   && \
   $MAHOUT clusterdump \
     -s ${WORK_DIR}/reuters-dirichlet/clusters-*-final \
     -d ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20
 else 
-  echo "unknown cluster type: $clustertype";
+  echo "unknown cluster type: $clustertype"
 fi