You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/12/08 19:16:10 UTC
svn commit: r1549087 - in /mahout/trunk: ./
core/src/main/java/org/apache/mahout/clustering/classify/
core/src/test/java/org/apache/mahout/clustering/classify/
core/src/test/java/org/apache/mahout/clustering/kmeans/
integration/src/main/java/org/apache...
Author: smarthi
Date: Sun Dec 8 18:16:10 2013
New Revision: 1549087
URL: http://svn.apache.org/r1549087
Log:
MAHOUT-1030: Regression: Clustered Points Should be WeightedPropertyVectorWritable not WeightedVectorWritable
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Dec 8 18:16:10 2013
@@ -66,6 +66,8 @@ Release 0.9 - unreleased
MAHOUT-1242: No key redistribution function for associative maps (Tharindu Rusira via smarthi)
+ MAHOUT-1030: Regression: Clustered Points Should be WeightedPropertyVectorWritable not WeightedVectorWritable (Andrew Musselman, Pat Ferrel, Jeff Eastman, Lars Norskog, smarthi)
+
Release 0.8 - 2013-07-25
MAHOUT-1272: Parallel SGD matrix factorizer for SVDrecommender (Peng Cheng via ssc)
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java Sun Dec 8 18:16:10 2013
@@ -20,13 +20,16 @@ package org.apache.mahout.clustering.cla
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
@@ -106,7 +109,10 @@ public class ClusterClassificationMapper
throws IOException, InterruptedException {
Cluster cluster = clusterModels.get(clusterIndex);
clusterId.set(cluster.getId());
- context.write(clusterId, new WeightedVectorWritable(weight, vw.get()));
+ double d = cluster.getCenter().getDistanceSquared(vw.get());
+ Map<Text, Text> props = Maps.newHashMap();
+ props.put(new Text("distance-squared"), new Text(Double.toString(d)));
+ context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props));
}
public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Sun Dec 8 18:16:10 2013
@@ -167,7 +167,7 @@ public class ClusterClassificationDriver
SequenceFile.Reader classifiedVectors = new SequenceFile.Reader(fs,
partFile.getPath(), conf);
Writable clusterIdAsKey = new IntWritable();
- WeightedVectorWritable point = new WeightedVectorWritable();
+ WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
while (classifiedVectors.next(clusterIdAsKey, point)) {
collectVector(clusterIdAsKey.toString(), point.getVector());
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Sun Dec 8 18:16:10 2013
@@ -29,7 +29,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.MahoutTestCase;
@@ -106,30 +106,30 @@ public final class TestKmeansClustering
public void testRunKMeansIterationConvergesInOneRunWithGivenDistanceThreshold() {
double[][] rawPoints = { {0, 0}, {0, 0.25}, {0, 0.75}, {0, 1}};
List<Vector> points = getPoints(rawPoints);
-
+
ManhattanDistanceMeasure distanceMeasure = new ManhattanDistanceMeasure();
List<Kluster> clusters = Arrays.asList(new Kluster(points.get(0), 0, distanceMeasure), new Kluster(points.get(3),
3, distanceMeasure));
-
+
// To converge in a single run, the given distance threshold should be
// greater than or equal to 0.125,
// since 0.125 will be the distance between center and centroid for the
// initial two clusters after one run.
double distanceThreshold = 0.25;
-
+
boolean converged = KMeansClusterer.runKMeansIteration(points, clusters, distanceMeasure, distanceThreshold);
-
+
Vector cluster1Center = clusters.get(0).getCenter();
assertEquals(0, cluster1Center.get(0), EPSILON);
assertEquals(0.125, cluster1Center.get(1), EPSILON);
-
+
Vector cluster2Center = clusters.get(1).getCenter();
assertEquals(0, cluster2Center.get(0), EPSILON);
assertEquals(0.875, cluster2Center.get(1), EPSILON);
-
+
assertTrue("KMeans iteration should be converged after a single run", converged);
}*/
-
+
/** Story: User wishes to run kmeans job on reference data */
@Test
public void testKMeansSeqJob() throws Exception {
@@ -161,8 +161,6 @@ public final class TestKmeansClustering
}
// now run the Job
Path outputPath = getTestTempDirPath("output");
- // KMeansDriver.runJob(pointsPath, clustersPath, outputPath,
- // EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1, true);
String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
@@ -176,9 +174,9 @@ public final class TestKmeansClustering
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
int[] expect = EXPECTED_NUM_POINTS[k];
- DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
// The key is the clusterId, the value is the weighted vector
- for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+ for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
new Path(clusteredPointsPath, "part-m-0"), conf)) {
collector.collect(record.getFirst(), record.getSecond());
}
@@ -217,8 +215,6 @@ public final class TestKmeansClustering
}
// now run the Job
Path outputPath = getTestTempDirPath("output");
- // KMeansDriver.runJob(pointsPath, clustersPath, outputPath,
- // EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1, true);
String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
@@ -232,9 +228,9 @@ public final class TestKmeansClustering
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
int[] expect = EXPECTED_NUM_POINTS[k];
- DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
// The key is the clusterId, the value is the weighted vector
- for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+ for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
new Path(clusteredPointsPath, "part-m-0"), conf)) {
collector.collect(record.getFirst(), record.getSecond());
}
@@ -274,8 +270,6 @@ public final class TestKmeansClustering
}
// now run the Job
Path outputPath = getTestTempDirPath("output");
- // KMeansDriver.runJob(pointsPath, clustersPath, outputPath,
- // EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1, true);
String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
@@ -289,9 +283,9 @@ public final class TestKmeansClustering
Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
// assertEquals("output dir files?", 4, outFiles.length);
int[] expect = EXPECTED_NUM_POINTS[k];
- DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
// The key is the clusterId, the value is the weighted vector
- for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+ for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
new Path(clusteredPointsPath, "part-m-00000"), conf)) {
collector.collect(record.getFirst(), record.getSecond());
}
@@ -360,35 +354,31 @@ public final class TestKmeansClustering
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");
- DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedPropertyVectorWritable>();
// The key is the clusterId, the value is the weighted vector
- for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+ for (Pair<IntWritable,WeightedPropertyVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedPropertyVectorWritable>(
new Path(clusteredPointsPath, "part-m-00000"), conf)) {
collector.collect(record.getFirst(), record.getSecond());
}
- //boolean gotLowClust = false; // clusters should be [1, *] and [2, *]
- //boolean gotHighClust = false; // vs [3 , *], [4 , *] and [5, *]
for (IntWritable k : collector.getKeys()) {
- List<WeightedVectorWritable> wvList = collector.getValue(k);
- assertTrue("empty cluster!", !wvList.isEmpty());
- if (wvList.get(0).getVector().get(0) <= 2.0) {
- for (WeightedVectorWritable wv : wvList) {
+ List<WeightedPropertyVectorWritable> wpvList = collector.getValue(k);
+ assertTrue("empty cluster!", !wpvList.isEmpty());
+ if (wpvList.get(0).getVector().get(0) <= 2.0) {
+ for (WeightedPropertyVectorWritable wv : wpvList) {
Vector v = wv.getVector();
int idx = v.maxValueIndex();
assertTrue("bad cluster!", v.get(idx) <= 2.0);
}
- assertEquals("Wrong size cluster", 4, wvList.size());
- //gotLowClust= true;
+ assertEquals("Wrong size cluster", 4, wpvList.size());
} else {
- for (WeightedVectorWritable wv : wvList) {
+ for (WeightedPropertyVectorWritable wv : wpvList) {
Vector v = wv.getVector();
int idx = v.minValueIndex();
assertTrue("bad cluster!", v.get(idx) > 2.0);
}
- assertEquals("Wrong size cluster", 5, wvList.size());
- //gotHighClust= true;
+ assertEquals("Wrong size cluster", 5, wpvList.size());
}
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/AbstractClusterWriter.java Sun Dec 8 18:16:10 2013
@@ -27,7 +27,7 @@ import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -45,7 +45,7 @@ public abstract class AbstractClusterWri
private static final Logger log = LoggerFactory.getLogger(AbstractClusterWriter.class);
protected final Writer writer;
- protected final Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints;
+ protected final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
protected final DistanceMeasure measure;
/**
@@ -56,7 +56,7 @@ public abstract class AbstractClusterWri
* @param measure The {@link org.apache.mahout.common.distance.DistanceMeasure} used to calculate the distance.
* Some writers may wish to use it for calculating weights for display. May be null.
*/
- protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+ protected AbstractClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
DistanceMeasure measure) {
this.writer = writer;
this.clusterIdToPoints = clusterIdToPoints;
@@ -67,7 +67,7 @@ public abstract class AbstractClusterWri
return writer;
}
- protected Map<Integer, List<WeightedVectorWritable>> getClusterIdToPoints() {
+ protected Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
return clusterIdToPoints;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/CSVClusterWriter.java Sun Dec 8 18:16:10 2013
@@ -18,7 +18,7 @@
package org.apache.mahout.utils.clustering;
import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.NamedVector;
@@ -39,7 +39,7 @@ public class CSVClusterWriter extends Ab
private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
- public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+ public CSVClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
DistanceMeasure measure) {
super(writer, clusterIdToPoints, measure);
}
@@ -49,9 +49,9 @@ public class CSVClusterWriter extends Ab
StringBuilder line = new StringBuilder();
Cluster cluster = clusterWritable.getValue();
line.append(cluster.getId());
- List<WeightedVectorWritable> points = getClusterIdToPoints().get(cluster.getId());
+ List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId());
if (points != null) {
- for (WeightedVectorWritable point : points) {
+ for (WeightedPropertyVectorWritable point : points) {
Vector theVec = point.getVector();
line.append(',');
if (theVec instanceof NamedVector) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sun Dec 8 18:16:10 2013
@@ -31,7 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.iterator.ClusterWritable;
@@ -83,7 +83,7 @@ public final class ClusterDumper extends
private String dictionaryFormat;
private int subString = Integer.MAX_VALUE;
private int numTopFeatures = 10;
- private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints;
+ private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;
private boolean runEvaluation;
@@ -275,7 +275,7 @@ public final class ClusterDumper extends
this.subString = subString;
}
- public Map<Integer, List<WeightedVectorWritable>> getClusterIdToPoints() {
+ public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
return clusterIdToPoints;
}
@@ -304,18 +304,17 @@ public final class ClusterDumper extends
this.maxPointsPerCluster = maxPointsPerCluster;
}
- public static Map<Integer, List<WeightedVectorWritable>> readPoints(Path pointsPathDir, long maxPointsPerCluster,
+ public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir, long maxPointsPerCluster,
Configuration conf) {
- Map<Integer, List<WeightedVectorWritable>> result = Maps.newTreeMap();
- for (Pair<IntWritable, WeightedVectorWritable> record
- : new SequenceFileDirIterable<IntWritable, WeightedVectorWritable>(pointsPathDir, PathType.LIST,
+ Map<Integer, List<WeightedPropertyVectorWritable>> result = Maps.newTreeMap();
+ for (Pair<IntWritable, WeightedPropertyVectorWritable> record
+ : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(pointsPathDir, PathType.LIST,
PathFilters.logsCRCFilter(), conf)) {
// value is the cluster id as an int, key is the name/id of the
- // vector, but that doesn't matter because we only care about printing
- // it
+ // vector, but that doesn't matter because we only care about printing it
//String clusterId = value.toString();
int keyValue = record.getFirst().get();
- List<WeightedVectorWritable> pointList = result.get(keyValue);
+ List<WeightedPropertyVectorWritable> pointList = result.get(keyValue);
if (pointList == null) {
pointList = Lists.newArrayList();
result.put(keyValue, pointList);
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumperWriter.java Sun Dec 8 18:16:10 2013
@@ -21,7 +21,6 @@ import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -40,7 +39,7 @@ public class ClusterDumperWriter extends
private final String[] dictionary;
private final int numTopFeatures;
- public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedVectorWritable>> clusterIdToPoints,
+ public ClusterDumperWriter(Writer writer, Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints,
DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString) {
super(writer, clusterIdToPoints, measure);
this.numTopFeatures = numTopFeatures;
@@ -69,27 +68,24 @@ public class ClusterDumperWriter extends
writer.write('\n');
}
- Map<Integer,List<WeightedVectorWritable>> clusterIdToPoints = getClusterIdToPoints();
- List<WeightedVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId());
+ Map<Integer,List<WeightedPropertyVectorWritable>> clusterIdToPoints = getClusterIdToPoints();
+ List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(clusterWritable.getValue().getId());
if (points != null) {
writer.write("\tWeight : [props - optional]: Point:\n\t");
- for (Iterator<WeightedVectorWritable> iterator = points.iterator(); iterator.hasNext();) {
- WeightedVectorWritable point = iterator.next();
+ for (Iterator<WeightedPropertyVectorWritable> iterator = points.iterator(); iterator.hasNext();) {
+ WeightedPropertyVectorWritable point = iterator.next();
writer.write(String.valueOf(point.getWeight()));
- if (point instanceof WeightedPropertyVectorWritable) {
- WeightedPropertyVectorWritable tmp = (WeightedPropertyVectorWritable) point;
- Map<Text,Text> map = tmp.getProperties();
- // map can be null since empty maps when written are returned as null
- writer.write(" : [");
- if (map != null) {
- for (Map.Entry<Text,Text> entry : map.entrySet()) {
- writer.write(entry.getKey().toString());
- writer.write("=");
- writer.write(entry.getValue().toString());
- }
+ Map<Text,Text> map = point.getProperties();
+ // map can be null since empty maps when written are returned as null
+ writer.write(" : [");
+ if (map != null) {
+ for (Map.Entry<Text,Text> entry : map.entrySet()) {
+ writer.write(entry.getKey().toString());
+ writer.write("=");
+ writer.write(entry.getValue().toString());
}
- writer.write("]");
}
+ writer.write("]");
writer.write(": ");
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java Sun Dec 8 18:16:10 2013
@@ -26,6 +26,7 @@ import java.util.Random;
import java.util.regex.Pattern;
import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.RandomUtils;
@@ -50,7 +51,7 @@ public class GraphMLClusterWriter extend
private final int numTopFeatures;
private final int subString;
- public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+ public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
DistanceMeasure measure, int numTopFeatures, String[] dictionary, int subString)
throws IOException {
super(writer, clusterIdToPoints, measure);
@@ -115,7 +116,7 @@ public class GraphMLClusterWriter extend
}
line.append(createNode(clusterLabel, rgb, x, y));
- List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
+ List<WeightedPropertyVectorWritable> points = clusterIdToPoints.get(cluster.getId());
if (points != null) {
for (WeightedVectorWritable point : points) {
Vector theVec = point.getVector();
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/JsonClusterWriter.java Sun Dec 8 18:16:10 2013
@@ -27,7 +27,7 @@ import java.util.regex.Pattern;
import com.google.common.collect.Maps;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.NamedVector;
@@ -53,7 +53,7 @@ public class JsonClusterWriter extends A
private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
public JsonClusterWriter(Writer writer,
- Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints,
+ Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
DistanceMeasure measure, int numTopFeatures, String[] dictionary) {
super(writer, clusterIdToPoints, measure);
this.numTopFeatures = numTopFeatures;
@@ -136,11 +136,11 @@ public class JsonClusterWriter extends A
*/
public List<Object> getPoints(Cluster cluster, String[] dictionary) {
List<Object> vectorObjs = Lists.newLinkedList();
- List<WeightedVectorWritable> points = getClusterIdToPoints().get(
+ List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(
cluster.getId());
if (points != null) {
- for (WeightedVectorWritable point : points) {
+ for (WeightedPropertyVectorWritable point : points) {
Map<String, Object> entry = Maps.newHashMap();
Vector theVec = point.getVector();
if (theVec instanceof NamedVector) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1549087&r1=1549086&r2=1549087&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Sun Dec 8 18:16:10 2013
@@ -56,7 +56,7 @@ import org.apache.lucene.store.FSDirecto
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.NamedVector;
@@ -86,7 +86,7 @@ public class ClusterLabels {
private final String indexDir;
private final String contentField;
private String idField;
- private final Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints;
+ private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
private String output;
private final int minNumIds;
private final int maxLabels;
@@ -114,15 +114,15 @@ public class ClusterLabels {
writer = Files.newWriter(new File(this.output), Charsets.UTF_8);
}
try {
- for (Map.Entry<Integer, List<WeightedVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
- List<WeightedVectorWritable> wvws = integerListEntry.getValue();
- List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wvws);
+ for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) {
+ List<WeightedPropertyVectorWritable> wpvws = integerListEntry.getValue();
+ List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wpvws);
if (termInfos != null) {
writer.write('\n');
writer.write("Top labels for Cluster ");
writer.write(String.valueOf(integerListEntry.getKey()));
writer.write(" containing ");
- writer.write(String.valueOf(wvws.size()));
+ writer.write(String.valueOf(wpvws.size()));
writer.write(" vectors");
writer.write('\n');
writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
@@ -148,14 +148,14 @@ public class ClusterLabels {
* Get the list of labels, sorted by best score.
*/
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
- Collection<WeightedVectorWritable> wvws) throws IOException {
+ Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
- if (wvws.size() < minNumIds) {
- log.info("Skipping small cluster {} with size: {}", integer, wvws.size());
+ if (wpvws.size() < minNumIds) {
+ log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
return null;
}
- log.info("Processing Cluster {} with {} documents", integer, wvws.size());
+ log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
Directory dir = FSDirectory.open(new File(this.indexDir));
IndexReader reader = DirectoryReader.open(dir);
@@ -163,8 +163,8 @@ public class ClusterLabels {
log.info("# of documents in the index {}", reader.numDocs());
Collection<String> idSet = Sets.newHashSet();
- for (WeightedVectorWritable wvw : wvws) {
- Vector vector = wvw.getVector();
+ for (WeightedPropertyVectorWritable wpvw : wpvws) {
+ Vector vector = wpvw.getVector();
if (vector instanceof NamedVector) {
idSet.add(((NamedVector) vector).getName());
}
@@ -216,7 +216,7 @@ public class ClusterLabels {
List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();
- int clusterSize = wvws.size();
+ int clusterSize = wpvws.size();
for (TermEntry termEntry : termEntryMap.values()) {