You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hama.apache.org by ed...@apache.org on 2013/12/08 05:55:38 UTC

svn commit: r1548995 - in /hama/trunk: ./ commons/src/main/java/org/apache/hama/commons/io/ commons/src/main/java/org/apache/hama/commons/math/ examples/src/main/java/org/apache/hama/examples/ ml/src/main/java/org/apache/hama/ml/kmeans/ ml/src/test/jav...

Author: edwardyoon
Date: Sun Dec  8 04:55:37 2013
New Revision: 1548995

URL: http://svn.apache.org/r1548995
Log:
HAMA-827: Add NamedVector

Added:
    hama/trunk/commons/src/main/java/org/apache/hama/commons/math/NamedDoubleVector.java
Modified:
    hama/trunk/CHANGES.txt
    hama/trunk/commons/src/main/java/org/apache/hama/commons/io/VectorWritable.java
    hama/trunk/commons/src/main/java/org/apache/hama/commons/math/DenseDoubleVector.java
    hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java
    hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java
    hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java

Modified: hama/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hama/trunk/CHANGES.txt?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/CHANGES.txt (original)
+++ hama/trunk/CHANGES.txt Sun Dec  8 04:55:37 2013
@@ -4,6 +4,7 @@ Release 0.7.0 (unreleased changes)
 
   NEW FEATURES
 
+   HAMA-827: Add NamedVector (edwardyoon)
    HAMA-822: Add feature transformer interface to improve the power and flexibility of existing machine learning model (Yexi Jiang)
    HAMA-774: CompositeInputFormat in Hama (Martin Illecker)
    HAMA-815: Hama Pipes uses C++ templates (Martin Illecker)  

Modified: hama/trunk/commons/src/main/java/org/apache/hama/commons/io/VectorWritable.java
URL: http://svn.apache.org/viewvc/hama/trunk/commons/src/main/java/org/apache/hama/commons/io/VectorWritable.java?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/commons/src/main/java/org/apache/hama/commons/io/VectorWritable.java (original)
+++ hama/trunk/commons/src/main/java/org/apache/hama/commons/io/VectorWritable.java Sun Dec  8 04:55:37 2013
@@ -24,6 +24,7 @@ import java.io.IOException;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hama.commons.math.DenseDoubleVector;
 import org.apache.hama.commons.math.DoubleVector;
+import org.apache.hama.commons.math.NamedDoubleVector;
 
 /**
  * Writable for dense vectors.
@@ -102,6 +103,13 @@ public class VectorWritable implements W
     for (int i = 0; i < vector.getDimension(); i++) {
       out.writeDouble(vector.get(i));
     }
+
+    if (vector.isNamed() && vector.getName() != null) {
+      out.writeBoolean(true);
+      out.writeUTF(vector.getName());
+    } else {
+      out.writeBoolean(false);
+    }
   }
 
   public static DoubleVector readVector(DataInput in) throws IOException {
@@ -111,6 +119,10 @@ public class VectorWritable implements W
     for (int i = 0; i < length; i++) {
       vector.set(i, in.readDouble());
     }
+
+    if (in.readBoolean()) {
+      vector = new NamedDoubleVector(in.readUTF(), vector);
+    }
     return vector;
   }
 

Modified: hama/trunk/commons/src/main/java/org/apache/hama/commons/math/DenseDoubleVector.java
URL: http://svn.apache.org/viewvc/hama/trunk/commons/src/main/java/org/apache/hama/commons/math/DenseDoubleVector.java?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/commons/src/main/java/org/apache/hama/commons/math/DenseDoubleVector.java (original)
+++ hama/trunk/commons/src/main/java/org/apache/hama/commons/math/DenseDoubleVector.java Sun Dec  8 04:55:37 2013
@@ -181,10 +181,6 @@ public final class DenseDoubleVector imp
     return newv;
   }
 
-  /*
-   * (non-Javadoc)
-   * @see de.jungblut.math.DoubleVector#subtract(de.jungblut.math.DoubleVector)
-   */
   @Override
   public final DoubleVector subtractUnsafe(DoubleVector v) {
     DoubleVector newv = new DenseDoubleVector(v.getLength());
@@ -194,10 +190,6 @@ public final class DenseDoubleVector imp
     return newv;
   }
 
-  /*
-   * (non-Javadoc)
-   * @see de.jungblut.math.DoubleVector#subtract(double)
-   */
   @Override
   public final DoubleVector subtract(double v) {
     DenseDoubleVector newv = new DenseDoubleVector(vector.length);

Added: hama/trunk/commons/src/main/java/org/apache/hama/commons/math/NamedDoubleVector.java
URL: http://svn.apache.org/viewvc/hama/trunk/commons/src/main/java/org/apache/hama/commons/math/NamedDoubleVector.java?rev=1548995&view=auto
==============================================================================
--- hama/trunk/commons/src/main/java/org/apache/hama/commons/math/NamedDoubleVector.java (added)
+++ hama/trunk/commons/src/main/java/org/apache/hama/commons/math/NamedDoubleVector.java Sun Dec  8 04:55:37 2013
@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hama.commons.math;
+
+import java.util.Iterator;
+
+public final class NamedDoubleVector implements DoubleVector {
+  
+  private final String name;
+  private final DoubleVector vector;
+  
+  public NamedDoubleVector(String name, DoubleVector deepCopy) {
+    super();
+    this.name = name;
+    this.vector = deepCopy;
+  }
+
+  @Override
+  public double get(int index) {
+    return vector.get(index);
+  }
+
+  @Override
+  public int getLength() {
+    return vector.getLength();
+  }
+
+  @Override
+  public int getDimension() {
+    return vector.getDimension();
+  }
+
+  @Override
+  public void set(int index, double value) {
+    vector.set(index, value);
+  }
+
+  @Override
+  @Deprecated
+  public DoubleVector apply(DoubleVectorFunction func) {
+    return vector.apply(func);
+  }
+
+  @Override
+  @Deprecated
+  public DoubleVector apply(DoubleVector other, DoubleDoubleVectorFunction func) {
+    return vector.apply(other, func);
+  }
+
+  @Override
+  public DoubleVector applyToElements(DoubleFunction func) {
+    return vector.applyToElements(func);
+  }
+
+  @Override
+  public DoubleVector applyToElements(DoubleVector other,
+      DoubleDoubleFunction func) {
+    return vector.applyToElements(other, func);
+  }
+
+  @Override
+  public DoubleVector addUnsafe(DoubleVector vector2) {
+    return vector.addUnsafe(vector2);
+  }
+
+  @Override
+  public DoubleVector add(DoubleVector vector2) {
+    return vector.add(vector2);
+  }
+
+  @Override
+  public DoubleVector add(double scalar) {
+    return vector.add(scalar);
+  }
+
+  @Override
+  public DoubleVector subtractUnsafe(DoubleVector vector2) {
+    return vector.subtractUnsafe(vector2);
+  }
+
+  @Override
+  public DoubleVector subtract(DoubleVector vector2) {
+    return vector.subtract(vector2);
+  }
+
+  @Override
+  public DoubleVector subtract(double scalar) {
+    return vector.subtract(scalar);
+  }
+
+  @Override
+  public DoubleVector subtractFrom(double scalar) {
+    return vector.subtractFrom(scalar);
+  }
+
+  @Override
+  public DoubleVector multiply(double scalar) {
+    return vector.multiply(scalar);
+  }
+
+  @Override
+  public DoubleVector multiplyUnsafe(DoubleVector vector2) {
+    return vector.multiplyUnsafe(vector2);
+  }
+
+  @Override
+  public DoubleVector multiply(DoubleVector vector2) {
+    return vector.multiply(vector2);
+  }
+
+  @Override
+  public DoubleVector multiply(DoubleMatrix matrix) {
+    return vector.multiply(matrix);
+  }
+
+  @Override
+  public DoubleVector multiplyUnsafe(DoubleMatrix matrix) {
+    return vector.multiplyUnsafe(matrix);
+  }
+
+  @Override
+  public DoubleVector divide(double scalar) {
+    return vector.divide(scalar);
+  }
+
+  @Override
+  public DoubleVector divideFrom(double scalar) {
+    return vector.divideFrom(scalar);
+  }
+
+  @Override
+  public DoubleVector pow(int x) {
+    return vector.pow(x);
+  }
+
+  @Override
+  public DoubleVector abs() {
+    return vector.abs();
+  }
+
+  @Override
+  public DoubleVector sqrt() {
+    return vector.sqrt();
+  }
+
+  @Override
+  public double sum() {
+    return vector.sum();
+  }
+
+  @Override
+  public double dotUnsafe(DoubleVector vector2) {
+    return vector.dotUnsafe(vector2);
+  }
+
+  @Override
+  public double dot(DoubleVector vector2) {
+    return vector.dot(vector2);
+  }
+
+  @Override
+  public DoubleVector slice(int length) {
+    return vector.slice(length);
+  }
+
+  @Override
+  public DoubleVector sliceUnsafe(int length) {
+    return vector.sliceUnsafe(length);
+  }
+
+  @Override
+  public DoubleVector slice(int start, int end) {
+    return vector.slice(start, end);
+  }
+
+  @Override
+  public DoubleVector sliceUnsafe(int start, int end) {
+    return vector.sliceUnsafe(start, end);
+  }
+
+  @Override
+  public double max() {
+    return vector.max();
+  }
+
+  @Override
+  public double min() {
+    return vector.min();
+  }
+
+  @Override
+  public double[] toArray() {
+    return vector.toArray();
+  }
+
+  @Override
+  public DoubleVector deepCopy() {
+    return new NamedDoubleVector(name, vector.deepCopy());
+  }
+
+  @Override
+  public Iterator<DoubleVectorElement> iterateNonZero() {
+    return vector.iterateNonZero();
+  }
+
+  @Override
+  public Iterator<DoubleVectorElement> iterate() {
+    return vector.iterate();
+  }
+
+  @Override
+  public boolean isSparse() {
+    return vector.isSparse();
+  }
+
+  @Override
+  public boolean isNamed() {
+    return true;
+  }
+
+  @Override
+  public String getName() {
+    return name;
+  }
+  
+  public String toString() {
+    return name + ": " + vector.toString();
+  }
+
+}

Modified: hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java
URL: http://svn.apache.org/viewvc/hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java (original)
+++ hama/trunk/examples/src/main/java/org/apache/hama/examples/Kmeans.java Sun Dec  8 04:55:37 2013
@@ -86,7 +86,8 @@ public class Kmeans {
       // prepare the input, like deleting old versions and creating centers
       KMeansBSP.prepareInput(count, k, dimension, conf, in, center, out, fs);
     } else {
-      KMeansBSP.prepareInputText(k, conf, in, center, out, fs);
+      // Pass true as the last argument when the first column of each input line is the vector's key (name)
+      KMeansBSP.prepareInputText(k, conf, in, center, out, fs, true);
       in = new Path(in.getParent(), "textinput/in.seq");
     }
 
@@ -95,10 +96,9 @@ public class Kmeans {
     // just submit the job
     job.waitForCompletion(true);
 
-    List<String> results = KMeansBSP.readOutput(conf, out, fs, 5);
+    List<String> results = KMeansBSP.readOutput(conf, out, fs, 10);
     for (String line : results) {
       System.out.println(line);
     }
-    System.out.println("...");
   }
 }

Modified: hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java
URL: http://svn.apache.org/viewvc/hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java (original)
+++ hama/trunk/ml/src/main/java/org/apache/hama/ml/kmeans/KMeansBSP.java Sun Dec  8 04:55:37 2013
@@ -44,6 +44,7 @@ import org.apache.hama.bsp.sync.SyncExce
 import org.apache.hama.commons.io.VectorWritable;
 import org.apache.hama.commons.math.DenseDoubleVector;
 import org.apache.hama.commons.math.DoubleVector;
+import org.apache.hama.commons.math.NamedDoubleVector;
 import org.apache.hama.ml.distance.DistanceMeasurer;
 import org.apache.hama.ml.distance.EuclidianDistance;
 import org.apache.hama.util.ReflectionUtils;
@@ -80,7 +81,6 @@ public final class KMeansBSP
   public final void setup(
       BSPPeer<VectorWritable, NullWritable, IntWritable, VectorWritable, CenterMessage> peer)
       throws IOException, InterruptedException {
-
     conf = peer.getConfiguration();
 
     Path centroids = new Path(peer.getConfiguration().get(CENTER_IN_PATH));
@@ -195,6 +195,7 @@ public final class KMeansBSP
     // needs to be broadcasted.
     final DoubleVector[] newCenterArray = new DoubleVector[centers.length];
     final int[] summationCount = new int[centers.length];
+
     // if our cache is not enabled, iterate over the disk items
     if (cache == null) {
       // we have an assignment step
@@ -222,6 +223,7 @@ public final class KMeansBSP
         }
       }
     }
+
     // now send messages about the local updates to each other peer
     for (int i = 0; i < newCenterArray.length; i++) {
       if (newCenterArray[i] != null) {
@@ -237,6 +239,7 @@ public final class KMeansBSP
       final int[] summationCount, final DoubleVector key) {
     final int lowestDistantCenter = getNearestCenter(key);
     final DoubleVector clusterCenter = newCenterArray[lowestDistantCenter];
+
     if (clusterCenter == null) {
       newCenterArray[lowestDistantCenter] = key;
     } else {
@@ -250,6 +253,7 @@ public final class KMeansBSP
   private int getNearestCenter(DoubleVector key) {
     int lowestDistantCenter = 0;
     double lowestDistance = Double.MAX_VALUE;
+
     for (int i = 0; i < centers.length; i++) {
       final double estimatedDistance = distanceMeasurer.measureDistance(
           centers[i], key);
@@ -419,9 +423,19 @@ public final class KMeansBSP
 
   /**
    * Reads input text files and writes it to a sequencefile.
+   * 
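+   * @param k the number of cluster centers to create
+   * @param conf the job configuration
+   * @param txtIn the text input (a single file or a directory) with one tab-separated vector per line
+   * @param center the path of the initial cluster centers
+   * @param out the job output path; it is deleted first if it already exists
+   * @param fs the file system used to access the given paths
+   * @param hasKey true if the first column of each line is the vector's name (key) rather than a value
+   * @return the path of the generated sequence file
+   * @throws IOException if reading the text input or writing the sequence files fails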
    */
   public static Path prepareInputText(int k, Configuration conf, Path txtIn,
-      Path center, Path out, FileSystem fs) throws IOException {
+      Path center, Path out, FileSystem fs, boolean hasKey) throws IOException {
 
     Path in;
     if (fs.isFile(txtIn)) {
@@ -429,7 +443,7 @@ public final class KMeansBSP
     } else {
       in = new Path(txtIn, "textinput/in.seq");
     }
-    
+
     if (fs.exists(out))
       fs.delete(out, true);
 
@@ -454,11 +468,26 @@ public final class KMeansBSP
     String line;
     while ((line = br.readLine()) != null) {
       String[] split = line.split("\t");
-      DenseDoubleVector vec = new DenseDoubleVector(split.length);
-      for (int j = 0; j < split.length; j++) {
-        vec.set(j, Double.parseDouble(split[j]));
+      int columnLength = split.length;
+      int indexPos = 0;
+      if (hasKey) {
+        columnLength = columnLength - 1;
+        indexPos++;
+      }
+
+      DenseDoubleVector vec = new DenseDoubleVector(columnLength);
+      for (int j = 0; j < columnLength; j++) {
+        vec.set(j, Double.parseDouble(split[j + indexPos]));
+      }
+
+      VectorWritable vector;
+      if (hasKey) {
+        NamedDoubleVector named = new NamedDoubleVector(split[0], vec);
+        vector = new VectorWritable(named);
+      } else {
+        vector = new VectorWritable(vec);
       }
-      VectorWritable vector = new VectorWritable(vec);
+
       dataWriter.append(vector, value);
       if (k > i) {
         assert centerWriter != null;

Modified: hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java
URL: http://svn.apache.org/viewvc/hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java?rev=1548995&r1=1548994&r2=1548995&view=diff
==============================================================================
--- hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java (original)
+++ hama/trunk/ml/src/test/java/org/apache/hama/ml/kmeans/TestKMeansBSP.java Sun Dec  8 04:55:37 2013
@@ -63,7 +63,7 @@ public class TestKMeansBSP extends TestC
       bw.write(sb.toString());
       bw.close();
 
-      in = KMeansBSP.prepareInputText(k, conf, in, center, out, fs);
+      in = KMeansBSP.prepareInputText(k, conf, in, center, out, fs, false);
 
       BSPJob job = KMeansBSP.createJob(conf, in, out, true);