Posted to commits@mahout.apache.org by dl...@apache.org on 2015/06/11 02:09:08 UTC
[1/4] mahout git commit: MAHOUT-1660 MAHOUT-1713 MAHOUT-1714
MAHOUT-1715 MAHOUT-1716 MAHOUT-1717 MAHOUT-1718 MAHOUT-1719 MAHOUT-1720
MAHOUT-1721 MAHOUT-1722 MAHOUT-1723 MAHOUT-1724 MAHOUT-1725 MAHOUT-1726
MAHOUT-1727 MAHOUT-1728 MAHOUT-1729 MAHOUT-1730 M
Repository: mahout
Updated Branches:
refs/heads/mahout-0.10.x e6d24b90a -> 8a6b805a3
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
index e5a2b2a..41efc27 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext._
/** ==Spark-specific optimizer-checkpointed DRM.==
*
- * @param rdd underlying rdd to wrap over.
+ * @param rddInput underlying rdd to wrap over.
* @param _nrow number of rows; if unspecified, we will compute with an inexpensive traversal.
* @param _ncol number of columns; if unspecified, we will try to guess with an inexpensive traversal.
* @param _cacheStorageLevel storage level
@@ -44,9 +44,9 @@ import org.apache.spark.SparkContext._
* @tparam K matrix key type (e.g. the keys of sequence files once persisted)
*/
class CheckpointedDrmSpark[K: ClassTag](
- val rdd: DrmRdd[K],
- private var _nrow: Long = -1L,
- private var _ncol: Int = -1,
+ private[sparkbindings] val rddInput: DrmRddInput[K],
+ private[sparkbindings] var _nrow: Long = -1L,
+ private[sparkbindings] var _ncol: Int = -1,
private val _cacheStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
override protected[mahout] val partitioningTag: Long = Random.nextLong(),
private var _canHaveMissingRows: Boolean = false
@@ -63,7 +63,7 @@ class CheckpointedDrmSpark[K: ClassTag](
private[mahout] var intFixExtra: Long = 0L
private var cached: Boolean = false
- override val context: DistributedContext = rdd.context
+ override val context: DistributedContext = rddInput.backingRdd.context
/** Explicit extraction of key class Tag */
def keyClassTag: ClassTag[K] = implicitly[ClassTag[K]]
@@ -78,8 +78,8 @@ class CheckpointedDrmSpark[K: ClassTag](
}
def cache() = {
- if (!cached) {
- rdd.persist(_cacheStorageLevel)
+ if (!cached && _cacheStorageLevel != StorageLevel.NONE) {
+ rddInput.backingRdd.persist(_cacheStorageLevel)
cached = true
}
this
@@ -92,7 +92,7 @@ class CheckpointedDrmSpark[K: ClassTag](
*/
def uncache(): this.type = {
if (cached) {
- rdd.unpersist(blocking = false)
+ rddInput.backingRdd.unpersist(blocking = false)
cached = false
}
this
@@ -115,7 +115,7 @@ class CheckpointedDrmSpark[K: ClassTag](
*/
def collect: Matrix = {
- val intRowIndices = implicitly[ClassTag[K]] == implicitly[ClassTag[Int]]
+ val intRowIndices = classTag[K] == ClassTag.Int
val cols = ncol
val rows = safeToNonNegInt(nrow)
@@ -124,7 +124,7 @@ class CheckpointedDrmSpark[K: ClassTag](
// since currently spark #collect() requires Serializable support,
// we serialize DRM vectors into byte arrays on backend and restore Vector
// instances on the front end:
- val data = rdd.map(t => (t._1, t._2)).collect()
+ val data = rddInput.toDrmRdd().map(t => (t._1, t._2)).collect()
val m = if (data.forall(_._2.isDense))
@@ -165,7 +165,7 @@ class CheckpointedDrmSpark[K: ClassTag](
else if (classOf[Writable].isAssignableFrom(ktag.runtimeClass)) (x: K) => x.asInstanceOf[Writable]
else throw new IllegalArgumentException("Do not know how to convert class tag %s to Writable.".format(ktag))
- rdd.saveAsSequenceFile(path)
+ rddInput.toDrmRdd().saveAsSequenceFile(path)
}
protected def computeNRow = {
@@ -173,7 +173,7 @@ class CheckpointedDrmSpark[K: ClassTag](
val intRowIndex = classTag[K] == classTag[Int]
if (intRowIndex) {
- val rdd = cache().rdd.asInstanceOf[DrmRdd[Int]]
+ val rdd = cache().rddInput.toDrmRdd().asInstanceOf[DrmRdd[Int]]
// This seems a suitable place to run the int-key consistency test, because we know
// that nrow can be computed lazily, which always happens when the rdd is already available, cached,
@@ -186,16 +186,21 @@ class CheckpointedDrmSpark[K: ClassTag](
intFixExtra = (maxPlus1 - rowCount) max 0L
maxPlus1
} else
- cache().rdd.count()
+ cache().rddInput.toDrmRdd().count()
}
- protected def computeNCol =
- cache().rdd.map(_._2.length).fold(-1)(max(_, _))
+ protected def computeNCol = {
+ rddInput.isBlockified match {
+ case true ⇒ rddInput.toBlockifiedDrmRdd(throw new AssertionError("not reached"))
+ .map(_._2.ncol).reduce(max(_, _))
+ case false ⇒ cache().rddInput.toDrmRdd().map(_._2.length).fold(-1)(max(_, _))
+ }
+ }
protected def computeNNonZero =
- cache().rdd.map(_._2.getNumNonZeroElements.toLong).sum().toLong
+ cache().rddInput.toDrmRdd().map(_._2.getNumNonZeroElements.toLong).sum().toLong
/** Changes the number of rows in the DRM without actually touching the underlying data. Used to
* redimension a DRM after it has been created, which implies some blank, non-existent rows.
@@ -205,8 +210,8 @@ class CheckpointedDrmSpark[K: ClassTag](
override def newRowCardinality(n: Int): CheckpointedDrm[K] = {
assert(n > -1)
assert( n >= nrow)
- val newCheckpointedDrm = drmWrap[K](rdd, n, ncol)
- newCheckpointedDrm
+ new CheckpointedDrmSpark(rddInput = rddInput, _nrow = n, _ncol = _ncol, _cacheStorageLevel = _cacheStorageLevel,
+ partitioningTag = partitioningTag, _canHaveMissingRows = _canHaveMissingRows)
}
}
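The reworked cache() above now skips persisting entirely when the storage level is StorageLevel.NONE, and stays idempotent through the `cached` flag. For illustration only, a minimal stand-alone sketch of the same guard pattern over a plain Spark RDD (CachedWrapper is a hypothetical name, not part of this patch):

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

// Persist at most once, and only when a real storage level was requested.
class CachedWrapper[T](val rdd: RDD[T], level: StorageLevel = StorageLevel.MEMORY_ONLY) {
  private var cached = false

  def cache(): this.type = {
    if (!cached && level != StorageLevel.NONE) {
      rdd.persist(level)
      cached = true
    }
    this
  }

  def uncache(): this.type = {
    if (cached) {
      rdd.unpersist(blocking = false)
      cached = false
    }
    this
  }
}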
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
index 7cf6bd6..abcfc64 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
@@ -11,6 +11,6 @@ class CheckpointedDrmSparkOps[K: ClassTag](drm: CheckpointedDrm[K]) {
private[sparkbindings] val sparkDrm = drm.asInstanceOf[CheckpointedDrmSpark[K]]
/** Spark matrix customization exposure */
- def rdd = sparkDrm.rdd
+ def rdd = sparkDrm.rddInput.toDrmRdd()
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
index b72818c..d9dbada 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
@@ -23,22 +23,18 @@ import org.apache.spark.storage.StorageLevel
import org.apache.mahout.sparkbindings._
/** Encapsulates either DrmRdd[K] or BlockifiedDrmRdd[K] */
-class DrmRddInput[K: ClassTag](
- private val rowWiseSrc: Option[( /*ncol*/ Int, /*rdd*/ DrmRdd[K])] = None,
- private val blockifiedSrc: Option[BlockifiedDrmRdd[K]] = None
- ) {
+class DrmRddInput[K: ClassTag](private val input: Either[DrmRdd[K], BlockifiedDrmRdd[K]]) {
- assert(rowWiseSrc.isDefined || blockifiedSrc.isDefined, "Undefined input")
+ private[sparkbindings] lazy val backingRdd = input.left.getOrElse(input.right.get)
- private lazy val backingRdd = rowWiseSrc.map(_._2).getOrElse(blockifiedSrc.get)
+ def isBlockified: Boolean = input.isRight
- def isBlockified:Boolean = blockifiedSrc.isDefined
+ def isRowWise: Boolean = input.isLeft
- def isRowWise:Boolean = rowWiseSrc.isDefined
+ def toDrmRdd(): DrmRdd[K] = input.left.getOrElse(deblockify(rdd = input.right.get))
- def toDrmRdd(): DrmRdd[K] = rowWiseSrc.map(_._2).getOrElse(deblockify(rdd = blockifiedSrc.get))
-
- def toBlockifiedDrmRdd() = blockifiedSrc.getOrElse(blockify(rdd = rowWiseSrc.get._2, blockncol = rowWiseSrc.get._1))
+ /** Use late binding for this. It may or may not be needed, depending on current config. */
+ def toBlockifiedDrmRdd(ncol: ⇒ Int) = input.right.getOrElse(blockify(rdd = input.left.get, blockncol = ncol))
def sparkContext: SparkContext = backingRdd.sparkContext
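The rewrite above replaces the two-Option encoding (and its runtime assert) with Either, so holding exactly one representation is guaranteed by construction, and toBlockifiedDrmRdd now takes ncol by name: the column count is evaluated only if the row-wise-to-blockified conversion actually has to run. computeNCol in CheckpointedDrmSpark exploits this by passing an assertion that must never be reached. A small self-contained sketch of that late-binding trick, with hypothetical Row/Block placeholders standing in for the RDD types:

// Either holds exactly one of two representations; the by-name `ncol`
// is evaluated only when blockification really runs.
class LazyDual[Row, Block](input: Either[Row, Block])(blockify: (Row, Int) => Block) {
  def isBlockified: Boolean = input.isRight
  def toBlock(ncol: => Int): Block =
    input.right.getOrElse(blockify(input.left.get, ncol))
}

val dual = new LazyDual[Seq[Double], Array[Double]](Right(Array(1.0)))((r, n) => r.toArray)
// Safe: the by-name argument is never evaluated because the input is already a Block.
dual.toBlock(throw new AssertionError("not reached"))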
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
index ac36f60..0371f9b 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
@@ -22,4 +22,6 @@ import org.apache.spark.broadcast.Broadcast
class SparkBCast[T](val sbcast: Broadcast[T]) extends BCast[T] with Serializable {
def value: T = sbcast.value
+
+ override def close(): Unit = sbcast.unpersist()
}
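With the new close() override, callers can now release broadcast state deterministically through Mahout's engine-agnostic BCast handle. A hedged usage sketch (bcast is assumed to be any BCast[T], e.g. a SparkBCast obtained via the sb2bc implicit in the sparkbindings package object):

// Assumed: bcast: BCast[T] wrapping a Spark Broadcast.
try {
  val v = bcast.value
  // ... use v inside distributed closures ...
} finally {
  bcast.close() // for SparkBCast, delegates to Broadcast.unpersist()
}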
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
index c04b306..0de5ff8 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
@@ -37,18 +37,19 @@ package object drm {
private[drm] final val log = Logger.getLogger("org.apache.mahout.sparkbindings");
- private[sparkbindings] implicit def input2drmRdd[K](input: DrmRddInput[K]): DrmRdd[K] = input.toDrmRdd()
+ private[sparkbindings] implicit def cpDrm2DrmRddInput[K: ClassTag](cp: CheckpointedDrmSpark[K]): DrmRddInput[K] =
+ cp.rddInput
- private[sparkbindings] implicit def input2blockifiedDrmRdd[K](input: DrmRddInput[K]): BlockifiedDrmRdd[K] = input.toBlockifiedDrmRdd()
+ private[sparkbindings] implicit def cpDrmGeneric2DrmRddInput[K: ClassTag](cp: CheckpointedDrm[K]): DrmRddInput[K] =
+ cp.asInstanceOf[CheckpointedDrmSpark[K]]
+
+ private[sparkbindings] implicit def drmRdd2drmRddInput[K: ClassTag](rdd: DrmRdd[K]) = new DrmRddInput[K](Left(rdd))
+
+ private[sparkbindings] implicit def blockifiedRdd2drmRddInput[K: ClassTag](rdd: BlockifiedDrmRdd[K]) = new
+ DrmRddInput[K](
+ Right(rdd))
- private[sparkbindings] implicit def cpDrm2DrmRddInput[K: ClassTag](cp: CheckpointedDrm[K]): DrmRddInput[K] =
- new DrmRddInput(rowWiseSrc = Some(cp.ncol -> cp.rdd))
-// /** Broadcast vector (Mahout vectors are not closure-friendly, use this instead. */
-// private[sparkbindings] def drmBroadcast(x: Vector)(implicit sc: SparkContext): Broadcast[Vector] = sc.broadcast(x)
-//
-// /** Broadcast in-core Mahout matrix. Use this instead of closure. */
-// private[sparkbindings] def drmBroadcast(m: Matrix)(implicit sc: SparkContext): Broadcast[Matrix] = sc.broadcast(m)
/** Implicit broadcast cast for Spark physical op implementations. */
private[sparkbindings] implicit def bcast2val[K](bcast:Broadcast[K]):K = bcast.value
@@ -74,7 +75,7 @@ package object drm {
}
block
} else {
- new SparseRowMatrix(vectors.size, blockncol, vectors)
+ new SparseRowMatrix(vectors.size, blockncol, vectors, true, false)
}
Iterator(keys -> block)
@@ -101,7 +102,7 @@ package object drm {
blockKeys.ensuring(blockKeys.size == block.nrow)
blockKeys.view.zipWithIndex.map {
case (key, idx) =>
- var v = block(idx, ::) // This is just a view!
+ val v = block(idx, ::) // This is just a view!
// If a view rather than a concrete vector, clone into a concrete vector in order not to
// attempt to serialize outer matrix when we save it (Although maybe most often this
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
new file mode 100644
index 0000000..da58b35
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+
+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+import org.apache.log4j.Logger
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.flavor.TraversingStructureEnum
+import scalabindings._
+import RLikeOps._
+import collection._
+import JavaConversions._
+
+object GenericMatrixKryoSerializer {
+
+ private implicit final val log = Logger.getLogger(classOf[GenericMatrixKryoSerializer])
+
+}
+
+/** Serializes Sparse or Dense in-core generic matrix (row-wise or column-wise backed) */
+class GenericMatrixKryoSerializer extends Serializer[Matrix] {
+
+ import GenericMatrixKryoSerializer._
+
+ override def write(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+
+ debug(s"Writing mx of type ${mx.getClass.getName}")
+
+ val structure = mx.getFlavor.getStructure
+
+ // Write structure bit
+ output.writeInt(structure.ordinal(), true)
+
+ // Write geometry
+ output.writeInt(mx.nrow, true)
+ output.writeInt(mx.ncol, true)
+
+ // Write in most efficient traversal order (using backing vectors perhaps)
+ structure match {
+ case TraversingStructureEnum.COLWISE => writeRowWise(kryo, output, mx.t)
+ case TraversingStructureEnum.SPARSECOLWISE => writeSparseRowWise(kryo, output, mx.t)
+ case TraversingStructureEnum.SPARSEROWWISE => writeSparseRowWise(kryo, output, mx)
+ case TraversingStructureEnum.VECTORBACKED => writeVectorBacked(kryo, output, mx)
+ case _ => writeRowWise(kryo, output, mx)
+ }
+
+ }
+
+ private def writeVectorBacked(kryo: Kryo, output: Output, mx: Matrix) {
+
+ require(mx != null)
+
+ // At this point we just handle a few vector-backed classes individually. TODO: create an
+ // API to obtain vector-backed matrix data.
+ kryo.writeClass(output, mx.getClass)
+ mx match {
+ case mxD: DiagonalMatrix => kryo.writeObject(output, mxD.diagv)
+ case mxS: DenseSymmetricMatrix => kryo.writeObject(output, dvec(mxS.getData))
+ case mxT: UpperTriangular => kryo.writeObject(output, dvec(mxT.getData))
+ case _ => throw new IllegalArgumentException(s"Unsupported matrix type:${mx.getClass.getName}")
+ }
+ }
+
+ private def readVectorBacked(kryo: Kryo, input: Input, nrow: Int, ncol: Int) = {
+
+ // We require vector-backed matrices to have a vector-parameterized constructor.
+ val clazz = kryo.readClass(input).getType
+
+ debug(s"Deserializing vector-backed mx of type ${clazz.getName}.")
+
+ clazz.getConstructor(classOf[Vector]).newInstance(kryo.readObject(input, classOf[Vector])).asInstanceOf[Matrix]
+ }
+
+ private def writeRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+ for (row <- mx) kryo.writeObject(output, row)
+ }
+
+ private def readRows(kryo: Kryo, input: Input, nrow: Int) = {
+ Array.tabulate(nrow) { _ => kryo.readObject(input, classOf[Vector])}
+ }
+
+ private def readSparseRows(kryo: Kryo, input: Input) = {
+
+ // Number of slices
+ val nslices = input.readInt(true)
+
+ Array.tabulate(nslices) { _ =>
+ input.readInt(true) -> kryo.readObject(input, classOf[Vector])
+ }
+ }
+
+ private def writeSparseRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+
+ val nslices = mx.numSlices()
+
+ output.writeInt(nslices, true)
+
+ var actualNSlices = 0;
+ for (row <- mx.iterateNonEmpty()) {
+ output.writeInt(row.index(), true)
+ kryo.writeObject(output, row.vector())
+ actualNSlices += 1
+ }
+
+ require(nslices == actualNSlices, "Number of slices reported by Matrix.numSlices() was different from actual " +
+ "slice iterator size.")
+ }
+
+ override def read(kryo: Kryo, input: Input, mxClass: Class[Matrix]): Matrix = {
+
+ // Read structure hint
+ val structure = TraversingStructureEnum.values()(input.readInt(true))
+
+ // Read geometry
+ val nrow = input.readInt(true)
+ val ncol = input.readInt(true)
+
+ debug(s"read matrix geometry: $nrow x $ncol.")
+
+ structure match {
+
+ // Sparse or dense column wise
+ case TraversingStructureEnum.COLWISE =>
+ val cols = readRows(kryo, input, ncol)
+
+ if (!cols.isEmpty && cols.head.isDense)
+ dense(cols).t
+ else {
+ debug("Deserializing as SparseRowMatrix.t (COLWISE).")
+ new SparseRowMatrix(ncol, nrow, cols, true, false).t
+ }
+
+ // transposed SparseMatrix case
+ case TraversingStructureEnum.SPARSECOLWISE =>
+ val cols = readSparseRows(kryo, input)
+ val javamap = new java.util.HashMap[Integer, Vector]((cols.size << 1) + 1)
+ cols.foreach { case (idx, vec) => javamap.put(idx, vec)}
+
+ debug("Deserializing as SparseMatrix.t (SPARSECOLWISE).")
+ new SparseMatrix(ncol, nrow, javamap, true).t
+
+ // Sparse Row-wise -- this will be created as a SparseMatrix.
+ case TraversingStructureEnum.SPARSEROWWISE =>
+ val rows = readSparseRows(kryo, input)
+ val javamap = new java.util.HashMap[Integer, Vector]((rows.size << 1) + 1)
+ rows.foreach { case (idx, vec) => javamap.put(idx, vec)}
+
+ debug("Deserializing as SparseMatrix (SPARSEROWWISE).")
+ new SparseMatrix(nrow, ncol, javamap, true)
+ case TraversingStructureEnum.VECTORBACKED =>
+
+ debug("Deserializing vector-backed...")
+ readVectorBacked(kryo, input, nrow, ncol)
+
+ // By default, read row-wise.
+ case _ =>
+ val cols = readRows(kryo, input, nrow)
+ // this still copies a lot of stuff...
+ if (!cols.isEmpty && cols.head.isDense) {
+
+ debug("Deserializing as DenseMatrix.")
+ dense(cols)
+ } else {
+
+ debug("Deserializing as SparseRowMatrix(default).")
+ new SparseRowMatrix(nrow, ncol, cols, true, false)
+ }
+ }
+
+ }
+}
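The serializer writes a compact header (structure ordinal, nrow, ncol) and then picks the cheapest traversal for the body, so read() can reconstruct a matching concrete matrix type. A minimal round-trip sketch against a bare Kryo instance, using MahoutKryoRegistrator.registerClasses from the next file; this mirrors what the IOSuite's kryoClone helper (further below) does, minus chill's AllScalaRegistrar:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.mahout.math.{DenseMatrix, Matrix}
import org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator

// Register Mahout's vector/matrix serializers, then round-trip a matrix.
val kryo = new Kryo()
MahoutKryoRegistrator.registerClasses(kryo)

val baos = new ByteArrayOutputStream()
val out = new Output(baos)
kryo.writeClassAndObject(out, new DenseMatrix(2, 3))
out.close()

val in = new Input(new ByteArrayInputStream(baos.toByteArray))
val mx = kryo.readClassAndObject(in).asInstanceOf[Matrix]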
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
index a8a0bb4..5806ff5 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
@@ -18,22 +18,28 @@
package org.apache.mahout.sparkbindings.io
import com.esotericsoftware.kryo.Kryo
-import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.mahout.math._
-import org.apache.mahout.math.indexeddataset.{BiMap, BiDictionary}
import org.apache.spark.serializer.KryoRegistrator
-import org.apache.mahout.sparkbindings._
-import org.apache.mahout.math.Vector.Element
+import org.apache.mahout.logging._
-import scala.collection.immutable.List
+object MahoutKryoRegistrator {
-/** Kryo serialization registrator for Mahout */
-class MahoutKryoRegistrator extends KryoRegistrator {
+ private final implicit val log = getLog(this.getClass)
+
+ def registerClasses(kryo: Kryo) = {
- override def registerClasses(kryo: Kryo) = {
+ trace("Registering mahout classes.")
+
+ kryo.register(classOf[SparseColumnMatrix], new UnsupportedSerializer)
+ kryo.addDefaultSerializer(classOf[Vector], new VectorKryoSerializer())
+ kryo.addDefaultSerializer(classOf[Matrix], new GenericMatrixKryoSerializer)
- kryo.addDefaultSerializer(classOf[Vector], new WritableKryoSerializer[Vector, VectorWritable])
- kryo.addDefaultSerializer(classOf[DenseVector], new WritableKryoSerializer[Vector, VectorWritable])
- kryo.addDefaultSerializer(classOf[Matrix], new WritableKryoSerializer[Matrix, MatrixWritable])
}
+
+}
+
+/** Kryo serialization registrator for Mahout */
+class MahoutKryoRegistrator extends KryoRegistrator {
+
+ override def registerClasses(kryo: Kryo) = MahoutKryoRegistrator.registerClasses(kryo)
}
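Extracting registerClasses into the companion object makes the same registration logic reachable both from Spark, through the spark.kryo.registrator setting, and directly against a standalone Kryo instance (as the IOSuite below does). For reference, the Spark wiring as applied by mahoutSparkContext in package.scala:

import org.apache.spark.SparkConf

// Same settings mahoutSparkContext sets (see the package.scala hunk below).
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.registrator", "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")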
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
new file mode 100644
index 0000000..66b79f4
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+
+class UnsupportedSerializer extends Serializer[Any] {
+
+ override def write(kryo: Kryo, output: Output, obj: Any): Unit = {
+ throw new IllegalArgumentException(s"I/O of this type (${obj.getClass.getName}) is explicitly unsupported for a " +
+ "good reason.")
+ }
+
+ override def read(kryo: Kryo, input: Input, `type`: Class[Any]): Any = ???
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
new file mode 100644
index 0000000..175778f
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import org.apache.log4j.Logger
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings._
+import RLikeOps._
+
+import com.esotericsoftware.kryo.io.{OutputChunked, Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+
+import collection._
+import JavaConversions._
+
+
+object VectorKryoSerializer {
+
+ final val FLAG_DENSE: Int = 0x01
+ final val FLAG_SEQUENTIAL: Int = 0x02
+ final val FLAG_NAMED: Int = 0x04
+ final val FLAG_LAX_PRECISION: Int = 0x08
+
+ private final implicit val log = getLog(classOf[VectorKryoSerializer])
+
+}
+
+class VectorKryoSerializer(val laxPrecision: Boolean = false) extends Serializer[Vector] {
+
+ import VectorKryoSerializer._
+
+ override def write(kryo: Kryo, output: Output, vector: Vector): Unit = {
+
+ require(vector != null)
+
+ trace(s"Serializing vector of ${vector.getClass.getName} class.")
+
+ // Write length
+ val len = vector.length
+ output.writeInt(len, true)
+
+ // Interrogate vec properties
+ val dense = vector.isDense
+ val sequential = vector.isSequentialAccess
+ val named = vector.isInstanceOf[NamedVector]
+
+ var flag = 0
+
+ if (dense) {
+ flag |= FLAG_DENSE
+ } else if (sequential) {
+ flag |= FLAG_SEQUENTIAL
+ }
+
+ if (named) {
+ flag |= FLAG_NAMED
+ }
+
+ if (laxPrecision) flag |= FLAG_LAX_PRECISION
+
+ // Write flags
+ output.writeByte(flag)
+
+ // Write name if needed
+ if (named) output.writeString(vector.asInstanceOf[NamedVector].getName)
+
+ dense match {
+
+ // Dense vector.
+ case true =>
+
+ laxPrecision match {
+ case true =>
+ for (i <- 0 until vector.length) output.writeFloat(vector(i).toFloat)
+ case _ =>
+ for (i <- 0 until vector.length) output.writeDouble(vector(i))
+ }
+ case _ =>
+
+ // It turns out getNumNonZeroElements must check every element to see whether it is indeed
+ // non-zero, while iterateNonZeros() doesn't, so the two are currently inconsistent. We
+ // therefore filter out zeroes ourselves and auto-terminate the stream.
+ val iter = vector.nonZeroes.toIterator.filter(_.get() != 0.0)
+
+ sequential match {
+
+ // Delta encoding
+ case true =>
+
+ var idx = 0
+ laxPrecision match {
+ case true =>
+ while (iter.hasNext) {
+ val el = iter.next()
+ output.writeFloat(el.toFloat)
+ output.writeInt(el.index() - idx, true)
+ idx = el.index
+ }
+ // Terminate delta encoding.
+ output.writeFloat(0.0.toFloat)
+ case _ =>
+ while (iter.hasNext) {
+ val el = iter.next()
+ output.writeDouble(el.get())
+ output.writeInt(el.index() - idx, true)
+ idx = el.index
+ }
+ // Terminate delta encoding.
+ output.writeDouble(0.0)
+ }
+
+ // Random access.
+ case _ =>
+
+ laxPrecision match {
+
+ case true =>
+ iter.foreach { el =>
+ output.writeFloat(el.get().toFloat)
+ output.writeInt(el.index(), true)
+ }
+ // Terminate random access with 0.0 value.
+ output.writeFloat(0.0.toFloat)
+ case _ =>
+ iter.foreach { el =>
+ output.writeDouble(el.get())
+ output.writeInt(el.index(), true)
+ }
+ // Terminate random access with 0.0 value.
+ output.writeDouble(0.0)
+ }
+
+ }
+
+ }
+ }
+
+ override def read(kryo: Kryo, input: Input, vecClass: Class[Vector]): Vector = {
+
+ val len = input.readInt(true)
+ val flags = input.readByte().toInt
+ val name = if ((flags & FLAG_NAMED) != 0) Some(input.readString()) else None
+
+ val vec: Vector = flags match {
+
+ // Dense
+ case _: Int if ((flags & FLAG_DENSE) != 0) =>
+
+ trace(s"Deserializing dense vector.")
+
+ if ((flags & FLAG_LAX_PRECISION) != 0) {
+ new DenseVector(len) := { _ => input.readFloat()}
+ } else {
+ new DenseVector(len) := { _ => input.readDouble()}
+ }
+
+ // Sparse case.
+ case _ =>
+
+ flags match {
+
+ // Sequential.
+ case _: Int if ((flags & FLAG_SEQUENTIAL) != 0) =>
+
+ trace("Deserializing as sequential sparse vector.")
+
+ val v = new SequentialAccessSparseVector(len)
+ var idx = 0
+ var stop = false
+
+ if ((flags & FLAG_LAX_PRECISION) != 0) {
+
+ while (!stop) {
+ val value = input.readFloat()
+ if (value == 0.0) {
+ stop = true
+ } else {
+ idx += input.readInt(true)
+ v(idx) = value
+ }
+ }
+ } else {
+ while (!stop) {
+ val value = input.readDouble()
+ if (value == 0.0) {
+ stop = true
+ } else {
+ idx += input.readInt(true)
+ v(idx) = value
+ }
+ }
+ }
+ v
+
+ // Random access
+ case _ =>
+
+ trace("Deserializing as random access vector.")
+
+ // Read pairs until we see a 0.0 value. Obviously prone to corruption attacks.
+ val v = new RandomAccessSparseVector(len)
+ var stop = false
+ if ((flags & FLAG_LAX_PRECISION) != 0) {
+ while (!stop) {
+ val value = input.readFloat()
+ if (value == 0.0) {
+ stop = true
+ } else {
+ val idx = input.readInt(true)
+ v(idx) = value
+ }
+ }
+ } else {
+ while (!stop) {
+ val value = input.readDouble()
+ if (value == 0.0) {
+ stop = true
+ } else {
+ val idx = input.readInt(true)
+ v(idx) = value
+ }
+ }
+ }
+ v
+ }
+ }
+
+ name.map{name =>
+
+ trace(s"Recovering named vector's name ${name}.")
+
+ new NamedVector(vec, name)
+ }
+ .getOrElse(vec)
+ }
+}
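In the sequential-sparse branch above, indices are delta-encoded: each element goes out as (value, index - previousIndex), and a 0.0 value terminates the stream, which is why true zero values are filtered out before writing. A stand-alone illustration of the same scheme over plain collections (encode/decode are hypothetical helpers, not part of the patch):

// Each element travels as (value, index delta); a 0.0 value is the terminator.
def encode(pairs: Seq[(Int, Double)]): Seq[(Double, Int)] = {
  var prev = 0
  val body = pairs.filter(_._2 != 0.0).map { case (i, v) =>
    val d = i - prev
    prev = i
    (v, d)
  }
  body :+ ((0.0, 0)) // terminator
}

def decode(stream: Seq[(Double, Int)]): Seq[(Int, Double)] = {
  var idx = 0
  stream.takeWhile(_._1 != 0.0).map { case (v, d) =>
    idx += d
    (idx, v)
  }
}

// Indices (2, 5, 9) travel as deltas (2, 3, 4):
// decode(encode(Seq(2 -> 1.0, 5 -> 2.0, 9 -> 3.0))) == Seq(2 -> 1.0, 5 -> 2.0, 9 -> 3.0)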
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
index 02f6b8c..330ae38 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
@@ -17,27 +17,27 @@
package org.apache.mahout
-import org.apache.mahout.drivers.TextDelimitedIndexedDatasetReader
-import org.apache.mahout.math.indexeddataset.Schema
-import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
-import org.apache.spark.{SparkConf, SparkContext}
import java.io._
-import scala.collection.mutable.ArrayBuffer
-import org.apache.mahout.common.IOUtils
-import org.apache.log4j.Logger
+
+import org.apache.mahout.logging._
import org.apache.mahout.math.drm._
-import scala.reflect.ClassTag
-import org.apache.mahout.sparkbindings.drm.{DrmRddInput, SparkBCast, CheckpointedDrmSparkOps, CheckpointedDrmSpark}
-import org.apache.spark.rdd.RDD
+import org.apache.mahout.math.{MatrixWritable, VectorWritable, Matrix, Vector}
+import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, CheckpointedDrmSparkOps, SparkBCast}
+import org.apache.mahout.util.IOUtilsScala
import org.apache.spark.broadcast.Broadcast
-import org.apache.mahout.math.{VectorWritable, Vector, MatrixWritable, Matrix}
-import org.apache.hadoop.io.Writable
-import org.apache.spark.storage.StorageLevel
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{SparkConf, SparkContext}
+
+import collection._
+import collection.generic.Growable
+import scala.reflect.ClassTag
+
+
/** Public api for Spark-specific operators */
package object sparkbindings {
- private[sparkbindings] val log = Logger.getLogger("org.apache.mahout.sparkbindings")
+ private final implicit val log = getLog(`package`.getClass)
/** Row-wise organized DRM rdd type */
type DrmRdd[K] = RDD[DrmTuple[K]]
@@ -55,15 +55,11 @@ package object sparkbindings {
* @param customJars
* @return
*/
- def mahoutSparkContext(
- masterUrl: String,
- appName: String,
- customJars: TraversableOnce[String] = Nil,
- sparkConf: SparkConf = new SparkConf(),
- addMahoutJars: Boolean = true
- ): SparkDistributedContext = {
+ def mahoutSparkContext(masterUrl: String, appName: String, customJars: TraversableOnce[String] = Nil,
+ sparkConf: SparkConf = new SparkConf(), addMahoutJars: Boolean = true):
+ SparkDistributedContext = {
- val closeables = new java.util.ArrayDeque[Closeable]()
+ val closeables = mutable.ListBuffer.empty[Closeable]
try {
@@ -84,9 +80,9 @@ package object sparkbindings {
sparkConf.setJars(customJars.toSeq)
}
- sparkConf.setAppName(appName).setMaster(masterUrl)
- .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
- .set("spark.kryo.registrator", "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")
+ sparkConf.setAppName(appName).setMaster(masterUrl).set("spark.serializer",
+ "org.apache.spark.serializer.KryoSerializer").set("spark.kryo.registrator",
+ "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")
if (System.getenv("SPARK_HOME") != null) {
sparkConf.setSparkHome(System.getenv("SPARK_HOME"))
@@ -95,7 +91,7 @@ package object sparkbindings {
new SparkDistributedContext(new SparkContext(config = sparkConf))
} finally {
- IOUtils.close(closeables)
+ IOUtilsScala.close(closeables)
}
}
@@ -103,19 +99,19 @@ package object sparkbindings {
implicit def sc2sdc(sc: SparkContext): SparkDistributedContext = new SparkDistributedContext(sc)
- implicit def dc2sc(dc:DistributedContext):SparkContext = {
- assert (dc.isInstanceOf[SparkDistributedContext],"distributed context must be Spark-specific.")
+ implicit def dc2sc(dc: DistributedContext): SparkContext = {
+ assert(dc.isInstanceOf[SparkDistributedContext], "distributed context must be Spark-specific.")
sdc2sc(dc.asInstanceOf[SparkDistributedContext])
}
/** Broadcast transforms */
- implicit def sb2bc[T](b:Broadcast[T]):BCast[T] = new SparkBCast(b)
+ implicit def sb2bc[T](b: Broadcast[T]): BCast[T] = new SparkBCast(b)
/** Adding Spark-specific ops */
implicit def cpDrm2cpDrmSparkOps[K: ClassTag](drm: CheckpointedDrm[K]): CheckpointedDrmSparkOps[K] =
new CheckpointedDrmSparkOps[K](drm)
- implicit def drm2cpDrmSparkOps[K:ClassTag](drm:DrmLike[K]):CheckpointedDrmSparkOps[K] = drm:CheckpointedDrm[K]
+ implicit def drm2cpDrmSparkOps[K: ClassTag](drm: DrmLike[K]): CheckpointedDrmSparkOps[K] = drm: CheckpointedDrm[K]
private[sparkbindings] implicit def m2w(m: Matrix): MatrixWritable = new MatrixWritable(m)
@@ -123,7 +119,7 @@ package object sparkbindings {
private[sparkbindings] implicit def v2w(v: Vector): VectorWritable = new VectorWritable(v)
- private[sparkbindings] implicit def w2v(w:VectorWritable):Vector = w.get()
+ private[sparkbindings] implicit def w2v(w: VectorWritable): Vector = w.get()
/**
* ==Wrap existing RDD into a matrix==
@@ -141,34 +137,31 @@ package object sparkbindings {
* @tparam K row key type
* @return wrapped DRM
*/
- def drmWrap[K: ClassTag](
- rdd: DrmRdd[K],
- nrow: Int = -1,
- ncol: Int = -1,
- cacheHint: CacheHint.CacheHint = CacheHint.NONE,
- canHaveMissingRows: Boolean = false
- ): CheckpointedDrm[K] =
-
- new CheckpointedDrmSpark[K](
- rdd = rdd,
- _nrow = nrow,
- _ncol = ncol,
- _cacheStorageLevel = SparkEngine.cacheHint2Spark(cacheHint),
- _canHaveMissingRows = canHaveMissingRows
- )
+ def drmWrap[K: ClassTag](rdd: DrmRdd[K], nrow: Long = -1, ncol: Int = -1, cacheHint: CacheHint.CacheHint =
+ CacheHint.NONE, canHaveMissingRows: Boolean = false): CheckpointedDrm[K] =
+
+ new CheckpointedDrmSpark[K](rddInput = rdd, _nrow = nrow, _ncol = ncol, _cacheStorageLevel = SparkEngine
+ .cacheHint2Spark(cacheHint), _canHaveMissingRows = canHaveMissingRows)
+
+
+ /** Another drmWrap version that takes in vertical block-partitioned input to form the matrix. */
+ def drmWrapBlockified[K: ClassTag](blockifiedDrmRdd: BlockifiedDrmRdd[K], nrow: Long = -1, ncol: Int = -1,
+ cacheHint: CacheHint.CacheHint = CacheHint.NONE,
+ canHaveMissingRows: Boolean = false): CheckpointedDrm[K] =
+
+ drmWrap(drm.deblockify(blockifiedDrmRdd), nrow, ncol, cacheHint, canHaveMissingRows)
private[sparkbindings] def getMahoutHome() = {
var mhome = System.getenv("MAHOUT_HOME")
if (mhome == null) mhome = System.getProperty("mahout.home")
- require(mhome != null, "MAHOUT_HOME is required to spawn mahout-based spark jobs" )
+ require(mhome != null, "MAHOUT_HOME is required to spawn mahout-based spark jobs")
mhome
}
/** Acquire proper Mahout jars to be added to task context based on current MAHOUT_HOME. */
- private[sparkbindings] def findMahoutContextJars(closeables:java.util.Deque[Closeable]) = {
+ private[sparkbindings] def findMahoutContextJars(closeables: Growable[Closeable]) = {
// Figure Mahout classpath using $MAHOUT_HOME/mahout classpath command.
-
val fmhome = new File(getMahoutHome())
val bin = new File(fmhome, "bin")
val exec = new File(bin, "mahout")
@@ -177,26 +170,25 @@ package object sparkbindings {
val p = Runtime.getRuntime.exec(Array(exec.getAbsolutePath, "-spark", "classpath"))
- closeables.addFirst(new Closeable {
+ closeables += new Closeable {
def close() {
p.destroy()
}
- })
+ }
val r = new BufferedReader(new InputStreamReader(p.getInputStream))
- closeables.addFirst(r)
+ closeables += r
val w = new StringWriter()
- closeables.addFirst(w)
+ closeables += w
var continue = true;
- val jars = new ArrayBuffer[String]()
+ val jars = new mutable.ArrayBuffer[String]()
do {
val cp = r.readLine()
if (cp == null)
- throw new IllegalArgumentException(
- "Unable to read output from \"mahout -spark classpath\". Is SPARK_HOME defined?"
- )
+ throw new IllegalArgumentException("Unable to read output from \"mahout -spark classpath\". Is SPARK_HOME " +
+ "defined?")
val j = cp.split(File.pathSeparatorChar)
if (j.size > 10) {
@@ -206,8 +198,7 @@ package object sparkbindings {
}
} while (continue)
-// jars.foreach(j => log.info(j))
-
+ // jars.foreach(j => log.info(j))
// context specific jars
val mcjars = jars.filter(j =>
j.matches(".*mahout-math-\\d.*\\.jar") ||
@@ -233,4 +224,13 @@ package object sparkbindings {
mcjars
}
+ private[sparkbindings] def validateBlockifiedDrmRdd[K](rdd: BlockifiedDrmRdd[K]): Boolean = {
+ // Here, each partition must contain exactly one block.
+ val part1Req = rdd.mapPartitions(piter => Iterator(piter.size == 1)).reduce(_ && _)
+
+ if (!part1Req) warn("blockified rdd: condition not met: exactly 1 block per partition")
+
+ part1Req
+ }
+
}
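drmWrap remains the public entry point for wrapping a pre-existing row-wise RDD into a checkpointed DRM; the signature change here is that nrow is now a Long. A hedged usage sketch (assumes a SparkContext `sc` already configured with the Kryo registrator above):

import org.apache.mahout.math.{DenseVector, Vector}

// Two rows keyed 0 and 1; DrmRdd[Int] is RDD[(Int, Vector)].
val rows: Seq[(Int, Vector)] = Seq(
  0 -> new DenseVector(Array(1.0, 2.0)),
  1 -> new DenseVector(Array(3.0, 4.0)))

val rdd: DrmRdd[Int] = sc.parallelize(rows)
val drmA = drmWrap(rdd, nrow = 2L, ncol = 2)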
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
index fbc31f3..529d13c 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
@@ -1,10 +1,12 @@
package org.apache.mahout.sparkbindings
-import org.scalatest.FunSuite
+import java.io.{Closeable, File}
import java.util
-import java.io.{File, Closeable}
-import org.apache.mahout.common.IOUtils
+
import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.apache.mahout.util.IOUtilsScala
+import org.scalatest.FunSuite
+import collection._
/**
* @author dmitriy
@@ -16,7 +18,7 @@ class SparkBindingsSuite extends FunSuite with DistributedSparkSuite {
// let it to be ignored.
ignore("context jars") {
System.setProperty("mahout.home", new File("..").getAbsolutePath/*"/home/dmitriy/projects/github/mahout-commits"*/)
- val closeables = new util.ArrayDeque[Closeable]()
+ val closeables = new mutable.ListBuffer[Closeable]()
try {
val mahoutJars = findMahoutContextJars(closeables)
mahoutJars.foreach {
@@ -26,7 +28,7 @@ class SparkBindingsSuite extends FunSuite with DistributedSparkSuite {
mahoutJars.size should be > 0
mahoutJars.size shouldBe 4
} finally {
- IOUtils.close(closeables)
+ IOUtilsScala.close(closeables)
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
index 1521cb8..8c8ac3f 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
@@ -26,7 +26,7 @@ import scalabindings._
import RLikeOps._
import drm._
import org.apache.mahout.sparkbindings._
-import org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark
+import org.apache.mahout.sparkbindings.drm._
import org.apache.mahout.math.drm.logical.{OpAt, OpAtA, OpAewB, OpABt}
import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
@@ -142,7 +142,7 @@ class BlasSuite extends FunSuite with DistributedSparkSuite {
val drmA = drmParallelize(m = inCoreA, numPartitions = 2)
val op = new OpAt(drmA)
- val drmAt = new CheckpointedDrmSpark(rdd = At.at(op, srcA = drmA), _nrow = op.nrow, _ncol = op.ncol)
+ val drmAt = new CheckpointedDrmSpark(rddInput = At.at(op, srcA = drmA), _nrow = op.nrow, _ncol = op.ncol)
val inCoreAt = drmAt.collect
val inCoreControlAt = inCoreA.t
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
index 42026ae..7241660 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
@@ -23,13 +23,14 @@ import drm._
import RLikeOps._
import RLikeDrmOps._
import org.apache.mahout.sparkbindings._
-import org.scalatest.FunSuite
+import org.scalatest.{ConfigMap, BeforeAndAfterAllConfigMap, FunSuite}
import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import scala.reflect.ClassTag
+
/** Tests for DrmLikeOps */
class DrmLikeOpsSuite extends FunSuite with DistributedSparkSuite with DrmLikeOpsSuiteBase {
-
test("exact, min and auto ||") {
val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
val A = drmParallelize(m = inCoreA, numPartitions = 2)
@@ -39,18 +40,20 @@ class DrmLikeOpsSuite extends FunSuite with DistributedSparkSuite with DrmLikeOp
(A + 1.0).par(exact = 4).rdd.partitions.size should equal(4)
A.par(exact = 2).rdd.partitions.size should equal(2)
A.par(exact = 1).rdd.partitions.size should equal(1)
- A.par(exact = 0).rdd.partitions.size should equal(2) // No effect for par <= 0
+
A.par(min = 4).rdd.partitions.size should equal(4)
A.par(min = 2).rdd.partitions.size should equal(2)
A.par(min = 1).rdd.partitions.size should equal(2)
A.par(auto = true).rdd.partitions.size should equal(10)
A.par(exact = 10).par(auto = true).rdd.partitions.size should equal(10)
A.par(exact = 11).par(auto = true).rdd.partitions.size should equal(19)
- A.par(exact = 20).par(auto = true).rdd.partitions.size should equal(20)
+ A.par(exact = 20).par(auto = true).rdd.partitions.size should equal(19)
+
+ A.keyClassTag shouldBe ClassTag.Int
+ A.par(auto = true).keyClassTag shouldBe ClassTag.Int
- intercept[AssertionError] {
- A.par()
- }
+ an[IllegalArgumentException] shouldBe thrownBy {A.par(exact = 0)}
+ an[IllegalArgumentException] shouldBe thrownBy {A.par()}
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
index 2a4f213..f422f86 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
@@ -25,10 +25,16 @@ import drm._
import org.apache.mahout.sparkbindings._
import RLikeDrmOps._
import test.DistributedSparkSuite
+import org.apache.mahout.math.drm.logical.{OpAtB, OpAewUnaryFuncFusion}
+import org.apache.mahout.logging._
+
+import scala.util.Random
/** ==R-like DRM DSL operation tests -- Spark== */
class RLikeDrmOpsSuite extends FunSuite with DistributedSparkSuite with RLikeDrmOpsSuiteBase {
+ private final implicit val log = getLog(classOf[RLikeDrmOpsSuite])
+
test("C = A + B missing rows") {
val sc = mahoutCtx.asInstanceOf[SparkDistributedContext].sc
@@ -113,4 +119,61 @@ class RLikeDrmOpsSuite extends FunSuite with DistributedSparkSuite with RLikeDrm
}
+ test("A'B, bigger") {
+
+ val rnd = new Random()
+ val a = new SparseRowMatrix(200, 1544) := { _ => rnd.nextGaussian() }
+ val b = new SparseRowMatrix(200, 300) := { _ => rnd.nextGaussian() }
+
+ var ms = System.currentTimeMillis()
+ val atb = a.t %*% b
+ ms = System.currentTimeMillis() - ms
+
+ println(s"in-core mul ms: $ms")
+
+ val drmA = drmParallelize(a, numPartitions = 2)
+ val drmB = drmParallelize(b, numPartitions = 2)
+
+ ms = System.currentTimeMillis()
+ val drmAtB = drmA.t %*% drmB
+ val mxAtB = drmAtB.collect
+ ms = System.currentTimeMillis() - ms
+
+ println(s"a'b plan:${drmAtB.context.engine.optimizerRewrite(drmAtB)}")
+ println(s"a'b plan contains ${drmAtB.rdd.partitions.size} partitions.")
+ println(s"distributed mul ms: $ms.")
+
+ (atb - mxAtB).norm should be < 1e-5
+
+ }
+
+ test("C = At %*% B , zippable") {
+
+ val mxA = dense((1, 2), (3, 4), (-3, -5))
+
+ val A = drmParallelize(mxA, numPartitions = 2)
+ .mapBlock()({
+ case (keys, block) => keys.map(_.toString) -> block
+ })
+
+ val B = (A + 1.0)
+
+ .mapBlock() { case (keys, block) ⇒
+ val nblock = new SparseRowMatrix(block.nrow, block.ncol) := block
+ keys → nblock
+ }
+
+ B.collect
+
+ val C = A.t %*% B
+
+ mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B))
+
+ val inCoreC = C.collect
+ val inCoreControlC = mxA.t %*% (mxA + 1.0)
+
+ (inCoreC - inCoreControlC).norm should be < 1E-10
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala
new file mode 100644
index 0000000..f3a9721
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+
+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.io.{Input, Output}
+import com.twitter.chill.AllScalaRegistrar
+import org.apache.mahout.math._
+import scalabindings._
+import RLikeOps._
+
+import org.apache.mahout.common.RandomUtils
+import org.apache.mahout.test.MahoutSuite
+import org.scalatest.FunSuite
+
+import scala.util.Random
+
+class IOSuite extends FunSuite with MahoutSuite {
+
+ import IOSuite._
+
+ test("Dense vector kryo") {
+
+ val rnd = RandomUtils.getRandom
+ val vec = new DenseVector(165) := { _ => rnd.nextDouble()}
+
+ val ret = kryoClone(vec, vec, vec)
+ val vec2 = ret(2)
+
+ println(s"vec=$vec\nvc2=$vec2")
+
+ vec2 === vec shouldBe true
+ vec2.isInstanceOf[DenseVector] shouldBe true
+ }
+
+ test("Random sparse vector kryo") {
+
+ val rnd = RandomUtils.getRandom
+ val vec = new RandomAccessSparseVector(165) := { _ => if (rnd.nextDouble() < 0.3) rnd.nextDouble() else 0}
+ val vec1 = new RandomAccessSparseVector(165)
+ vec1(2) = 2
+ vec1(3) = 4
+ vec1(3) = 0
+ vec1(10) = 30
+
+ val ret = kryoClone(vec, vec1, vec)
+ val (vec2, vec3) = (ret(2), ret(1))
+
+ println(s"vec=$vec\nvc2=$vec2")
+
+ vec2 === vec shouldBe true
+ vec1 === vec3 shouldBe true
+ vec2.isInstanceOf[RandomAccessSparseVector] shouldBe true
+
+ }
+
+ test("100% sparse vectors") {
+
+ val vec1 = new SequentialAccessSparseVector(10)
+ val vec2 = new RandomAccessSparseVector(6)
+ val ret = kryoClone(vec1, vec2, vec1, vec2)
+ val vec3 = ret(2)
+ val vec4 = ret(3)
+
+ vec1 === vec3 shouldBe true
+ vec2 === vec4 shouldBe true
+ }
+
+ test("Sequential sparse vector kryo") {
+
+ val rnd = RandomUtils.getRandom
+ val vec = new SequentialAccessSparseVector(165) := { _ => if (rnd.nextDouble() < 0.3) rnd.nextDouble() else 0}
+
+ val vec1 = new SequentialAccessSparseVector(165)
+ vec1(2) = 0
+ vec1(3) = 3
+ vec1(4) = 2
+ vec1(3) = 0
+
+ val ret = kryoClone(vec, vec1, vec)
+ val (vec2, vec3) = (ret(2), ret(1))
+
+ println(s"vec=$vec\nvc2=$vec2")
+
+ vec2 === vec shouldBe true
+ vec1 === vec3 shouldBe true
+ vec2.isInstanceOf[SequentialAccessSparseVector] shouldBe true
+ }
+
+ test("kryo matrix tests") {
+ val rnd = new Random()
+
+ val mxA = new DenseMatrix(140, 150) := { _ => rnd.nextDouble()}
+
+ val mxB = new SparseRowMatrix(140, 150) := { _ => if (rnd.nextDouble() < .3) rnd.nextDouble() else 0.0}
+
+ val mxC = new SparseMatrix(140, 150)
+ for (i <- 0 until mxC.nrow) if (rnd.nextDouble() < .3)
+ mxC(i, ::) := { _ => if (rnd.nextDouble() < .3) rnd.nextDouble() else 0.0}
+
+ val cnsl = mxC.numSlices()
+ println(s"Number of slices in mxC: ${cnsl}")
+
+ val ret = kryoClone(mxA, mxA.t, mxB, mxB.t, mxC, mxC.t, mxA)
+
+ val (mxAA, mxAAt, mxBB, mxBBt, mxCC, mxCCt, mxAAA) = (ret(0), ret(1), ret(2), ret(3), ret(4), ret(5), ret(6))
+
+ // ret.size shouldBe 7
+
+ mxA === mxAA shouldBe true
+ mxA === mxAAA shouldBe true
+ mxA === mxAAt.t shouldBe true
+ mxAA.isInstanceOf[DenseMatrix] shouldBe true
+ mxAAt.isInstanceOf[DenseMatrix] shouldBe false
+
+
+ mxB === mxBB shouldBe true
+ mxB === mxBBt.t shouldBe true
+ mxBB.isInstanceOf[SparseRowMatrix] shouldBe true
+ mxBBt.isInstanceOf[SparseRowMatrix] shouldBe false
+ mxBB(0,::).isDense shouldBe false
+
+
+ // Assert no persistence operation increased slice sparsity
+ mxC.numSlices() shouldBe cnsl
+
+ // Assert deserialized product did not experience any empty slice inflation
+ mxCC.numSlices() shouldBe cnsl
+ mxCCt.t.numSlices() shouldBe cnsl
+
+ // Incidentally, iterating through all rows (which happens in the equivalence operator)
+ // inserts empty rows into SparseMatrix, so these asserts must not run before the numSlices
+ // asserts.
+ mxC === mxCC shouldBe true
+ mxC === mxCCt.t shouldBe true
+ mxCCt.t.isInstanceOf[SparseMatrix] shouldBe true
+
+ // Column-wise sparse matrices are deprecated and should be explicitly rejected by the serializer.
+ an[IllegalArgumentException] should be thrownBy {
+ val mxDeprecated = new SparseColumnMatrix(14, 15)
+ kryoClone(mxDeprecated)
+ }
+
+ }
+
+ test("diag matrix") {
+
+ val mxD = diagv(dvec(1, 2, 3, 5))
+ val mxDD = kryoClone(mxD)(0)
+ mxD === mxDD shouldBe true
+ mxDD.isInstanceOf[DiagonalMatrix] shouldBe true
+
+ }
+}
+
+object IOSuite {
+
+ def kryoClone[T](obj: T*): Seq[T] = {
+
+ val kryo = new Kryo()
+ new AllScalaRegistrar()(kryo)
+
+ MahoutKryoRegistrator.registerClasses(kryo)
+
+ val baos = new ByteArrayOutputStream()
+ val output = new Output(baos)
+ obj.foreach(kryo.writeClassAndObject(output, _))
+ output.close
+
+ val input = new Input(new ByteArrayInputStream(baos.toByteArray))
+
+ def outStream: Stream[T] =
+ if (input.eof) Stream.empty
+ else kryo.readClassAndObject(input).asInstanceOf[T] #:: outStream
+
+ outStream
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
index f18ec70..d917a22 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
@@ -17,11 +17,13 @@
package org.apache.mahout.sparkbindings.test
+import org.apache.log4j.{Level, Logger}
import org.scalatest.{ConfigMap, BeforeAndAfterAllConfigMap, Suite}
import org.apache.spark.SparkConf
import org.apache.mahout.sparkbindings._
import org.apache.mahout.test.{DistributedMahoutSuite, MahoutSuite}
import org.apache.mahout.math.drm.DistributedContext
+import collection.JavaConversions._
trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfiguration {
this: Suite =>
@@ -30,16 +32,21 @@ trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfigurat
protected var masterUrl = null.asInstanceOf[String]
protected def initContext() {
- masterUrl = "local[3]"
+ masterUrl = System.getProperties.getOrElse("test.spark.master", "local[3]")
+ val isLocal = masterUrl.startsWith("local")
mahoutCtx = mahoutSparkContext(masterUrl = this.masterUrl,
- appName = "MahoutLocalContext",
+ appName = "MahoutUnitTests",
// Do not run MAHOUT_HOME jars in unit tests.
- addMahoutJars = false,
+ addMahoutJars = !isLocal,
sparkConf = new SparkConf()
- .set("spark.kryoserializer.buffer.mb", "15")
+ .set("spark.kryoserializer.buffer.mb", "40")
.set("spark.akka.frameSize", "30")
.set("spark.default.parallelism", "10")
+ .set("spark.executor.memory", "2G")
)
+ // Spark reconfigures logging. Clamp down on it in tests.
+ Logger.getRootLogger.setLevel(Level.ERROR)
+ Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
}
protected def resetContext() {
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
index e48e7c7..2a996d7 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
@@ -25,6 +25,6 @@ trait LoggerConfiguration extends org.apache.mahout.test.LoggerConfiguration {
override protected def beforeAll(configMap: ConfigMap) {
super.beforeAll(configMap)
- Logger.getLogger("org.apache.mahout.sparkbindings").setLevel(Level.INFO)
+ BasicConfigurator.resetConfiguration()
}
}
[3/4] mahout git commit: MAHOUT-1660 MAHOUT-1713 MAHOUT-1714
MAHOUT-1715 MAHOUT-1716 MAHOUT-1717 MAHOUT-1718 MAHOUT-1719 MAHOUT-1720
MAHOUT-1721 MAHOUT-1722 MAHOUT-1723 MAHOUT-1724 MAHOUT-1725 MAHOUT-1726
MAHOUT-1727 MAHOUT-1728 MAHOUT-1729 MAHOUT-1730 M
Posted by dl...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
index 97e06cf..7091c53 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
@@ -16,18 +16,28 @@
*/
package org.apache.mahout.math.scalabindings
+import org.apache.mahout.math.function.Functions
import org.apache.mahout.math.{Vector, Matrix}
import scala.collection.JavaConversions._
import RLikeOps._
class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
+ /** Structure-optimized mmul */
+ def %*%(that: Matrix) = MMul(m, that, None)
+
+ def :%*%(that:Matrix) = %*%(that)
+
+ def %*%:(that: Matrix) = that :%*% m
+
/**
- * matrix-matrix multiplication
- * @param that
- * @return
+ * The "legacy" matrix-matrix multiplication.
+ *
+ * @param that right hand operand
+ * @return matrix multiplication result
+ * @deprecated use %*%
*/
- def %*%(that: Matrix) = m.times(that)
+ def %***%(that: Matrix) = m.times(that)
/**
* matrix-vector multiplication
@@ -65,13 +75,16 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
* @param that
*/
def *=(that: Matrix) = {
- m.zip(that).foreach(t => t._1.vector *= t._2.vector)
+ m.assign(that, Functions.MULT)
m
}
+ /** A *=: B is equivalent to B *= A. Included for completeness. */
+ def *=:(that: Matrix) = m *= that
+
/** Elementwise division */
def /=(that: Matrix) = {
- m.zip(that).foreach(t => t._1.vector() /= t._2.vector)
+ m.zip(that).foreach(t ⇒ t._1.vector() /= t._2.vector)
m
}
@@ -80,15 +93,63 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
m
}
+ /** 5.0 *=: A is equivalent to A *= 5.0. Included for completeness. */
+ def *=:(that: Double) = m *= that
+
def /=(that: Double) = {
- m.foreach(_.vector() /= that)
+ m ::= { x ⇒ x / that }
m
}
/** 1.0 /=: A is equivalent to A = 1.0/A in R */
def /=:(that: Double) = {
- m.foreach(that /=: _.vector())
+ if (that != 0.0) m := { x ⇒ that / x }
m
}
+
+ def ^=(that: Double) = {
+ m ::= { x ⇒ math.pow(x, that) }
+ m
+ }
+
+ def ^(that: Double) = m.cloned ^= that
+
+ def cbind(that: Matrix): Matrix = {
+ require(m.nrow == that.nrow)
+ if (m.ncol > 0) {
+ if (that.ncol > 0) {
+ val mx = m.like(m.nrow, m.ncol + that.ncol)
+ mx(::, 0 until m.ncol) := m
+ mx(::, m.ncol until mx.ncol) := that
+ mx
+ } else m
+ } else that
+ }
+
+ def cbind(that: Double): Matrix = {
+ val mx = m.like(m.nrow, m.ncol + 1)
+ mx(::, 0 until m.ncol) := m
+ if (that != 0.0) mx(::, m.ncol) := that
+ mx
+ }
+
+ def rbind(that: Matrix): Matrix = {
+ require(m.ncol == that.ncol)
+ if (m.nrow > 0) {
+ if (that.nrow > 0) {
+ val mx = m.like(m.nrow + that.nrow, m.ncol)
+ mx(0 until m.nrow, ::) := m
+ mx(m.nrow until mx.nrow, ::) := that
+ mx
+ } else m
+ } else that
+ }
+
+ def rbind(that: Double): Matrix = {
+ val mx = m.like(m.nrow + 1, m.ncol)
+ mx(0 until m.nrow, ::) := m
+ if (that != 0.0) mx(m.nrow, ::) := that
+ mx
+ }
}
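A rough sketch of how the new in-core operators compose (assumes the usual scalabindings imports; the matrices are illustrative):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  val mxA = dense((1, 2), (3, 4))     // 2 x 2
  val mxB = mxA cbind 1.0             // 2 x 3, extra column of ones
  val mxC = mxA rbind dense((5, 6))   // 3 x 2, extra row
  val mxP = mxA %*% mxA.t             // structure-optimized product via MMul
  val mxQ = mxA ^ 2.0                 // elementwise square, computed on a clone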
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
index ba32304..e10a01b 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
@@ -24,13 +24,13 @@ import org.apache.mahout.math.{Vector, MatrixTimesOps, Matrix}
*/
object RLikeOps {
- implicit def double2Scalar(x:Double) = new DoubleScalarOps(x)
+ implicit def double2Scalar(x: Double) = new RLikeDoubleScalarOps(x)
implicit def v2vOps(v: Vector) = new RLikeVectorOps(v)
implicit def el2elOps(el: Vector.Element) = new ElementOps(el)
- implicit def times2timesOps(m: MatrixTimesOps) = new RLikeTimesOps(m)
+ implicit def el2Double(el: Vector.Element) = el.get()
implicit def m2mOps(m: Matrix) = new RLikeMatrixOps(m)
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala
deleted file mode 100644
index 51f0f63..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Matrix, MatrixTimesOps}
-
-class RLikeTimesOps(m: MatrixTimesOps) {
-
- def :%*%(that: Matrix) = m.timesRight(that)
-
- def %*%:(that: Matrix) = m.timesLeft(that)
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
index d2198bd..38a55d6 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
@@ -17,7 +17,7 @@
package org.apache.mahout.math.scalabindings
-import org.apache.mahout.math.Vector
+import org.apache.mahout.math.{Matrix, Vector}
import org.apache.mahout.math.function.Functions
import RLikeOps._
@@ -67,5 +67,32 @@ class RLikeVectorOps(_v: Vector) extends VectorOps(_v) {
/** Elementwise right-associative / */
def /:(that: Vector) = that.cloned /= v
+ def ^=(that: Double) = v.assign(Functions.POW, that)
+
+ def ^=(that: Vector) = v.assign(that, Functions.POW)
+
+ def ^(that: Double) = v.cloned ^= that
+
+ def ^(that: Vector) = v.cloned ^= that
+
+ def c(that: Vector) = {
+ if (v.length > 0) {
+ if (that.length > 0) {
+ val cv = v.like(v.length + that.length)
+ cv(0 until v.length) := v
+ cv(v.length until cv.length) := that
+ cv
+ } else v
+ } else that
+ }
+
+ def c(that: Double) = {
+ val cv = v.like(v.length + 1)
+ cv(0 until v.length) := v
+ cv(v.length) = that
+ cv
+ }
+
+ def mean = sum / length
}
\ No newline at end of file
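The new vector operations mirror their R counterparts; a minimal sketch (vector values are illustrative):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  val v = dvec(1, 2, 3)
  val w = v c dvec(4, 5)    // R-style concatenation: (1, 2, 3, 4, 5)
  val u = v ^ 2.0           // elementwise power, computed on a clone: (1, 4, 9)
  val mu = v.mean           // sum / length = 2.0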
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
index c20354d..ef9c494 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
@@ -38,8 +38,13 @@ class VectorOps(private[scalabindings] val v: Vector) {
def update(r: Range, that: Vector) = apply(r) := that
+ /** R-like synonyms for Java methods on vectors */
def sum = v.zSum()
+ def min = v.minValue()
+
+ def max = v.maxValue()
+
def :=(that: Vector): Vector = {
// assign op in Mahout requires same
@@ -58,11 +63,30 @@ class VectorOps(private[scalabindings] val v: Vector) {
def :=(that: Double): Vector = v.assign(that)
+ /** Functional assignment for a function with index and x */
def :=(f: (Int, Double) => Double): Vector = {
for (i <- 0 until length) v(i) = f(i, v(i))
v
}
+ /** Functional assignment for a function with just x (e.g. v := math.exp _) */
+ def :=(f:(Double)=>Double):Vector = {
+ for (i <- 0 until length) v(i) = f(v(i))
+ v
+ }
+
+ /** Sparse iteration functional assignment using a function receiving index and x */
+ def ::=(f: (Int, Double) => Double): Vector = {
+ for (el <- v.nonZeroes) el := f(el.index, el.get)
+ v
+ }
+
+ /** Sparse iteration functional assignment using a function receiving just x */
+ def ::=(f: (Double) => Double): Vector = {
+ for (el <- v.nonZeroes) el := f(el.get)
+ v
+ }
+
def equiv(that: Vector) =
length == that.length &&
v.all.view.zip(that.all).forall(t => t._1.get == t._2.get)
@@ -121,21 +145,26 @@ class VectorOps(private[scalabindings] val v: Vector) {
}
class ElementOps(private[scalabindings] val el: Vector.Element) {
+ import RLikeOps._
+
+ def update(v: Double): Double = { el.set(v); v }
+
+ def :=(that: Double) = update(that)
- def apply = el.get()
+ def *(that: Vector.Element): Double = el.get * that.get
- def update(v: Double) = el.set(v)
+ def *(that: Vector): Vector = el.get * that
- def :=(v: Double) = el.set(v)
+ def +(that: Vector.Element): Double = el.get + that.get
- def +(that: Double) = el.get() + that
+ def +(that: Vector): Vector = el.get + that
- def -(that: Double) = el.get() - that
+ def /(that: Vector.Element): Double = el.get / that.get
- def :-(that: Double) = that - el.get()
+ def /(that: Vector): Vector = el.get / that
- def /(that: Double) = el.get() / that
+ def -(that: Vector.Element): Double = el.get - that.get
- def :/(that: Double) = that / el.get()
+ def -(that: Vector): Vector = el.get - that
}
\ No newline at end of file
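In practice the distinction between := (full iteration) and the new ::= (non-zero iteration) looks like this (a sketch; the svec literal is illustrative):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  val v = svec((1, -0.5) :: (3, 0.5) :: Nil)
  v := (math.abs(_: Double))             // visits every element, zeros included
  v ::= { x => x * x }                   // visits non-zero elements only
  v(1 until v.length) ::= { _ => 0.0 }   // ranges/views work as well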
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
index 36f5103..20dc9cd 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
@@ -18,12 +18,15 @@
package org.apache.mahout.math
import org.apache.mahout.math.solver.EigenDecomposition
+import collection._
+import JavaConversions._
/**
* Mahout matrices and vectors' scala syntactic sugar
*/
package object scalabindings {
+
// Reserved "ALL" range
final val `::`: Range = null
@@ -125,7 +128,6 @@ package object scalabindings {
val data = for (r <- rows) yield {
r match {
case n: Number => Array(n.doubleValue())
- case t: Product => t.productIterator.map(_.asInstanceOf[Number].doubleValue()).toArray
case t: Vector => Array.tabulate(t.length)(t(_))
case t: Array[Double] => t
case t: Iterable[_] =>
@@ -138,6 +140,7 @@ package object scalabindings {
}
return m
}
+ case t: Product => t.productIterator.map(_.asInstanceOf[Number].doubleValue()).toArray
case t: Array[Array[Double]] => if (rows.size == 1)
return new DenseMatrix(t)
else
@@ -164,7 +167,7 @@ package object scalabindings {
* (0,5)::(9,3)::Nil,
* (2,3.5)::(7,8)::Nil
* )
- *
+ *
* }}}
*
* @param rows
@@ -172,11 +175,18 @@ package object scalabindings {
*/
def sparse(rows: Vector*): SparseRowMatrix = {
- import MatrixOps._
+ import RLikeOps._
val nrow = rows.size
val ncol = rows.map(_.size()).max
val m = new SparseRowMatrix(nrow, ncol)
- m := rows
+ m := rows.map { row =>
+ if (row.length < ncol) {
+ val newRow = row.like(ncol)
+ newRow(0 until row.length) := row
+ newRow
+ }
+ else row
+ }
m
}
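With this change sparse() tolerates rows of differing nominal cardinality, padding the short ones (a sketch mirroring the suites' usage):

  import org.apache.mahout.math.scalabindings._

  val mxS = sparse(
    (0 -> 1.0) :: (2 -> 2.0) :: Nil,   // implies ncol = 3
    (1 -> 5.0) :: Nil                  // nominally shorter, padded to ncol
  )
  // mxS is 2 rows x 3 columns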
@@ -249,23 +259,23 @@ package object scalabindings {
(qrdec.getQ, qrdec.getR)
}
- /**
- * Solution <tt>X</tt> of <tt>A*X = B</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
+ /**
+ * Solution <tt>X</tt> of <tt>A*X = B</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
*
* @param a
* @param b
* @return (X)
*/
def solve(a: Matrix, b: Matrix): Matrix = {
- import MatrixOps._
- if (a.nrow != a.ncol) {
- throw new IllegalArgumentException("supplied matrix A is not square")
- }
- val qr = new QRDecomposition(a cloned)
- if (!qr.hasFullRank) {
- throw new IllegalArgumentException("supplied matrix A is singular")
- }
- qr.solve(b)
+ import MatrixOps._
+ if (a.nrow != a.ncol) {
+ throw new IllegalArgumentException("supplied matrix A is not square")
+ }
+ val qr = new QRDecomposition(a cloned)
+ if (!qr.hasFullRank) {
+ throw new IllegalArgumentException("supplied matrix A is singular")
+ }
+ qr.solve(b)
}
/**
@@ -293,5 +303,46 @@ package object scalabindings {
x(::, 0)
}
+ ///////////////////////////////////////////////////////////
+ // Elementwise unary functions. These create clones to avoid side effects. For
+ // efficiency reasons one may prefer in-place expression assignments instead, e.g.
+ //
+ // m := exp _
+
+ import RLikeOps._
+ import scala.math._
+
+ def mexp(m: Matrix): Matrix = m.cloned := exp _
+
+ def vexp(v: Vector): Vector = v.cloned := exp _
+
+ def mlog(m: Matrix): Matrix = m.cloned := log _
+
+ def vlog(v: Vector): Vector = v.cloned := log _
+
+ def mabs(m: Matrix): Matrix = m.cloned ::= (abs(_: Double))
+
+ def vabs(v: Vector): Vector = v.cloned ::= (abs(_: Double))
+
+ def msqrt(m: Matrix): Matrix = m.cloned ::= sqrt _
+
+ def vsqrt(v: Vector): Vector = v.cloned ::= sqrt _
+
+ def msignum(m: Matrix): Matrix = m.cloned ::= (signum(_: Double))
+
+ def vsignum(v: Vector): Vector = v.cloned ::= (signum(_: Double))
+
+ //////////////////////////////////////////////////////////
+ // operation funcs
+
+
+ /** Matrix-matrix unary func */
+ type MMUnaryFunc = (Matrix, Option[Matrix]) => Matrix
+ /** Binary matrix-matrix operations which may save result in-place, optionally */
+ type MMBinaryFunc = (Matrix, Matrix, Option[Matrix]) => Matrix
+ type MVBinaryFunc = (Matrix, Vector, Option[Matrix]) => Matrix
+ type VMBinaryFunc = (Vector, Matrix, Option[Matrix]) => Matrix
+ type MDBinaryFunc = (Matrix, Double, Option[Matrix]) => Matrix
+
}
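The m*/v* helpers above compute on clones; when the extra copy matters, the same functions can be assigned in place (a sketch):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._
  import scala.math._

  val mxA = dense((0, 2), (3, 4))
  val mxE = mexp(mxA)       // clone, exp applied to every element
  val mxS = msqrt(mxA)      // clone, sqrt via non-zero iteration (sqrt(0) == 0)
  mxA := exp _              // in-place, no clone created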
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala b/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
new file mode 100644
index 0000000..b61bea4
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.util
+
+import org.apache.mahout.logging._
+import collection._
+import java.io.Closeable
+
+object IOUtilsScala {
+
+ private final implicit val log = getLog(IOUtilsScala.getClass)
+
+ /**
+ * Try to close every resource in the sequence, in order of the sequence.
+ *
+ * Report all encountered exceptions to logging.
+ *
+ * Rethrows only the last exception encountered (if any).
+ * @param closeables
+ */
+ def close(closeables: Seq[Closeable]) = {
+
+ var lastThr: Option[Throwable] = None
+ closeables.foreach { c =>
+ try {
+ c.close()
+ } catch {
+ case t: Throwable =>
+ error(t.getMessage, t)
+ lastThr = Some(t)
+ }
+ }
+
+ // Rethrow most recent close exception (can throw only one)
+ lastThr.foreach(throw _)
+ }
+
+ /**
+ * Same as [[IOUtilsScala.close()]] but does not re-throw any exceptions.
+ * @param closeables
+ */
+ def closeQuietly(closeables: Seq[Closeable]) = {
+ try {
+ close(closeables)
+ } catch {
+ case t: Throwable => // NOP
+ }
+ }
+}
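Typical use is closing a batch of resources at the end of a job (the stream names are illustrative):

  import java.io.{Closeable, FileInputStream}
  import org.apache.mahout.util.IOUtilsScala

  val resources: Seq[Closeable] = Seq(new FileInputStream("part-0"), new FileInputStream("part-1"))
  try {
    // ... consume the streams ...
  } finally {
    // logs every failed close but rethrows only the last failure
    IOUtilsScala.close(resources)
    // or: IOUtilsScala.closeQuietly(resources) if failures must never propagate
  }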
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
index 849db68..bb42121 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
@@ -46,6 +46,26 @@ trait DrmLikeOpsSuiteBase extends DistributedMahoutSuite with Matchers {
}
+ test("allReduceBlock") {
+
+ val mxA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
+ val drmA = drmParallelize(mxA, numPartitions = 2)
+
+ try {
+ val mxB = drmA.allreduceBlock { case (keys, block) ⇒
+ block(::, 0 until 2).t %*% block(::, 2 until 3)
+ }
+
+ val mxControl = mxA(::, 0 until 2).t %*% mxA(::, 2 until 3)
+
+ (mxB - mxControl).norm should be < 1e-10
+
+ } catch {
+ case e: UnsupportedOperationException ⇒ // Some engines may not support this, so ignore.
+ }
+
+ }
+
test("col range") {
val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
val A = drmParallelize(m = inCoreA, numPartitions = 2)
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
index 6c9313c..f215fb7 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
@@ -68,9 +68,8 @@ trait DrmLikeSuiteBase extends DistributedMahoutSuite with Matchers {
inCoreEmpty.nrow shouldBe 100
inCoreEmpty.ncol shouldBe 50
+ }
- }
-
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
index 2e6204d..b46ee30 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
@@ -24,7 +24,13 @@ import scalabindings._
import RLikeOps._
import RLikeDrmOps._
import decompositions._
-import org.apache.mahout.math.drm.logical.{OpAtB, OpAtA, OpAtx}
+import org.apache.mahout.math.drm.logical._
+import org.apache.mahout.math.drm.logical.OpAtx
+import org.apache.mahout.math.drm.logical.OpAtB
+import org.apache.mahout.math.drm.logical.OpAtA
+import org.apache.mahout.math.drm.logical.OpAewUnaryFuncFusion
+
+import scala.util.Random
/** Common engine tests for distributed R-like DRM operations */
trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
@@ -188,10 +194,13 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
val A = drmParallelize(inCoreA, numPartitions = 2)
.mapBlock()({
- case (keys, block) => keys.map(_.toString) -> block
+ case (keys, block) ⇒ keys.map(_.toString) → block
})
- val B = A + 1.0
+ // Dense-A' x sparse-B used to produce an error, so we sparsify B here to cover that case as well.
+ val B = (A + 1.0).mapBlock() { case (keys, block) ⇒
+ keys → (new SparseRowMatrix(block.nrow, block.ncol) := block)
+ }
val C = A.t %*% B
@@ -204,6 +213,25 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
}
+ test ("C = A %*% B.t") {
+
+ val inCoreA = dense((1, 2), (3, 4), (-3, -5))
+
+ val A = drmParallelize(inCoreA, numPartitions = 2)
+
+ val B = A + 1.0
+
+ val C = A %*% B.t
+
+ mahoutCtx.optimizerRewrite(C) should equal(OpABt[Int](A, B))
+
+ val inCoreC = C.collect
+ val inCoreControlC = inCoreA %*% (inCoreA + 1.0).t
+
+ (inCoreC - inCoreControlC).norm should be < 1E-10
+
+ }
+
test("C = A %*% inCoreB") {
val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
@@ -503,6 +531,24 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
}
+ test("B = 1 cbind A") {
+ val inCoreA = dense((1, 2), (3, 4))
+ val control = dense((1, 1, 2), (1, 3, 4))
+
+ val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+ (control - (1 cbind drmA) ).norm should be < 1e-10
+ }
+
+ test("B = A cbind 1") {
+ val inCoreA = dense((1, 2), (3, 4))
+ val control = dense((1, 2, 1), (3, 4, 1))
+
+ val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+ (control - (drmA cbind 1) ).norm should be < 1e-10
+ }
+
test("B = A + 1.0") {
val inCoreA = dense((1, 2), (2, 3), (3, 4))
val controlB = inCoreA + 1.0
@@ -547,4 +593,46 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
(10 * drmA - (10 *: drmA)).norm shouldBe 0
}
+
+ test("A * A -> sqr(A) rewrite ") {
+ val mxA = dense(
+ (1, 2, 3),
+ (3, 4, 5),
+ (7, 8, 9)
+ )
+
+ val mxAAControl = mxA * mxA
+
+ val drmA = drmParallelize(mxA, 2)
+ val drmAA = drmA * drmA
+
+ val optimized = drmAA.context.engine.optimizerRewrite(drmAA)
+ println(s"optimized:$optimized")
+ optimized.isInstanceOf[OpAewUnaryFunc[Int]] shouldBe true
+
+ (mxAAControl -= drmAA).norm should be < 1e-10
+ }
+
+ test("B = 1 + 2 * (A * A) ew unary function fusion") {
+ val mxA = dense(
+ (1, 2, 3),
+ (3, 0, 5)
+ )
+ val controlB = mxA.cloned := { (x) => 1 + 2 * x * x}
+
+ val drmA = drmParallelize(mxA, 2)
+
+ // We need to use parentheses, otherwise the optimizer will see it as (2A) * (A), which would not
+ // be rewritten as 2 * sqr(A). It is not (yet) clever enough to try commutativity optimizations.
+ val drmB = 1 + 2 * (drmA * drmA)
+
+ val optimized = mahoutCtx.engine.optimizerRewrite(drmB)
+ println(s"optimizer rewritten:$optimized")
+ optimized.isInstanceOf[OpAewUnaryFuncFusion[Int]] shouldBe true
+
+ (controlB - drmB).norm should be < 1e-10
+
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
index d7b22d9..5c8a310 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
@@ -24,6 +24,8 @@ import org.apache.mahout.test.MahoutSuite
import org.apache.mahout.math.{RandomAccessSparseVector, SequentialAccessSparseVector, Matrices}
import org.apache.mahout.common.RandomUtils
+import scala.util.Random
+
class MatrixOpsSuite extends FunSuite with MahoutSuite {
@@ -93,12 +95,40 @@ class MatrixOpsSuite extends FunSuite with MahoutSuite {
val e = eye(5)
- printf("I(5)=\n%s\n", e)
+ println(s"I(5)=\n$e")
a(0 to 1, 1 to 2) = dense((3, 2), (2, 3))
a(0 to 1, 1 to 2) := dense((3, 2), (2, 3))
+ println(s"a=$a")
+
+ a(0 to 1, 1 to 2) := { _ => 45}
+ println(s"a=$a")
+
+// a(0 to 1, 1 to 2) ::= { _ => 44}
+ println(s"a=$a")
+
+ // Sparse assignment to a sparse block
+ val c = sparse(0 -> 1 :: Nil, 2 -> 2 :: Nil, 1 -> 5 :: Nil)
+ val d = c.cloned
+
+ println(s"d=$d")
+ d.ncol shouldBe 3
+ d(::, 1 to 2) ::= { _ => 4}
+ println(s"d=$d")
+ d(::, 1 to 2).sum shouldBe 8
+
+ d ::= {_ => 5}
+ d.sum shouldBe 15
+
+ val f = c.cloned.t
+ f ::= {_ => 6}
+ f.sum shouldBe 18
+
+ val g = c.cloned
+ g(::, 1 until g.nrow) ::= { x => if (x <= 0) 0.0 else 1.0}
+ g.sum shouldBe 3
}
test("sparse") {
@@ -182,4 +212,5 @@ class MatrixOpsSuite extends FunSuite with MahoutSuite {
}
+
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
index a943c5f..79d2899 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
@@ -17,9 +17,16 @@
package org.apache.mahout.math.scalabindings
+import java.util
+
+import org.apache.log4j.Level
+import org.apache.mahout.math._
import org.scalatest.FunSuite
import RLikeOps._
import org.apache.mahout.test.MahoutSuite
+import org.apache.mahout.logging._
+import scala.collection.JavaConversions._
+import scala.util.Random
class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
@@ -63,6 +70,10 @@ class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
}
+ test("Uniform view") {
+ val mxUnif = Matrices.symmetricUniformView(5000000, 5000000, 1234)
+ }
+
/** Test dsl overloads over scala operations over matrices */
test ("scalarOps") {
val a = dense(
@@ -77,4 +88,269 @@ class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
}
+ test("Multiplication experimental performance") {
+
+ getLog(MMul.getClass).setLevel(Level.DEBUG)
+
+ val d = 300
+ val n = 3
+
+ // Dense row-wise
+ val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
+ val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
+
+ val rnd = new Random(1234)
+
+ // Sparse rows
+ val mxAsr = (new SparseRowMatrix(d,
+ d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+ val mxBsr = (new SparseRowMatrix(d,
+ d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+ // Hanging sparse rows
+ val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+ val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+ // DIAGONAL
+ val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
+
+ def time(op: => Unit): Long = {
+ val ms = System.currentTimeMillis()
+ op
+ System.currentTimeMillis() - ms
+ }
+
+ def getMmulAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
+
+ var control: Matrix = null
+ var mmulVal: Matrix = null
+
+ val current = Stream.range(0, n).map { _ => time {control = mxA.times(mxB)} }.sum.toDouble / n
+ val experimental = Stream.range(0, n).map { _ => time {mmulVal = MMul(mxA, mxB, None)} }.sum.toDouble / n
+ (control - mmulVal).norm should be < 1e-10
+ current -> experimental
+ }
+
+ // Dense matrix tests.
+ println(s"Ad %*% Bd: ${getMmulAvgs(mxAd, mxBd, n)}")
+ println(s"Ad' %*% Bd: ${getMmulAvgs(mxAd.t, mxBd, n)}")
+ println(s"Ad %*% Bd': ${getMmulAvgs(mxAd, mxBd.t, n)}")
+ println(s"Ad' %*% Bd': ${getMmulAvgs(mxAd.t, mxBd.t, n)}")
+ println(s"Ad'' %*% Bd'': ${getMmulAvgs(mxAd.t.t, mxBd.t.t, n)}")
+ println
+
+ // Sparse row matrix tests.
+ println(s"Asr %*% Bsr: ${getMmulAvgs(mxAsr, mxBsr, n)}")
+ println(s"Asr' %*% Bsr: ${getMmulAvgs(mxAsr.t, mxBsr, n)}")
+ println(s"Asr %*% Bsr': ${getMmulAvgs(mxAsr, mxBsr.t, n)}")
+ println(s"Asr' %*% Bsr': ${getMmulAvgs(mxAsr.t, mxBsr.t, n)}")
+ println(s"Asr'' %*% Bsr'': ${getMmulAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
+ println
+
+ // Sparse matrix tests.
+ println(s"Asm %*% Bsm: ${getMmulAvgs(mxAs, mxBs, n)}")
+ println(s"Asm' %*% Bsm: ${getMmulAvgs(mxAs.t, mxBs, n)}")
+ println(s"Asm %*% Bsm': ${getMmulAvgs(mxAs, mxBs.t, n)}")
+ println(s"Asm' %*% Bsm': ${getMmulAvgs(mxAs.t, mxBs.t, n)}")
+ println(s"Asm'' %*% Bsm'': ${getMmulAvgs(mxAs.t.t, mxBs.t.t, n)}")
+ println
+
+ // Mixed sparse matrix tests.
+ println(s"Asm %*% Bsr: ${getMmulAvgs(mxAs, mxBsr, n)}")
+ println(s"Asm' %*% Bsr: ${getMmulAvgs(mxAs.t, mxBsr, n)}")
+ println(s"Asm %*% Bsr': ${getMmulAvgs(mxAs, mxBsr.t, n)}")
+ println(s"Asm' %*% Bsr': ${getMmulAvgs(mxAs.t, mxBsr.t, n)}")
+ println(s"Asm'' %*% Bsr'': ${getMmulAvgs(mxAs.t.t, mxBsr.t.t, n)}")
+ println
+
+ println(s"Asr %*% Bsm: ${getMmulAvgs(mxAsr, mxBs, n)}")
+ println(s"Asr' %*% Bsm: ${getMmulAvgs(mxAsr.t, mxBs, n)}")
+ println(s"Asr %*% Bsm': ${getMmulAvgs(mxAsr, mxBs.t, n)}")
+ println(s"Asr' %*% Bsm': ${getMmulAvgs(mxAsr.t, mxBs.t, n)}")
+ println(s"Asr'' %*% Bsm'': ${getMmulAvgs(mxAsr.t.t, mxBs.t.t, n)}")
+ println
+
+ // Mixed dense/sparse
+ println(s"Ad %*% Bsr: ${getMmulAvgs(mxAd, mxBsr, n)}")
+ println(s"Ad' %*% Bsr: ${getMmulAvgs(mxAd.t, mxBsr, n)}")
+ println(s"Ad %*% Bsr': ${getMmulAvgs(mxAd, mxBsr.t, n)}")
+ println(s"Ad' %*% Bsr': ${getMmulAvgs(mxAd.t, mxBsr.t, n)}")
+ println(s"Ad'' %*% Bsr'': ${getMmulAvgs(mxAd.t.t, mxBsr.t.t, n)}")
+ println
+
+ println(s"Asr %*% Bd: ${getMmulAvgs(mxAsr, mxBd, n)}")
+ println(s"Asr' %*% Bd: ${getMmulAvgs(mxAsr.t, mxBd, n)}")
+ println(s"Asr %*% Bd': ${getMmulAvgs(mxAsr, mxBd.t, n)}")
+ println(s"Asr' %*% Bd': ${getMmulAvgs(mxAsr.t, mxBd.t, n)}")
+ println(s"Asr'' %*% Bd'': ${getMmulAvgs(mxAsr.t.t, mxBd.t.t, n)}")
+ println
+
+ println(s"Ad %*% Bsm: ${getMmulAvgs(mxAd, mxBs, n)}")
+ println(s"Ad' %*% Bsm: ${getMmulAvgs(mxAd.t, mxBs, n)}")
+ println(s"Ad %*% Bsm': ${getMmulAvgs(mxAd, mxBs.t, n)}")
+ println(s"Ad' %*% Bsm': ${getMmulAvgs(mxAd.t, mxBs.t, n)}")
+ println(s"Ad'' %*% Bsm'': ${getMmulAvgs(mxAd.t.t, mxBs.t.t, n)}")
+ println
+
+ println(s"Asm %*% Bd: ${getMmulAvgs(mxAs, mxBd, n)}")
+ println(s"Asm' %*% Bd: ${getMmulAvgs(mxAs.t, mxBd, n)}")
+ println(s"Asm %*% Bd': ${getMmulAvgs(mxAs, mxBd.t, n)}")
+ println(s"Asm' %*% Bd': ${getMmulAvgs(mxAs.t, mxBd.t, n)}")
+ println(s"Asm'' %*% Bd'': ${getMmulAvgs(mxAs.t.t, mxBd.t.t, n)}")
+ println
+
+ // Diagonal cases
+ println(s"Ad %*% D: ${getMmulAvgs(mxAd, mxD, n)}")
+ println(s"Asr %*% D: ${getMmulAvgs(mxAsr, mxD, n)}")
+ println(s"Asm %*% D: ${getMmulAvgs(mxAs, mxD, n)}")
+ println(s"D %*% Ad: ${getMmulAvgs(mxD, mxAd, n)}")
+ println(s"D %*% Asr: ${getMmulAvgs(mxD, mxAsr, n)}")
+ println(s"D %*% Asm: ${getMmulAvgs(mxD, mxAs, n)}")
+ println
+
+ println(s"Ad' %*% D: ${getMmulAvgs(mxAd.t, mxD, n)}")
+ println(s"Asr' %*% D: ${getMmulAvgs(mxAsr.t, mxD, n)}")
+ println(s"Asm' %*% D: ${getMmulAvgs(mxAs.t, mxD, n)}")
+ println(s"D %*% Ad': ${getMmulAvgs(mxD, mxAd.t, n)}")
+ println(s"D %*% Asr': ${getMmulAvgs(mxD, mxAsr.t, n)}")
+ println(s"D %*% Asm': ${getMmulAvgs(mxD, mxAs.t, n)}")
+ println
+
+ // Self-squared cases
+ println(s"Ad %*% Ad': ${getMmulAvgs(mxAd, mxAd.t, n)}")
+ println(s"Ad' %*% Ad: ${getMmulAvgs(mxAd.t, mxAd, n)}")
+ println(s"Ad' %*% Ad'': ${getMmulAvgs(mxAd.t, mxAd.t.t, n)}")
+ println(s"Ad'' %*% Ad': ${getMmulAvgs(mxAd.t.t, mxAd.t, n)}")
+
+ }
+
+
+ test("elementwise experimental performance") {
+
+ val d = 500
+ val n = 3
+
+ // Dense row-wise
+ val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
+ val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
+
+ val rnd = new Random(1234)
+
+ // Sparse rows
+ val mxAsr = (new SparseRowMatrix(d,
+ d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+ val mxBsr = (new SparseRowMatrix(d,
+ d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+ // Hanging sparse rows
+ val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+ val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+ // DIAGONAL
+ val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
+
+ def time(op: => Unit): Long = {
+ val ms = System.currentTimeMillis()
+ op
+ System.currentTimeMillis() - ms
+ }
+
+ def getEWAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
+
+ var control: Matrix = null
+ var mmulVal: Matrix = null
+
+ val current = Stream.range(0, n).map { _ => time {control = mxA.plus(mxB)} }.sum.toDouble / n
+ val experimental = Stream.range(0, n).map { _ => time {mmulVal = mxA + mxB} }.sum.toDouble / n
+ (control - mmulVal).norm should be < 1e-10
+ current -> experimental
+ }
+
+ // Dense matrix tests.
+ println(s"Ad + Bd: ${getEWAvgs(mxAd, mxBd, n)}")
+ println(s"Ad' + Bd: ${getEWAvgs(mxAd.t, mxBd, n)}")
+ println(s"Ad + Bd': ${getEWAvgs(mxAd, mxBd.t, n)}")
+ println(s"Ad' + Bd': ${getEWAvgs(mxAd.t, mxBd.t, n)}")
+ println(s"Ad'' + Bd'': ${getEWAvgs(mxAd.t.t, mxBd.t.t, n)}")
+ println
+
+ // Sparse row matrix tests.
+ println(s"Asr + Bsr: ${getEWAvgs(mxAsr, mxBsr, n)}")
+ println(s"Asr' + Bsr: ${getEWAvgs(mxAsr.t, mxBsr, n)}")
+ println(s"Asr + Bsr': ${getEWAvgs(mxAsr, mxBsr.t, n)}")
+ println(s"Asr' + Bsr': ${getEWAvgs(mxAsr.t, mxBsr.t, n)}")
+ println(s"Asr'' + Bsr'': ${getEWAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
+ println
+
+ // Sparse matrix tests.
+ println(s"Asm + Bsm: ${getEWAvgs(mxAs, mxBs, n)}")
+ println(s"Asm' + Bsm: ${getEWAvgs(mxAs.t, mxBs, n)}")
+ println(s"Asm + Bsm': ${getEWAvgs(mxAs, mxBs.t, n)}")
+ println(s"Asm' + Bsm': ${getEWAvgs(mxAs.t, mxBs.t, n)}")
+ println(s"Asm'' + Bsm'': ${getEWAvgs(mxAs.t.t, mxBs.t.t, n)}")
+ println
+
+ // Mixed sparse matrix tests.
+ println(s"Asm + Bsr: ${getEWAvgs(mxAs, mxBsr, n)}")
+ println(s"Asm' + Bsr: ${getEWAvgs(mxAs.t, mxBsr, n)}")
+ println(s"Asm + Bsr': ${getEWAvgs(mxAs, mxBsr.t, n)}")
+ println(s"Asm' + Bsr': ${getEWAvgs(mxAs.t, mxBsr.t, n)}")
+ println(s"Asm'' + Bsr'': ${getEWAvgs(mxAs.t.t, mxBsr.t.t, n)}")
+ println
+
+ println(s"Asr + Bsm: ${getEWAvgs(mxAsr, mxBs, n)}")
+ println(s"Asr' + Bsm: ${getEWAvgs(mxAsr.t, mxBs, n)}")
+ println(s"Asr + Bsm': ${getEWAvgs(mxAsr, mxBs.t, n)}")
+ println(s"Asr' + Bsm': ${getEWAvgs(mxAsr.t, mxBs.t, n)}")
+ println(s"Asr'' + Bsm'': ${getEWAvgs(mxAsr.t.t, mxBs.t.t, n)}")
+ println
+
+ // Mixed dense/sparse
+ println(s"Ad + Bsr: ${getEWAvgs(mxAd, mxBsr, n)}")
+ println(s"Ad' + Bsr: ${getEWAvgs(mxAd.t, mxBsr, n)}")
+ println(s"Ad + Bsr': ${getEWAvgs(mxAd, mxBsr.t, n)}")
+ println(s"Ad' + Bsr': ${getEWAvgs(mxAd.t, mxBsr.t, n)}")
+ println(s"Ad'' + Bsr'': ${getEWAvgs(mxAd.t.t, mxBsr.t.t, n)}")
+ println
+
+ println(s"Asr + Bd: ${getEWAvgs(mxAsr, mxBd, n)}")
+ println(s"Asr' + Bd: ${getEWAvgs(mxAsr.t, mxBd, n)}")
+ println(s"Asr + Bd': ${getEWAvgs(mxAsr, mxBd.t, n)}")
+ println(s"Asr' + Bd': ${getEWAvgs(mxAsr.t, mxBd.t, n)}")
+ println(s"Asr'' + Bd'': ${getEWAvgs(mxAsr.t.t, mxBd.t.t, n)}")
+ println
+
+ println(s"Ad + Bsm: ${getEWAvgs(mxAd, mxBs, n)}")
+ println(s"Ad' + Bsm: ${getEWAvgs(mxAd.t, mxBs, n)}")
+ println(s"Ad + Bsm': ${getEWAvgs(mxAd, mxBs.t, n)}")
+ println(s"Ad' + Bsm': ${getEWAvgs(mxAd.t, mxBs.t, n)}")
+ println(s"Ad'' + Bsm'': ${getEWAvgs(mxAd.t.t, mxBs.t.t, n)}")
+ println
+
+ println(s"Asm + Bd: ${getEWAvgs(mxAs, mxBd, n)}")
+ println(s"Asm' + Bd: ${getEWAvgs(mxAs.t, mxBd, n)}")
+ println(s"Asm + Bd': ${getEWAvgs(mxAs, mxBd.t, n)}")
+ println(s"Asm' + Bd': ${getEWAvgs(mxAs.t, mxBd.t, n)}")
+ println(s"Asm'' + Bd'': ${getEWAvgs(mxAs.t.t, mxBd.t.t, n)}")
+ println
+
+ // Diagonal cases
+ println(s"Ad + D: ${getEWAvgs(mxAd, mxD, n)}")
+ println(s"Asr + D: ${getEWAvgs(mxAsr, mxD, n)}")
+ println(s"Asm + D: ${getEWAvgs(mxAs, mxD, n)}")
+ println(s"D + Ad: ${getEWAvgs(mxD, mxAd, n)}")
+ println(s"D + Asr: ${getEWAvgs(mxD, mxAsr, n)}")
+ println(s"D + Asm: ${getEWAvgs(mxD, mxAs, n)}")
+ println
+
+ println(s"Ad' + D: ${getEWAvgs(mxAd.t, mxD, n)}")
+ println(s"Asr' + D: ${getEWAvgs(mxAsr.t, mxD, n)}")
+ println(s"Asm' + D: ${getEWAvgs(mxAs.t, mxD, n)}")
+ println(s"D + Ad': ${getEWAvgs(mxD, mxAd.t, n)}")
+ println(s"D + Asr': ${getEWAvgs(mxD, mxAsr.t, n)}")
+ println(s"D + Asm': ${getEWAvgs(mxD, mxAs.t, n)}")
+ println
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
index 037f562..d264514 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
@@ -18,10 +18,12 @@
package org.apache.mahout.math.scalabindings
import org.scalatest.FunSuite
-import org.apache.mahout.math.{RandomAccessSparseVector, Vector}
+import org.apache.mahout.math.{SequentialAccessSparseVector, RandomAccessSparseVector, Vector}
import RLikeOps._
import org.apache.mahout.test.MahoutSuite
+import scala.util.Random
+
/** VectorOps Suite */
class VectorOpsSuite extends FunSuite with MahoutSuite {
@@ -79,4 +81,19 @@ class VectorOpsSuite extends FunSuite with MahoutSuite {
}
+ test("sparse assignment") {
+
+ val svec = new SequentialAccessSparseVector(30)
+ svec(1) = -0.5
+ svec(3) = 0.5
+ println(svec)
+
+ svec(1 until svec.length) ::= ( _ => 0)
+ println(svec)
+
+ svec.sum shouldBe 0
+
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java b/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
index e752422..a823d0b 100644
--- a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
@@ -19,13 +19,16 @@ package org.apache.mahout.math;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Maps;
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
import org.apache.mahout.math.function.*;
import java.util.Iterator;
import java.util.Map;
/**
- * A few universal implementations of convenience functions
+ * A few universal implementations of convenience functions for a JVM-backed matrix.
*/
public abstract class AbstractMatrix implements Matrix {
@@ -57,19 +60,24 @@ public abstract class AbstractMatrix implements Matrix {
@Override
public Iterator<MatrixSlice> iterateAll() {
return new AbstractIterator<MatrixSlice>() {
- private int slice;
+ private int row;
@Override
protected MatrixSlice computeNext() {
- if (slice >= numSlices()) {
+ if (row >= numRows()) {
return endOfData();
}
- int i = slice++;
+ int i = row++;
return new MatrixSlice(viewRow(i), i);
}
};
}
+ @Override
+ public Iterator<MatrixSlice> iterateNonEmpty() {
+ return iterator();
+ }
+
/**
* Abstracted out for the iterator
*
@@ -813,4 +821,12 @@ public abstract class AbstractMatrix implements Matrix {
return returnString + ("}");
}
}
+
+ @Override
+ public MatrixFlavor getFlavor() {
+ throw new UnsupportedOperationException("Flavor support not implemented for this matrix.");
+ }
+
+ ////////////// Matrix flavor trait ///////////////////
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/ConstantVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/ConstantVector.java b/math/src/main/java/org/apache/mahout/math/ConstantVector.java
index 86ab82b..847bf85 100644
--- a/math/src/main/java/org/apache/mahout/math/ConstantVector.java
+++ b/math/src/main/java/org/apache/mahout/math/ConstantVector.java
@@ -132,6 +132,11 @@ public class ConstantVector extends AbstractVector {
return new DenseVector(size());
}
+ @Override
+ public Vector like(int cardinality) {
+ return new DenseVector(cardinality);
+ }
+
/**
* Set the value at the given index, without checking bounds
*
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java b/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
index a1fd291..0b2e36b 100644
--- a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
+++ b/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
@@ -310,6 +310,11 @@ public class DelegatingVector implements Vector, LengthCachingVector {
}
@Override
+ public Vector like(int cardinality) {
+ return new DelegatingVector(delegate.like(cardinality));
+ }
+
+ @Override
public void setQuick(int index, double value) {
delegate.setQuick(index, value);
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
index 7f52c00..5c1ee12 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
@@ -17,6 +17,9 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
import java.util.Arrays;
/** Matrix of doubles implemented using a 2-d array */
@@ -175,5 +178,9 @@ public class DenseMatrix extends AbstractMatrix {
}
return new DenseVector(values[row], true);
}
-
+
+ @Override
+ public MatrixFlavor getFlavor() {
+ return MatrixFlavor.DENSELIKE;
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
index e9cf3f1..7252b9b 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
@@ -17,6 +17,8 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
/**
* Economy packaging for a dense symmetric in-core matrix.
*/
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/DenseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DenseVector.java b/math/src/main/java/org/apache/mahout/math/DenseVector.java
index 5b3dea7..3633e58 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseVector.java
@@ -136,6 +136,11 @@ public class DenseVector extends AbstractVector {
}
@Override
+ public Vector like(int cardinality) {
+ return new DenseVector(cardinality);
+ }
+
+ @Override
public void setQuick(int index, double value) {
invalidateCachedLength();
values[index] = value;
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java b/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
index 3e20a4a..070fad2 100644
--- a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
@@ -17,6 +17,9 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
import java.util.Iterator;
import java.util.NoSuchElementException;
@@ -223,6 +226,11 @@ public class DiagonalMatrix extends AbstractMatrix implements MatrixTimesOps {
}
@Override
+ public Vector like(int cardinality) {
+ return new DenseVector(cardinality);
+ }
+
+ @Override
public void setQuick(int index, double value) {
if (index == this.index) {
diagonal.set(this.index, value);
@@ -361,4 +369,10 @@ public class DiagonalMatrix extends AbstractMatrix implements MatrixTimesOps {
}
return m;
}
+
+ @Override
+ public MatrixFlavor getFlavor() {
+ return MatrixFlavor.DIAGONALLIKE;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java b/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
index ba09aa8..56600cd 100644
--- a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
@@ -437,6 +437,11 @@ public final class FileBasedSparseBinaryMatrix extends AbstractMatrix {
return new RandomAccessSparseVector(size());
}
+ @Override
+ public Vector like(int cardinality) {
+ return new RandomAccessSparseVector(cardinality);
+ }
+
/**
* Copy the vector for fast operations.
*
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
index 2a13611..9028e23 100644
--- a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
+++ b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
@@ -17,6 +17,9 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
import org.apache.mahout.math.function.IntIntFunction;
/**
@@ -29,6 +32,7 @@ class FunctionalMatrixView extends AbstractMatrix {
*/
private IntIntFunction gf;
private boolean denseLike;
+ private MatrixFlavor flavor;
public FunctionalMatrixView(int rows, int columns, IntIntFunction gf) {
this(rows, columns, gf, false);
@@ -42,6 +46,7 @@ class FunctionalMatrixView extends AbstractMatrix {
super(rows, columns);
this.gf = gf;
this.denseLike = denseLike;
+ flavor = new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.BLOCKIFIED, denseLike);
}
@Override
@@ -87,4 +92,8 @@ class FunctionalMatrixView extends AbstractMatrix {
return new MatrixVectorView(this, 0, column, 1, 0, denseLike);
}
+ @Override
+ public MatrixFlavor getFlavor() {
+ return flavor;
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/Matrices.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Matrices.java b/math/src/main/java/org/apache/mahout/math/Matrices.java
index 4a0c50c..fc45a16 100644
--- a/math/src/main/java/org/apache/mahout/math/Matrices.java
+++ b/math/src/main/java/org/apache/mahout/math/Matrices.java
@@ -17,7 +17,9 @@
package org.apache.mahout.math;
+import com.google.common.base.Preconditions;
import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.IntIntFunction;
@@ -63,16 +65,14 @@ public final class Matrices {
* @return transposed view of original matrix
*/
public static final Matrix transposedView(final Matrix m) {
- IntIntFunction tf = new IntIntFunction() {
- @Override
- public double apply(int row, int col) {
- return m.getQuick(col, row);
- }
- };
- // TODO: Matrix api does not support denseLike() interrogation.
- // so our guess has to be rough here.
- return functionalMatrixView(m.numCols(), m.numRows(), tf, m instanceof DenseMatrix);
+ Preconditions.checkArgument(!(m instanceof SparseColumnMatrix));
+
+ if (m instanceof TransposedMatrixView) {
+ return ((TransposedMatrixView) m).getDelegate();
+ } else {
+ return new TransposedMatrixView(m);
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/Matrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Matrix.java b/math/src/main/java/org/apache/mahout/math/Matrix.java
index afdbac5..47ba5cf 100644
--- a/math/src/main/java/org/apache/mahout/math/Matrix.java
+++ b/math/src/main/java/org/apache/mahout/math/Matrix.java
@@ -17,6 +17,7 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.MatrixFlavor;
import org.apache.mahout.math.function.DoubleDoubleFunction;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.VectorFunction;
@@ -403,4 +404,10 @@ public interface Matrix extends Cloneable, VectorIterable {
* @return A vector that shares storage with the original matrix.
*/
Vector viewDiagonal();
+
+ /**
+ * Get the matrix's structural flavor (performance hints for operations). This is an optional
+ * operation and may throw {@link java.lang.UnsupportedOperationException}.
+ */
+ MatrixFlavor getFlavor();
}
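Flavors let code such as MMul pick an algorithm from the operands' structure instead of instanceof chains. A sketch of interrogating them (assuming the accessors exposed by the new flavor package):

  import org.apache.mahout.math.{DenseMatrix, SparseRowMatrix}

  val flavorA = new DenseMatrix(10, 10).getFlavor      // MatrixFlavor.DENSELIKE
  val flavorB = new SparseRowMatrix(10, 10).getFlavor  // MatrixFlavor.SPARSELIKE
  // dispatch on e.g. flavorA.getStructure and flavorA.isDense from here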
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
index 074d7a6..52ae722 100644
--- a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
@@ -211,6 +211,11 @@ public class MatrixVectorView extends AbstractVector {
return matrix.like(size(), 1).viewColumn(0);
}
+ @Override
+ public Vector like(int cardinality) {
+ return matrix.like(cardinality, 1).viewColumn(0);
+ }
+
/**
* Set the value at the given index, without checking bounds
*
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/MatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixView.java b/math/src/main/java/org/apache/mahout/math/MatrixView.java
index e2f7f48..86760d5 100644
--- a/math/src/main/java/org/apache/mahout/math/MatrixView.java
+++ b/math/src/main/java/org/apache/mahout/math/MatrixView.java
@@ -17,6 +17,8 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+
/** Implements subset view of a Matrix */
public class MatrixView extends AbstractMatrix {
@@ -151,4 +153,8 @@ public class MatrixView extends AbstractMatrix {
return new VectorView(matrix.viewRow(row + offset[ROW]), offset[COL], columnSize());
}
+ @Override
+ public MatrixFlavor getFlavor() {
+ return matrix.getFlavor();
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/NamedVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/NamedVector.java b/math/src/main/java/org/apache/mahout/math/NamedVector.java
index 0bf49c8..d4fa609 100644
--- a/math/src/main/java/org/apache/mahout/math/NamedVector.java
+++ b/math/src/main/java/org/apache/mahout/math/NamedVector.java
@@ -177,6 +177,11 @@ public class NamedVector implements Vector {
}
@Override
+ public Vector like(int cardinality) {
+ return new NamedVector(delegate.like(cardinality), name);
+ }
+
+ @Override
public Vector minus(Vector x) {
return delegate.minus(x);
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java b/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
index f34f2b0..a76f78c 100644
--- a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
@@ -204,6 +204,11 @@ public class PermutedVectorView extends AbstractVector {
return vector.like();
}
+ @Override
+ public Vector like(int cardinality) {
+ return vector.like(cardinality);
+ }
+
/**
* Set the value at the given index, without checking bounds
*
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
index dbe5d3a..3efac7e 100644
--- a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
@@ -142,6 +142,11 @@ public class RandomAccessSparseVector extends AbstractVector {
}
@Override
+ public Vector like(int cardinality) {
+ return new RandomAccessSparseVector(cardinality, values.size());
+ }
+
+ @Override
public int getNumNondefaultElements() {
return values.size();
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
index 331662c..f7d67a7 100644
--- a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
@@ -180,6 +180,11 @@ public class SequentialAccessSparseVector extends AbstractVector {
}
@Override
+ public Vector like(int cardinality) {
+ return new SequentialAccessSparseVector(cardinality);
+ }
+
+ @Override
public int getNumNondefaultElements() {
return values.getNumMappings();
}
@@ -214,6 +219,8 @@ public class SequentialAccessSparseVector extends AbstractVector {
@Override
public Iterator<Element> iterateNonZero() {
+
+ // TODO: this is a bug, since nonDefaultIterator doesn't honor the non-zero contract.
return new NonDefaultIterator();
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
index f62d553..eeffc78 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
@@ -17,9 +17,13 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
/**
* sparse matrix with general element values whose columns are accessible quickly. Implemented as a column array of
* SparseVectors.
+ *
+ * @deprecated tons of inconsistencies. Use a transposed view of SparseRowMatrix for fast column-wise iteration.
*/
public class SparseColumnMatrix extends AbstractMatrix {
@@ -31,11 +35,19 @@ public class SparseColumnMatrix extends AbstractMatrix {
* @param columns a RandomAccessSparseVector[] array of columns
* @param columnVectors
*/
- public SparseColumnMatrix(int rows, int columns, RandomAccessSparseVector[] columnVectors) {
+ public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors) {
+ this(rows, columns, columnVectors, false);
+ }
+
+ public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors, boolean shallow) {
super(rows, columns);
- this.columnVectors = columnVectors.clone();
- for (int col = 0; col < columnSize(); col++) {
- this.columnVectors[col] = this.columnVectors[col].clone();
+ if (shallow) {
+ this.columnVectors = columnVectors;
+ } else {
+ this.columnVectors = columnVectors.clone();
+ for (int col = 0; col < columnSize(); col++) {
+ this.columnVectors[col] = this.columnVectors[col].clone();
+ }
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
index 88e15a0..bf4f1a0 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
@@ -18,6 +18,8 @@
package org.apache.mahout.math;
import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
import org.apache.mahout.math.function.DoubleDoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.IntObjectProcedure;
@@ -40,11 +42,23 @@ public class SparseMatrix extends AbstractMatrix {
* @param columns
* @param rowVectors
*/
- public SparseMatrix(int rows, int columns, Map<Integer, RandomAccessSparseVector> rowVectors) {
+ public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors) {
+ this(rows, columns, rowVectors, false);
+ }
+
+ public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors, boolean shallow) {
+
+ // Why is this passing in a map? Iterating it is pretty inefficient compared to simple lists...
super(rows, columns);
this.rowVectors = new OpenIntObjectHashMap<Vector>();
- for (Map.Entry<Integer, RandomAccessSparseVector> entry : rowVectors.entrySet()) {
- this.rowVectors.put(entry.getKey(), entry.getValue().clone());
+ if (shallow) {
+ for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
+ this.rowVectors.put(entry.getKey(), entry.getValue());
+ }
+ } else {
+ for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
+ this.rowVectors.put(entry.getKey(), entry.getValue().clone());
+ }
}
}
@@ -66,7 +80,11 @@ public class SparseMatrix extends AbstractMatrix {
}
@Override
- public Iterator<MatrixSlice> iterator() {
+ public int numSlices() {
+ return rowVectors.size();
+ }
+
+ public Iterator<MatrixSlice> iterateNonEmpty() {
final IntArrayList keys = new IntArrayList(rowVectors.size());
rowVectors.keys(keys);
return new AbstractIterator<MatrixSlice>() {
@@ -221,4 +239,8 @@ public class SparseMatrix extends AbstractMatrix {
return rowVectors.keys();
}
+ @Override
+ public MatrixFlavor getFlavor() {
+ return MatrixFlavor.SPARSEROWLIKE;
+ }
}
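The new `shallow` flag trades copy cost for aliasing. A minimal sketch of the difference (Scala; contents hypothetical):

    import java.util.{HashMap => JHashMap}
    import org.apache.mahout.math.{RandomAccessSparseVector, SparseMatrix, Vector}

    val rows = new JHashMap[Integer, Vector]()
    val v0 = new RandomAccessSparseVector(3)
    rows.put(0, v0)

    val deep = new SparseMatrix(3, 3, rows)           // clones each row vector
    val shallow = new SparseMatrix(3, 3, rows, true)  // wraps v0 directly

    v0.setQuick(1, 7.0)
    // deep.get(0, 1) is still 0.0; shallow.get(0, 1) now reads 7.0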
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
index 3021f3b..6e06769 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
@@ -17,6 +17,8 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
import org.apache.mahout.math.function.Functions;
/**
@@ -226,4 +228,9 @@ public class SparseRowMatrix extends AbstractMatrix {
}
}
}
+
+ @Override
+ public MatrixFlavor getFlavor() {
+ return MatrixFlavor.SPARSELIKE;
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java b/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
new file mode 100644
index 0000000..c67cb47
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+import org.apache.mahout.math.function.DoubleDoubleFunction;
+import org.apache.mahout.math.function.DoubleFunction;
+
+/**
+ * Transposed view of a matrix; delegates all access to the wrapped matrix with row/column indices swapped.
+ */
+class TransposedMatrixView extends AbstractMatrix {
+
+ private Matrix m;
+
+ public TransposedMatrixView(Matrix m) {
+ super(m.numCols(), m.numRows());
+ this.m = m;
+ }
+
+ @Override
+ public Matrix assignColumn(int column, Vector other) {
+ m.assignRow(column,other);
+ return this;
+ }
+
+ @Override
+ public Matrix assignRow(int row, Vector other) {
+ m.assignColumn(row,other);
+ return this;
+ }
+
+ @Override
+ public double getQuick(int row, int column) {
+ return m.getQuick(column,row);
+ }
+
+ @Override
+ public Matrix like() {
+ return m.like(rows, columns);
+ }
+
+ @Override
+ public Matrix like(int rows, int columns) {
+ return m.like(rows,columns);
+ }
+
+ @Override
+ public void setQuick(int row, int column, double value) {
+ m.setQuick(column, row, value);
+ }
+
+ @Override
+ public Vector viewRow(int row) {
+ return m.viewColumn(row);
+ }
+
+ @Override
+ public Vector viewColumn(int column) {
+ return m.viewRow(column);
+ }
+
+ @Override
+ public Matrix assign(double value) {
+ return m.assign(value);
+ }
+
+ @Override
+ public Matrix assign(Matrix other, DoubleDoubleFunction function) {
+ if (other instanceof TransposedMatrixView) {
+ m.assign(((TransposedMatrixView) other).m, function);
+ } else {
+ m.assign(new TransposedMatrixView(other), function);
+ }
+ return this;
+ }
+
+ @Override
+ public Matrix assign(Matrix other) {
+ if (other instanceof TransposedMatrixView) {
+ return m.assign(((TransposedMatrixView) other).m);
+ } else {
+ return m.assign(new TransposedMatrixView(other));
+ }
+ }
+
+ @Override
+ public Matrix assign(DoubleFunction function) {
+ return m.assign(function);
+ }
+
+ @Override
+ public MatrixFlavor getFlavor() {
+ return flavor;
+ }
+
+ private MatrixFlavor flavor = new MatrixFlavor() {
+ @Override
+ public BackEnum getBacking() {
+ return m.getFlavor().getBacking();
+ }
+
+ @Override
+ public TraversingStructureEnum getStructure() {
+ TraversingStructureEnum structure = m.getFlavor().getStructure();
+ switch (structure) {
+ case COLWISE:
+ return TraversingStructureEnum.ROWWISE;
+ case SPARSECOLWISE:
+ return TraversingStructureEnum.SPARSEROWWISE;
+ case ROWWISE:
+ return TraversingStructureEnum.COLWISE;
+ case SPARSEROWWISE:
+ return TraversingStructureEnum.SPARSECOLWISE;
+ default:
+ return structure;
+ }
+ }
+
+ @Override
+ public boolean isDense() {
+ return m.getFlavor().isDense();
+ }
+ };
+
+ Matrix getDelegate() {
+ return m;
+ }
+
+}
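A sketch of the view's write-through semantics (Scala), consistent with the MatricesTest change later in this commit:

    import org.apache.mahout.math.{DenseMatrix, Matrices, Matrix}

    val a: Matrix = new DenseMatrix(2, 3)
    val at: Matrix = Matrices.transposedView(a)   // 3 x 2, no data copied

    at.setQuick(2, 1, 5.0)                        // writes through
    assert(a.getQuick(1, 2) == 5.0)

    // Density of views now follows the delegate rather than a fixed answer:
    assert(at.viewRow(0).isDense == a.viewColumn(0).isDense)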
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java b/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
index a0cb3cd..29fa6a0 100644
--- a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
+++ b/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
@@ -17,6 +17,10 @@
package org.apache.mahout.math;
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
/**
*
* Quick and dirty implementation of some {@link org.apache.mahout.math.Matrix} methods
@@ -148,4 +152,9 @@ public class UpperTriangular extends AbstractMatrix {
return values;
}
+ @Override
+ public MatrixFlavor getFlavor() {
+ // We consider this a vector-backed but dense matrix for mmul and similar purposes.
+ return new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, true);
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/Vector.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/Vector.java b/math/src/main/java/org/apache/mahout/math/Vector.java
index 0d1a003..4480b0a 100644
--- a/math/src/main/java/org/apache/mahout/math/Vector.java
+++ b/math/src/main/java/org/apache/mahout/math/Vector.java
@@ -190,6 +190,14 @@ public interface Vector extends Cloneable {
Vector like();
/**
+ * Return a new empty vector of the same underlying class as the receiver with given cardinality
+ *
+ * @param cardinality cardinality of the new vector
+ * @return a new, empty vector of the same underlying class as the receiver
+ */
+ Vector like(int cardinality);
+
+ /**
* Return a new vector containing the element by element difference of the recipient and the argument
*
* @param x a Vector
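A sketch of the new contract (Scala; assumes, as elsewhere in this commit, that each concrete vector class implements the override):

    import org.apache.mahout.math.{DenseVector, SequentialAccessSparseVector, Vector}

    val sparse: Vector = new SequentialAccessSparseVector(100)
    val dense: Vector = new DenseVector(100)

    // Same underlying class as the receiver, new cardinality, empty:
    val s10 = sparse.like(10)   // a SequentialAccessSparseVector of size 10
    val d10 = dense.like(10)    // a DenseVector of size 10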
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/VectorIterable.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorIterable.java b/math/src/main/java/org/apache/mahout/math/VectorIterable.java
index 451c589..8414fdb 100644
--- a/math/src/main/java/org/apache/mahout/math/VectorIterable.java
+++ b/math/src/main/java/org/apache/mahout/math/VectorIterable.java
@@ -21,8 +21,12 @@ import java.util.Iterator;
public interface VectorIterable extends Iterable<MatrixSlice> {
+ /* Iterate all rows in order */
Iterator<MatrixSlice> iterateAll();
+ /* Iterate all non-empty rows in arbitrary order */
+ Iterator<MatrixSlice> iterateNonEmpty();
+
int numSlices();
int numRows();
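A sketch of consuming the new iterator (Scala); for sparse-row-wise matrices this skips rows that were never materialized:

    import org.apache.mahout.math.VectorIterable

    def sumNonEmpty(vi: VectorIterable): Double = {
      var sum = 0.0
      val it = vi.iterateNonEmpty()   // arbitrary order, missing rows skipped
      while (it.hasNext) sum += it.next().vector().zSum()
      sum
    }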
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/VectorView.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/VectorView.java b/math/src/main/java/org/apache/mahout/math/VectorView.java
index b503712..d61a038 100644
--- a/math/src/main/java/org/apache/mahout/math/VectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/VectorView.java
@@ -69,6 +69,11 @@ public class VectorView extends AbstractVector {
}
@Override
+ public Vector like(int cardinality) {
+ return vector.like(cardinality);
+ }
+
+ @Override
public double getQuick(int index) {
return vector.getQuick(offset + index);
}
@@ -122,7 +127,7 @@ public class VectorView extends AbstractVector {
while (it.hasNext()) {
Element el = it.next();
if (isInView(el.index()) && el.get() != 0) {
- Element decorated = vector.getElement(el.index());
+ Element decorated = el; /* vector.getElement(el.index()); */
return new DecoratorElement(decorated);
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java b/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
new file mode 100644
index 0000000..1782f04
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.flavor;
+
+/**
+ * Matrix backends
+ */
+public enum BackEnum {
+ JVMMEM,
+ NETLIB_BLAS
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java b/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
new file mode 100644
index 0000000..2b5c444
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.flavor;
+
+/** A set of matrix structure properties that I denote as "flavor" (by analogy to quarks) */
+public interface MatrixFlavor {
+
+ /**
+ * The backing of the matrix -- e.g. Java memory, LAPACK/ATLAS, MAGMA etc.
+ */
+ BackEnum getBacking();
+
+ /**
+ * Structure flavors
+ */
+ TraversingStructureEnum getStructure();
+
+ boolean isDense();
+
+ /**
+ * This is the default flavor for {@link org.apache.mahout.math.DenseMatrix}-like structures.
+ */
+ static final MatrixFlavor DENSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, true);
+ /**
+ * This is the default flavor for {@link org.apache.mahout.math.SparseRowMatrix}-like structures.
+ */
+ static final MatrixFlavor SPARSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, false);
+
+ /**
+ * This is the default flavor for {@link org.apache.mahout.math.SparseMatrix}-like structures, i.e. sparse matrix blocks
+ * where some, perhaps most, rows may be missing entirely.
+ */
+ static final MatrixFlavor SPARSEROWLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.SPARSEROWWISE, false);
+
+ /**
+ * This is the default flavor for {@link org.apache.mahout.math.DiagonalMatrix} and the like.
+ */
+ static final MatrixFlavor DIAGONALLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, false);
+
+ static final class FlavorImpl implements MatrixFlavor {
+ private BackEnum pBacking;
+ private TraversingStructureEnum pStructure;
+ private boolean pDense;
+
+ public FlavorImpl(BackEnum backing, TraversingStructureEnum structure, boolean dense) {
+ pBacking = backing;
+ pStructure = structure;
+ pDense = dense;
+ }
+
+ @Override
+ public BackEnum getBacking() {
+ return pBacking;
+ }
+
+ @Override
+ public TraversingStructureEnum getStructure() {
+ return pStructure;
+ }
+
+ @Override
+ public boolean isDense() {
+ return pDense;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
----------------------------------------------------------------------
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java b/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
new file mode 100644
index 0000000..13c2cf4
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.flavor;
+
+/** Matrix traversal structure hint. */
+public enum TraversingStructureEnum {
+
+ UNKNOWN,
+
+ /**
+ * Backing vectors are directly available as row views.
+ */
+ ROWWISE,
+
+ /**
+ * Column vectors are directly available as column views.
+ */
+ COLWISE,
+
+ /**
+ * Only some row-wise vectors are actually present (use iterateNonEmpty). Corresponds to
+ * {@link org.apache.mahout.math.SparseMatrix}.
+ */
+ SPARSEROWWISE,
+
+ SPARSECOLWISE,
+
+ SPARSEHASH,
+
+ VECTORBACKED,
+
+ BLOCKIFIED
+}
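A sketch of how a consumer might dispatch on the hint (Scala; `getFlavor` per the Matrix additions in this commit):

    import org.apache.mahout.math.Matrix
    import org.apache.mahout.math.flavor.TraversingStructureEnum

    def traverse(m: Matrix): Unit = m.getFlavor.getStructure match {
      case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
        // Column views are cheap here.
        for (j <- 0 until m.numCols) println(m.viewColumn(j).zSum())
      case _ =>
        // Default to row-wise traversal.
        for (i <- 0 until m.numRows) println(m.viewRow(i).zSum())
    }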
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math/src/test/java/org/apache/mahout/math/MatricesTest.java
----------------------------------------------------------------------
diff --git a/math/src/test/java/org/apache/mahout/math/MatricesTest.java b/math/src/test/java/org/apache/mahout/math/MatricesTest.java
index 1b6169e..9405429 100644
--- a/math/src/test/java/org/apache/mahout/math/MatricesTest.java
+++ b/math/src/test/java/org/apache/mahout/math/MatricesTest.java
@@ -65,8 +65,8 @@ public class MatricesTest extends MahoutTestCase {
m.set(1, 1, 33.0);
Matrix mt = Matrices.transposedView(m);
- assertTrue(!mt.viewColumn(0).isDense());
- assertTrue(!mt.viewRow(0).isDense());
+ assertTrue(mt.viewColumn(0).isDense() == m.viewRow(0).isDense());
+ assertTrue(mt.viewRow(0).isDense() == m.viewColumn(0).isDense());
m = new DenseMatrix(10,10);
m.set(1, 1, 33.0);
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java b/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
index 1a6ff16..de5e216 100644
--- a/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
+++ b/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
@@ -133,6 +133,11 @@ public class DistributedRowMatrix implements VectorIterable, Configurable {
}
@Override
+ public Iterator<MatrixSlice> iterateNonEmpty() {
+ return iterator();
+ }
+
+ @Override
public Iterator<MatrixSlice> iterateAll() {
try {
Path pathPattern = rowPath;
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java b/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
index 7033efe..af79cb4 100644
--- a/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
+++ b/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
@@ -586,6 +586,11 @@ public class GivensThinSolver {
}
@Override
+ public Vector like(int cardinality) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
public void setQuick(int index, double value) {
viewed.setQuick(rowNum, index, value);
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 5ffc18c..4d0615a 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -12,13 +12,14 @@ class MahoutSparkILoop extends SparkILoop {
private val postInitScript =
"import org.apache.mahout.math._" ::
- "import scalabindings._" ::
- "import RLikeOps._" ::
- "import drm._" ::
- "import RLikeDrmOps._" ::
- "import org.apache.mahout.sparkbindings._" ::
- "import collection.JavaConversions._" ::
- Nil
+ "import scalabindings._" ::
+ "import RLikeOps._" ::
+ "import drm._" ::
+ "import RLikeDrmOps._" ::
+ "import decompositions._" ::
+ "import org.apache.mahout.sparkbindings._" ::
+ "import collection.JavaConversions._" ::
+ Nil
override protected def postInitialization() {
super.postInitialization()
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/pom.xml
----------------------------------------------------------------------
diff --git a/spark/pom.xml b/spark/pom.xml
index 33e0d1b..7155115 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -119,6 +119,22 @@
</executions>
</plugin>
+ <!-- Create a test jar so other modules can reuse the math test utility classes.
+ DO NOT REMOVE! The testing framework is useful in subordinate/contrib projects!
+ Please contact @dlyubimov with questions.
+ -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ <phase>package</phase>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala b/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
index 5bbccb1..0aba319 100644
--- a/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.mahout.common
import scala.reflect.ClassTag
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala b/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
index f5f87d7..c949f92 100644
--- a/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
@@ -17,10 +17,12 @@
package org.apache.mahout.common
+import org.apache.spark.SparkContext
+
/** High level Hadoop version-specific hdfs manipulations we need in context of our operations. */
trait HDFSUtil {
/** Read DRM header information off (H)DFS. */
- def readDrmHeader(path:String):DrmMetadata
+ def readDrmHeader(path:String)(implicit sc:SparkContext):DrmMetadata
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
index 047104a..399508d 100644
--- a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
@@ -17,10 +17,10 @@
package org.apache.mahout.common
-
import org.apache.hadoop.io.{Writable, SequenceFile}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.conf.Configuration
+import org.apache.spark.SparkContext
import collection._
import JavaConversions._
@@ -30,14 +30,16 @@ import JavaConversions._
*/
object Hadoop1HDFSUtil extends HDFSUtil {
- /**
- * Read the header of a sequence file and determine the Key and Value type
- * @param path
- * @return
- */
- def readDrmHeader(path: String): DrmMetadata = {
+
+ /** Read DRM header information off (H)DFS. */
+ override def readDrmHeader(path: String)(implicit sc: SparkContext): DrmMetadata = {
+
val dfsPath = new Path(path)
- val fs = dfsPath.getFileSystem(new Configuration())
+
+ val fs = dfsPath.getFileSystem(sc.hadoopConfiguration)
+
+ // Apparently getFileSystem() doesn't set the conf, so set it explicitly.
+ fs.setConf(sc.hadoopConfiguration)
val partFilePath:Path = fs.listStatus(dfsPath)
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
index 595cd66..41e966b 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
@@ -24,51 +24,59 @@ import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
import scalabindings._
import RLikeOps._
import org.apache.mahout.math.drm.logical._
-import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, DrmRddInput}
+import org.apache.mahout.sparkbindings.drm.{cpDrmGeneric2DrmRddInput, CheckpointedDrmSpark, DrmRddInput}
import org.apache.mahout.math._
+import scala.Predef
import scala.reflect.ClassTag
+import scala.reflect.classTag
import org.apache.spark.storage.StorageLevel
import org.apache.mahout.sparkbindings.blas._
import org.apache.hadoop.io._
-import scala.Some
-import scala.collection.JavaConversions._
+import collection._
+import JavaConversions._
import org.apache.mahout.math.drm._
import org.apache.mahout.math.drm.RLikeDrmOps._
import org.apache.spark.rdd.RDD
import org.apache.mahout.common.{Hadoop1HDFSUtil, HDFSUtil}
+
/** Spark-specific non-drm-method operations */
object SparkEngine extends DistributedEngine {
// By default, use Hadoop 1 utils
var hdfsUtils: HDFSUtil = Hadoop1HDFSUtil
- def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = {
+ def colSums[K: ClassTag](drm: CheckpointedDrm[K]): Vector = {
val n = drm.ncol
drm.rdd
+
// Throw away keys
.map(_._2)
+
// Fold() still doesn't work with Kryo, so work around it.
- .mapPartitions(iter => {
- val acc = ((new DenseVector(n): Vector) /: iter)((acc, v) => acc += v)
+ .mapPartitions(iter ⇒ {
+ val acc = ((new DenseVector(n): Vector) /: iter)((acc, v) ⇒ acc += v)
Iterator(acc)
})
+
// Since we preallocated new accumulator vector per partition, this must not cause any side
// effects now.
.reduce(_ += _)
}
- def numNonZeroElementsPerColumn[K:ClassTag](drm: CheckpointedDrm[K]): Vector = {
+ def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector = {
val n = drm.ncol
drm.rdd
+
// Throw away keys
.map(_._2)
+
// Fold() still doesn't work with Kryo, so work around it.
- .mapPartitions(iter => {
- val acc = ((new DenseVector(n): Vector) /: iter) { (acc, v) =>
- v.nonZeroes().foreach { elem => acc(elem.index) += 1 }
+ .mapPartitions(iter ⇒ {
+ val acc = ((new DenseVector(n): Vector) /: iter) { (acc, v) ⇒
+ v.nonZeroes().foreach { elem ⇒ acc(elem.index) += 1}
acc
}
Iterator(acc)
@@ -79,17 +87,25 @@ object SparkEngine extends DistributedEngine {
}
/** Engine-specific colMeans implementation based on a checkpoint. */
- override def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector =
+ override def colMeans[K: ClassTag](drm: CheckpointedDrm[K]): Vector =
if (drm.nrow == 0) drm.colSums() else drm.colSums() /= drm.nrow
override def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double =
drm.rdd
- // Compute sum of squares of each vector
- .map {
- case (key, v) => v dot v
+ // Compute sum of squares of each vector
+ .map {
+ case (key, v) ⇒ v dot v
}
- .reduce(_ + _)
+ .reduce(_ + _)
+
+ /** Optional engine-specific all-reduce tensor operation. */
+ override def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf:
+ BlockReduceFunc): Matrix = {
+
+ import drm._
+ drm.toBlockifiedDrmRdd(ncol = drm.ncol).map(bmf(_)).reduce(rf)
+ }
/**
* Perform default expression rewrite. Return physical plan that we can pass to exec(). <P>
@@ -104,10 +120,10 @@ object SparkEngine extends DistributedEngine {
def toPhysical[K: ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = {
// Spark-specific Physical Plan translation.
- val rdd = tr2phys(plan)
+ val rddInput = tr2phys(plan)
val newcp = new CheckpointedDrmSpark(
- rdd = rdd,
+ rddInput = rddInput,
_nrow = plan.nrow,
_ncol = plan.ncol,
_cacheStorageLevel = cacheHint2Spark(ch),
@@ -131,7 +147,13 @@ object SparkEngine extends DistributedEngine {
*
* @return DRM[Any] where Any is automatically translated to value type
*/
- def drmDfsRead (path: String, parMin:Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_] = {
+ def drmDfsRead(path: String, parMin: Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_] = {
+
+ // Require that context is actually Spark context.
+ require(sc.isInstanceOf[SparkDistributedContext], "Supplied context must be for the Spark backend.")
+
+ // Extract spark context -- we need it for some operations.
+ implicit val ssc = sc.asInstanceOf[SparkDistributedContext].sc
val drmMetadata = hdfsUtils.readDrmHeader(path)
val k2vFunc = drmMetadata.keyW2ValFunc
@@ -140,8 +162,8 @@ object SparkEngine extends DistributedEngine {
// Hadoop we must do it right after read operation).
val rdd = sc.sequenceFile(path, classOf[Writable], classOf[VectorWritable], minPartitions = parMin)
- // Immediately convert keys and value writables into value types.
- .map { case (wKey, wVec) => k2vFunc(wKey) -> wVec.get()}
+ // Immediately convert keys and value writables into value types.
+ .map { case (wKey, wVec) ⇒ k2vFunc(wKey) -> wVec.get()}
// Wrap into a DRM type with correct matrix row key class tag evident.
drmWrap(rdd = rdd, cacheHint = CacheHint.NONE)(drmMetadata.keyClassTag.asInstanceOf[ClassTag[Any]])
@@ -149,67 +171,141 @@ object SparkEngine extends DistributedEngine {
/** Parallelize in-core matrix as spark distributed matrix, using row ordinal indices as data set keys. */
def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext)
+ (implicit sc: DistributedContext)
: CheckpointedDrm[Int] = {
- new CheckpointedDrmSpark(rdd = parallelizeInCore(m, numPartitions))
+ new CheckpointedDrmSpark(rddInput = parallelizeInCore(m, numPartitions), _nrow = m.nrow, _ncol = m.ncol)
}
private[sparkbindings] def parallelizeInCore(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext): DrmRdd[Int] = {
+ (implicit sc: DistributedContext): DrmRdd[Int] = {
- val p = (0 until m.nrow).map(i => i -> m(i, ::))
+ val p = (0 until m.nrow).map(i => i → m(i, ::))
sc.parallelize(p, numPartitions)
}
/** Parallelize in-core matrix as spark distributed matrix, using row labels as a data set keys. */
def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext)
+ (implicit sc: DistributedContext)
: CheckpointedDrm[String] = {
val rb = m.getRowLabelBindings
- val p = for (i: String <- rb.keySet().toIndexedSeq) yield i -> m(rb(i), ::)
+ val p = for (i: String ← rb.keySet().toIndexedSeq) yield i → m(rb(i), ::)
- new CheckpointedDrmSpark(rdd = sc.parallelize(p, numPartitions))
+ new CheckpointedDrmSpark(rddInput = sc.parallelize(p, numPartitions), _nrow = m.nrow, _ncol = m.ncol)
}
/** This creates an empty DRM with specified number of partitions and cardinality. */
def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
- (implicit sc: DistributedContext): CheckpointedDrm[Int] = {
- val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part => {
+ (implicit sc: DistributedContext): CheckpointedDrm[Int] = {
+ val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part ⇒ {
val partNRow = (nrow - 1) / numPartitions + 1
val partStart = partNRow * part
val partEnd = Math.min(partStart + partNRow, nrow)
- for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
+ for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
})
new CheckpointedDrmSpark[Int](rdd, nrow, ncol)
}
def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
- (implicit sc: DistributedContext): CheckpointedDrm[Long] = {
- val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part => {
+ (implicit sc: DistributedContext): CheckpointedDrm[Long] = {
+ val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part ⇒ {
val partNRow = (nrow - 1) / numPartitions + 1
val partStart = partNRow * part
val partEnd = Math.min(partStart + partNRow, nrow)
- for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
+ for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
})
new CheckpointedDrmSpark[Long](rdd, nrow, ncol)
}
+ /**
+ * Convert a non-int-keyed matrix to an int-keyed one, optionally computing the mapping from old keys
+ * to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
+ */
+ override def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) = {
+ if (classTag[K] == ClassTag.Int) {
+ drmX.asInstanceOf[DrmLike[Int]] → None
+ } else {
+
+ val drmXcp = drmX.checkpoint(CacheHint.MEMORY_ONLY)
+ val ncol = drmXcp.asInstanceOf[CheckpointedDrmSpark[K]]._ncol
+ val nrow = drmXcp.asInstanceOf[CheckpointedDrmSpark[K]]._nrow
+
+ // Compute sequential int key numbering.
+ val (intRdd, keyMap) = blas.rekeySeqInts(rdd = drmXcp.rdd, computeMap = computeMap)
+
+ // Convert computed key mapping to a matrix.
+ val mxKeyMap = keyMap.map { rdd =>
+ drmWrap(rdd = rdd.map { case (key, ordinal) ⇒ key → (dvec(ordinal):Vector)}, ncol = 1, nrow = nrow)
+ }
+
+
+ drmWrap(rdd = intRdd, ncol = ncol) → mxKeyMap
+ }
+
+ }
+
+
+ /**
+ * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+ * @param drmX matrix to sample from
+ * @param fraction approximate fraction of rows to sample
+ * @param replacement whether to sample with replacement
+ * @tparam K row key type
+ * @return the sampled DRM
+ */
+ override def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean): DrmLike[K] = {
+
+ // We do want to take ncol if it is already computed; if not, we don't want to trigger computation
+ // here.
+ val ncol = drmX match {
+ case cp: CheckpointedDrmSpark[K] ⇒ cp._ncol
+ case _ ⇒ -1
+ }
+ val sample = drmX.rdd.sample(withReplacement = replacement, fraction = fraction)
+ if (classTag[K] != ClassTag.Int) return drmWrap(sample, ncol = ncol)
+
+ // K == Int: Int-keyed sample. Rebase the int keys.
+ drmWrap(rdd = blas.rekeySeqInts(rdd = sample, computeMap = false)._1, ncol = ncol).asInstanceOf[DrmLike[K]]
+ }
+
+
+ override def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean): Matrix = {
+
+ val ncol = drmX match {
+ case cp: CheckpointedDrmSpark[K] ⇒ cp._ncol
+ case _ ⇒ -1
+ }
+
+ // I think, as of this writing, takeSample() in Spark is biased: it is not a true
+ // hypergeometric sampler. But it is faster than true hypergeometric/categorical samplers
+ // would be.
+ val sample = drmX.rdd.takeSample(withReplacement = replacement, num = numSamples)
+ val isSparse = sample.exists { case (_, vec) ⇒ !vec.isDense }
+
+ val vectors = sample.map(_._2)
+ val labels = sample.view.zipWithIndex.map { case ((key, _), idx) ⇒ key.toString → (idx:Integer) }.toMap
+
+ val mx:Matrix = if (isSparse) sparse(vectors:_*) else dense(vectors)
+ mx.setRowLabelBindings(labels)
+
+ mx
+ }
+
private[mahout] def cacheHint2Spark(cacheHint: CacheHint.CacheHint): StorageLevel = cacheHint match {
- case CacheHint.NONE => StorageLevel.NONE
- case CacheHint.DISK_ONLY => StorageLevel.DISK_ONLY
- case CacheHint.DISK_ONLY_2 => StorageLevel.DISK_ONLY_2
- case CacheHint.MEMORY_ONLY => StorageLevel.MEMORY_ONLY
- case CacheHint.MEMORY_ONLY_2 => StorageLevel.MEMORY_ONLY_2
- case CacheHint.MEMORY_ONLY_SER => StorageLevel.MEMORY_ONLY_SER
- case CacheHint.MEMORY_ONLY_SER_2 => StorageLevel.MEMORY_ONLY_SER_2
- case CacheHint.MEMORY_AND_DISK => StorageLevel.MEMORY_AND_DISK
- case CacheHint.MEMORY_AND_DISK_2 => StorageLevel.MEMORY_AND_DISK_2
- case CacheHint.MEMORY_AND_DISK_SER => StorageLevel.MEMORY_AND_DISK_SER
- case CacheHint.MEMORY_AND_DISK_SER_2 => StorageLevel.MEMORY_AND_DISK_SER_2
+ case CacheHint.NONE ⇒ StorageLevel.NONE
+ case CacheHint.DISK_ONLY ⇒ StorageLevel.DISK_ONLY
+ case CacheHint.DISK_ONLY_2 ⇒ StorageLevel.DISK_ONLY_2
+ case CacheHint.MEMORY_ONLY ⇒ StorageLevel.MEMORY_ONLY
+ case CacheHint.MEMORY_ONLY_2 ⇒ StorageLevel.MEMORY_ONLY_2
+ case CacheHint.MEMORY_ONLY_SER ⇒ StorageLevel.MEMORY_ONLY_SER
+ case CacheHint.MEMORY_ONLY_SER_2 ⇒ StorageLevel.MEMORY_ONLY_SER_2
+ case CacheHint.MEMORY_AND_DISK ⇒ StorageLevel.MEMORY_AND_DISK
+ case CacheHint.MEMORY_AND_DISK_2 ⇒ StorageLevel.MEMORY_AND_DISK_2
+ case CacheHint.MEMORY_AND_DISK_SER ⇒ StorageLevel.MEMORY_AND_DISK_SER
+ case CacheHint.MEMORY_AND_DISK_SER_2 ⇒ StorageLevel.MEMORY_AND_DISK_SER_2
}
/** Translate previously optimized physical plan */
@@ -221,31 +317,32 @@ object SparkEngine extends DistributedEngine {
// If there are any such cases, they must go away in pass1. If they were not, then it wasn't
// the A'A case but actual transposition intent which should be removed from consideration
// (we cannot do actual flip for non-int-keyed arguments)
- case OpAtAnyKey(_) =>
+ case OpAtAnyKey(_) ⇒
throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.")
- case op@OpAt(a) => At.at(op, tr2phys(a)(op.classTagA))
- case op@OpABt(a, b) => ABt.abt(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
- case op@OpAtB(a, b) => AtB.atb_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB),
- zippable = a.partitioningTag == b.partitioningTag)
- case op@OpAtA(a) => AtA.at_a(op, tr2phys(a)(op.classTagA))
- case op@OpAx(a, x) => Ax.ax_with_broadcast(op, tr2phys(a)(op.classTagA))
- case op@OpAtx(a, x) => Ax.atx_with_broadcast(op, tr2phys(a)(op.classTagA))
- case op@OpAewB(a, b, opId) => AewB.a_ew_b(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
- case op@OpCbind(a, b) => CbindAB.cbindAB_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
- case op@OpRbind(a, b) => RbindAB.rbindAB(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
- case op@OpAewScalar(a, s, _) => AewB.a_ew_scalar(op, tr2phys(a)(op.classTagA), s)
- case op@OpRowRange(a, _) => Slicing.rowRange(op, tr2phys(a)(op.classTagA))
- case op@OpTimesRightMatrix(a, _) => AinCoreB.rightMultiply(op, tr2phys(a)(op.classTagA))
+ case op@OpAt(a) ⇒ At.at(op, tr2phys(a)(op.classTagA))
+ case op@OpABt(a, b) ⇒ ABt.abt(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+ case op@OpAtB(a, b) ⇒ AtB.atb(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+ case op@OpAtA(a) ⇒ AtA.at_a(op, tr2phys(a)(op.classTagA))
+ case op@OpAx(a, x) ⇒ Ax.ax_with_broadcast(op, tr2phys(a)(op.classTagA))
+ case op@OpAtx(a, x) ⇒ Ax.atx_with_broadcast(op, tr2phys(a)(op.classTagA))
+ case op@OpAewUnaryFunc(a, _, _) ⇒ AewB.a_ew_func(op, tr2phys(a)(op.classTagA))
+ case op@OpAewUnaryFuncFusion(a, _) ⇒ AewB.a_ew_func(op, tr2phys(a)(op.classTagA))
+ case op@OpAewB(a, b, opId) ⇒ AewB.a_ew_b(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+ case op@OpCbind(a, b) ⇒ CbindAB.cbindAB_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+ case op@OpCbindScalar(a, _, _) ⇒ CbindAB.cbindAScalar(op, tr2phys(a)(op.classTagA))
+ case op@OpRbind(a, b) ⇒ RbindAB.rbindAB(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+ case op@OpAewScalar(a, s, _) ⇒ AewB.a_ew_scalar(op, tr2phys(a)(op.classTagA), s)
+ case op@OpRowRange(a, _) ⇒ Slicing.rowRange(op, tr2phys(a)(op.classTagA))
+ case op@OpTimesRightMatrix(a, _) ⇒ AinCoreB.rightMultiply(op, tr2phys(a)(op.classTagA))
// Custom operators, we just execute them
- case blockOp: OpMapBlock[K, _] => MapBlock.exec(
+ case blockOp: OpMapBlock[K, _] ⇒ MapBlock.exec(
src = tr2phys(blockOp.A)(blockOp.classTagA),
- ncol = blockOp.ncol,
- bmf = blockOp.bmf
+ operator = blockOp
)
- case op@OpPar(a,_,_) => Par.exec(op,tr2phys(a)(op.classTagA))
- case cp: CheckpointedDrm[K] => new DrmRddInput[K](rowWiseSrc = Some((cp.ncol, cp.rdd)))
- case _ => throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s."
- .format(oper))
+ case op@OpPar(a, _, _) ⇒ Par.exec(op, tr2phys(a)(op.classTagA))
+ case cp: CheckpointedDrm[K] ⇒ cp.rdd: DrmRddInput[K]
+ case _ ⇒ throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s."
+ .format(oper))
}
}
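A short session sketch of the new sampling entry points (Scala; the local master URL, app name and matrix contents are hypothetical):

    import org.apache.mahout.math._
    import scalabindings._
    import RLikeOps._
    import org.apache.mahout.math.drm._
    import org.apache.mahout.sparkbindings._

    implicit val ctx = mahoutSparkContext(masterUrl = "local", appName = "sampling")

    val drmA = drmParallelize(dense((1, 2), (3, 4), (5, 6), (7, 8)), numPartitions = 2)

    // Distributed ~50% row sample; the result stays a DRM:
    val drmS = SparkEngine.drmSampleRows(drmA, fraction = 0.5, replacement = false)

    // Exactly 2 rows pulled in-core, labeled by the original keys:
    val mxS: Matrix = SparkEngine.drmSampleKRows(drmA, numSamples = 2, replacement = false)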
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
index 1e3f286..11e2bad 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
@@ -19,16 +19,23 @@ package org.apache.mahout.sparkbindings.blas
import org.apache.mahout.math.scalabindings._
import RLikeOps._
+import org.apache.spark.rdd.RDD
import scala.reflect.ClassTag
import org.apache.mahout.sparkbindings._
-import drm._
-import org.apache.mahout.math.{Matrix, SparseRowMatrix}
+import org.apache.mahout.math.drm.BlockifiedDrmTuple
+import org.apache.mahout.sparkbindings.drm._
+import org.apache.mahout.math.{SparseMatrix, Matrix, SparseRowMatrix}
import org.apache.spark.SparkContext._
import org.apache.mahout.math.drm.logical.OpABt
+import org.apache.mahout.logging._
+
/** Contains RDD plans for ABt operator */
object ABt {
+ private final implicit val log = getLog(ABt.getClass)
+
/**
* General entry point for AB' operator.
*
@@ -40,8 +47,11 @@ object ABt {
def abt[K: ClassTag](
operator: OpABt[K],
srcA: DrmRddInput[K],
- srcB: DrmRddInput[Int]): DrmRddInput[K] =
+ srcB: DrmRddInput[Int]): DrmRddInput[K] = {
+
+ debug("operator AB'(Spark)")
abt_nograph(operator, srcA, srcB)
+ }
/**
* Computes AB' without GraphX.
@@ -63,7 +73,146 @@ object ABt {
srcB: DrmRddInput[Int]): DrmRddInput[K] = {
// Blockify everything.
- val blocksA = srcA.toBlockifiedDrmRdd()
+ val blocksA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
+
+ val blocksB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
+
+ val prodNCol = operator.ncol
+ val prodNRow = operator.nrow
+ // We are actually computing AB' here.
+ val numProductPartitions = estimateProductPartitions(anrow = prodNRow, ancol = operator.A.ncol,
+ bncol = prodNCol, aparts = blocksA.partitions.size, bparts = blocksB.partitions.size)
+
+ debug(
+ s"AB': #parts = $numProductPartitions; A #parts=${blocksA.partitions.size}, B #parts=${blocksB.partitions.size}."+
+ s"A=${operator.A.nrow}x${operator.A.ncol}, B=${operator.B.nrow}x${operator.B.ncol},AB'=${prodNRow}x$prodNCol."
+ )
+
+ // Blockwise multiplication function.
+ def mmulFunc(tupleA: BlockifiedDrmTuple[K], tupleB: BlockifiedDrmTuple[Int]): (Array[K], Array[Int], Matrix) = {
+ val (keysA, blockA) = tupleA
+ val (keysB, blockB) = tupleB
+
+ var ms = traceDo(System.currentTimeMillis())
+
+ // We need to send keysB to the aggregator in order to know which columns are being updated.
+ val result = (keysA, keysB, (blockA %*% blockB.t))
+
+ ms = traceDo(System.currentTimeMillis() - ms.get)
+ trace(
+ s"block multiplication of(${blockA.nrow}x${blockA.ncol} x ${blockB.ncol}x${blockB.nrow} is completed in $ms " +
+ "ms.")
+ trace(s"block multiplication types: blockA: ${blockA.getClass.getName}(${blockA.t.getClass.getName}); " +
+ s"blockB: ${blockB.getClass.getName}.")
+
+ result
+ }
+
+ val blockwiseMmulRdd =
+
+ // Combine blocks pairwise.
+ pairwiseApply(blocksA, blocksB, mmulFunc _)
+
+ // Now reduce proper product blocks.
+ .combineByKey(
+
+ // Empty combiner += value
+ createCombiner = (t: (Array[K], Array[Int], Matrix)) => {
+ val (rowKeys, colKeys, block) = t
+ val comb = new SparseMatrix(prodNCol, block.nrow).t
+
+ for ((col, i) <- colKeys.zipWithIndex) comb(::, col) := block(::, i)
+ rowKeys -> comb
+ },
+
+ // Combiner += value
+ mergeValue = (comb: (Array[K], Matrix), value: (Array[K], Array[Int], Matrix)) => {
+ val (rowKeys, c) = comb
+ val (_, colKeys, block) = value
+ for ((col, i) <- colKeys.zipWithIndex) c(::, col) := block(::, i)
+ comb
+ },
+
+ // Combiner + Combiner
+ mergeCombiners = (comb1: (Array[K], Matrix), comb2: (Array[K], Matrix)) => {
+ comb1._2 += comb2._2
+ comb1
+ },
+
+ numPartitions = blocksA.partitions.size max blocksB.partitions.size
+ )
+
+
+ // Create a BlockifiedDrmRdd-compatible structure.
+ val blockifiedRdd = blockwiseMmulRdd
+
+ // throw away A-partition #
+ .map{case (_,tuple) => tuple}
+
+ val numPartsResult = blockifiedRdd.partitions.size
+
+ // See if we need to rebalance away from A granularity.
+ if (numPartsResult * 2 < numProductPartitions || numPartsResult / 2 > numProductPartitions) {
+
+ debug(s"Will re-coalesce from ${numPartsResult} to ${numProductPartitions}")
+
+ val rowRdd = deblockify(blockifiedRdd).coalesce(numPartitions = numProductPartitions)
+
+ rowRdd
+
+ } else {
+
+ // Partition count is not terribly different from the target; keep as is.
+ blockifiedRdd
+ }
+
+ }
+
+ /**
+ * This function tries to use join instead of cartesian to group blocks together without bloating
+ * the number of partitions. The hope is that we can apply pairwise reduction of each block pair right away,
+ * so if one side of the join is streamed, the result still fits in memory,
+ * since the result is much smaller than the operands.
+ *
+ * @param blocksA blockified RDD for A
+ * @param blocksB blockified RDD for B
+ * @param blockFunc a function over (blockA, blockB). Implies `blockA %*% blockB.t`, but may be
+ * switched to another scheme based on which of the sides, A or B, is bigger.
+ */
+ private def pairwiseApply[K1, K2, T](blocksA: BlockifiedDrmRdd[K1], blocksB: BlockifiedDrmRdd[K2], blockFunc:
+ (BlockifiedDrmTuple[K1], BlockifiedDrmTuple[K2]) => T): RDD[(Int, T)] = {
+
+ // We will be joining blocks in B to blocks in A using A-partition as a key.
+
+ // Prepare A side.
+ val blocksAKeyed = blocksA.mapPartitionsWithIndex { (part, blockIter) =>
+
+ val r = if (blockIter.hasNext) Some(part -> blockIter.next) else Option.empty[(Int, BlockifiedDrmTuple[K1])]
+
+ require(!blockIter.hasNext, s"more than 1 (${blockIter.size + 1}) blocks per partition in A of AB'")
+
+ r.toIterator
+ }
+
+ // Prepare B-side.
+ val aParts = blocksA.partitions.size
+ val blocksBKeyed = blocksB.flatMap(bTuple => for (blockKey <- (0 until aParts).view) yield blockKey -> bTuple )
+
+ // Perform the inner join. Let's try to do a simple thing now.
+ blocksAKeyed.join(blocksBKeyed, numPartitions = aParts)
+
+ // Apply the product function, which should produce much smaller products. Hopefully, this streams blockB's in.
+ .map{case (partKey,(blockA, blockB)) => partKey -> blockFunc(blockA, blockB)}
+
+ }
+
+ private[blas] def abt_nograph_cart[K: ClassTag](
+ operator: OpABt[K],
+ srcA: DrmRddInput[K],
+ srcB: DrmRddInput[Int]): DrmRddInput[K] = {
+
+ // Blockify everything.
+ val blocksA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
// Mark row-blocks with group id
.mapPartitionsWithIndex((part, iter) => {
@@ -83,28 +232,35 @@ object ABt {
}
})
- val blocksB = srcB.toBlockifiedDrmRdd()
+ val blocksB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
// Final product's geometry. We want to extract that into local variables since we want to use
// them as closure attributes.
val prodNCol = operator.ncol
val prodNRow = operator.nrow
-
- // Approximate number of final partitions.
- val numProductPartitions =
- if (blocksA.partitions.size > blocksB.partitions.size) {
- ((prodNCol.toDouble / operator.A.ncol) * blocksA.partitions.size).ceil.toInt
- } else {
- ((prodNRow.toDouble / operator.B.ncol) * blocksB.partitions.size).ceil.toInt
- }
+ val aNCol = operator.A.ncol
+
+ // Approximate number of final partitions. We take the bigger operand's partitioning as our guide
+ // to the number of elements per partition. TODO: do it better.
- //srcA.partitions.size.max(that = srcB.partitions.size)
+ // Elements per partition, the bigger of the two operands.
+ val epp = aNCol.toDouble * prodNRow / blocksA.partitions.size max aNCol.toDouble * prodNCol /
+ blocksB.partitions.size
+ // Number of partitions we want to converge to in the product. For now we simply extrapolate,
+ // assuming the product and operand densities are about the same, and using the same
+ // elements-per-partition number in the product as in the bigger of the two operands.
+ val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+
+ debug(
+ s"AB': #parts = $numProductPartitions; A #parts=${blocksA.partitions.size}, B #parts=${blocksB.partitions.size}.")
// The plan.
- var blockifiedRdd :BlockifiedDrmRdd[K] = blocksA
+ var blockifiedRdd: BlockifiedDrmRdd[K] = blocksA
- // Build Cartesian. It may require a bit more memory there at tasks.
+ // Build Cartesian. It generates a LOT of tasks. TODO: figure out how to fix performance of the AB'
+ // operator. The thing is that each product after the map is a really small one (partition fraction x
+ // partition fraction), so they can be combined into much bigger chunks.
.cartesian(blocksB)
// Multiply blocks
@@ -126,10 +282,14 @@ object ABt {
.combineByKey[(Array[K],Matrix)](
createCombiner = (t: (Array[K], Array[Int], Matrix)) => t match {
+
+ // Create the combiner structure out of the first product. Our combiner is a sparse row matrix
+ // initialized to the final product partition block dimensions.
case (rowKeys, colKeys, blockProd) =>
- // Accumulator is a row-wise block of sparse vectors.
- val acc:Matrix = new SparseRowMatrix(rowKeys.size, prodNCol)
+ // Accumulator is a row-wise block of sparse vectors. Since we assign to columns,
+ // the most efficient choice is perhaps a column-oriented block here.
+ val acc:Matrix = new SparseRowMatrix(prodNCol, rowKeys.size).t
// Update accumulator using colKeys as column index indirection
colKeys.view.zipWithIndex.foreach({
@@ -168,6 +328,8 @@ object ABt {
// having at most one block per partition.
blockifiedRdd = rbind(blockifiedRdd)
- new DrmRddInput(blockifiedSrc = Some(blockifiedRdd))
+ blockifiedRdd
}
+
+
}
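A worked instance of the partition estimate used above (Scala; all sizes are hypothetical):

    // A: 1e5 x 100 over 100 partitions; B: 1e3 x 100 over 10 partitions.
    val aNCol = 100.0
    val prodNRow = 1e5            // = A.nrow
    val prodNCol = 1e3            // = B.nrow
    val aParts = 100
    val bParts = 10

    // Elements per partition, the bigger of the two operands:
    val epp = (aNCol * prodNRow / aParts) max (aNCol * prodNCol / bParts)  // = 1e5

    // Extrapolate assuming the product is about as dense as the operands:
    val numProductPartitions = (prodNCol * prodNRow / epp).ceil.toInt      // = 1000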
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
index 3cdb797..8a90398 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
@@ -20,19 +20,22 @@ package org.apache.mahout.sparkbindings.blas
import org.apache.mahout.sparkbindings.drm.DrmRddInput
import scala.reflect.ClassTag
import org.apache.spark.SparkContext._
-import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math._
+import scalabindings._
import RLikeOps._
import org.apache.mahout.math.{SequentialAccessSparseVector, Matrix, Vector}
-import org.apache.mahout.math.drm.logical.{OpAewScalar, OpAewB}
-import org.apache.log4j.Logger
+import org.apache.mahout.math.drm.logical.{AbstractUnaryOp, TEwFunc, OpAewScalar, OpAewB}
import org.apache.mahout.sparkbindings.blas.AewB.{ReduceFuncScalar, ReduceFunc}
import org.apache.mahout.sparkbindings.{BlockifiedDrmRdd, DrmRdd, drm}
import org.apache.mahout.math.drm._
+import org.apache.mahout.logging._
+import collection._
+import JavaConversions._
/** Elementwise drm-drm operators */
object AewB {
- private val log = Logger.getLogger(AewB.getClass)
+ private final implicit val log = getLog(AewB.getClass)
/**
* Set to false to disallow in-place elementwise operations in case side-effects and non-idempotent
@@ -44,10 +47,10 @@ object AewB {
type ReduceFuncScalar = (Matrix, Double) => Matrix
- private[blas] def getEWOps() = {
- val inplaceProp = System.getProperty(PROPERTY_AEWB_INPLACE, "true").toBoolean
- if (inplaceProp) InplaceEWOps else CloningEWOps
- }
+ private[blas] def ewInplace(): Boolean = System.getProperty(PROPERTY_AEWB_INPLACE, "false").toBoolean
+
+ private[blas] def getEWOps() = if (ewInplace()) InplaceEWOps else CloningEWOps
+
/** Elementwise matrix-matrix operator, now handles both non- and identically partitioned */
def a_ew_b[K: ClassTag](op: OpAewB[K], srcA: DrmRddInput[K], srcB: DrmRddInput[K]): DrmRddInput[K] = {
@@ -67,12 +70,14 @@ object AewB {
val a = srcA.toDrmRdd()
val b = srcB.toDrmRdd()
+ debug(s"A${op.op}B: #partsA=${a.partitions.size},#partsB=${b.partitions.size}.")
+
// Check if A and B are identically partitioned AND keyed. if they are, then just perform zip
// instead of join, and apply the op map-side. Otherwise, perform join and apply the op
// reduce-side.
val rdd = if (op.isIdenticallyPartitioned(op.A)) {
- log.debug("applying zipped elementwise")
+ debug(s"A${op.op}B:applying zipped elementwise")
a
.zip(b)
@@ -83,7 +88,7 @@ object AewB {
}
} else {
- log.debug("applying elementwise as join")
+ debug("A${op.op}B:applying elementwise as join")
a
// Full outer-join operands row-wise
@@ -103,13 +108,51 @@ object AewB {
})
}
- new DrmRddInput(rowWiseSrc = Some(ncol -> rdd))
+ rdd
+ }
+
+ def a_ew_func[K: ClassTag](op: AbstractUnaryOp[K, K] with TEwFunc, srcA: DrmRddInput[K]): DrmRddInput[K] = {
+
+ val evalZeros = op.evalZeros
+ val inplace = ewInplace()
+ val f = op.f
+
+ // Before obtaining the blockified rdd, see if we have to fix int row key consistency so that missing
+ // rows can get lazily pre-populated with empty vectors before applying the elementwise function.
+ val aBlockRdd = if (implicitly[ClassTag[K]] == ClassTag.Int && op.A.canHaveMissingRows && evalZeros) {
+ val fixedRdd = fixIntConsistency(op.A.asInstanceOf[DrmLike[Int]], src = srcA.toDrmRdd().asInstanceOf[DrmRdd[Int]])
+ drm.blockify(fixedRdd, blockncol = op.A.ncol).asInstanceOf[BlockifiedDrmRdd[K]]
+ } else {
+ srcA.toBlockifiedDrmRdd(op.A.ncol)
+ }
+
+ val rdd = aBlockRdd.map {case (keys, block) =>
+
+ // Modify in place, or allocate a new copy?
+ val newBlock = if (inplace) block else block cloned
+
+ // Does the operation care about zeros?
+ if (evalZeros) {
+
+ // Yes, we evaluate all:
+ newBlock := ((_, _, x) => f(x))
+ } else {
+
+ // No, evaluate non-zeros only row-wise
+ for (row <- newBlock; el <- row.nonZeroes) el := f(el.get)
+ }
+
+ keys -> newBlock
+ }
+
+ rdd
}
/** Physical algorithm to handle matrix-scalar operators like A - s or s -: A */
def a_ew_scalar[K: ClassTag](op: OpAewScalar[K], srcA: DrmRddInput[K], scalar: Double):
DrmRddInput[K] = {
+
val ewOps = getEWOps()
val opId = op.op
@@ -129,15 +172,17 @@ object AewB {
val fixedRdd = fixIntConsistency(op.A.asInstanceOf[DrmLike[Int]], src = srcA.toDrmRdd().asInstanceOf[DrmRdd[Int]])
drm.blockify(fixedRdd, blockncol = op.A.ncol).asInstanceOf[BlockifiedDrmRdd[K]]
} else {
- srcA.toBlockifiedDrmRdd()
+ srcA.toBlockifiedDrmRdd(op.A.ncol)
}
+ debug(s"A${op.op}$scalar: #parts=${aBlockRdd.partitions.size}.")
+
val rdd = aBlockRdd
- .map({
+ .map {
case (keys, block) => keys -> reduceFunc(block, scalar)
- })
+ }
- new DrmRddInput[K](blockifiedSrc = Some(rdd))
+ rdd
}
}
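A side note on the in-place toggle above: a minimal, self-contained sketch of the
property-driven dispatch, assuming plain Scala with no Mahout dependencies. The object
name, the main() harness, and the property key string are invented for illustration;
the real key is whatever the PROPERTY_AEWB_INPLACE constant in AewB holds.

    object EwDispatchSketch {
      // Hypothetical key; the real constant lives in AewB as PROPERTY_AEWB_INPLACE.
      val PROPERTY_AEWB_INPLACE = "mahout.math.AewB.inplace"

      // Mirrors ewInplace() above: cloning semantics are the default ("false").
      def ewInplace(): Boolean = System.getProperty(PROPERTY_AEWB_INPLACE, "false").toBoolean

      def main(args: Array[String]): Unit = {
        println(s"in-place elementwise ops: ${ewInplace()}") // false unless opted in
        System.setProperty(PROPERTY_AEWB_INPLACE, "true")
        println(s"in-place elementwise ops: ${ewInplace()}") // true after opt-in
      }
    }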
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
index c923e62..5f9f84a 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
@@ -6,13 +6,17 @@ import scalabindings._
import RLikeOps._
import org.apache.mahout.sparkbindings._
import org.apache.mahout.sparkbindings.drm._
+import org.apache.mahout.logging._
import scala.reflect.ClassTag
import org.apache.mahout.math.DiagonalMatrix
import org.apache.mahout.math.drm.logical.OpTimesRightMatrix
+
/** Matrix product with one of operands an in-core matrix */
object AinCoreB {
+ private final implicit val log = getLog(AinCoreB.getClass)
+
def rightMultiply[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
if ( op.right.isInstanceOf[DiagonalMatrix])
rightMultiply_diag(op, srcA)
@@ -21,23 +25,27 @@ object AinCoreB {
}
private def rightMultiply_diag[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
- val rddA = srcA.toBlockifiedDrmRdd()
+ val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
implicit val ctx:DistributedContext = rddA.context
val dg = drmBroadcast(op.right.viewDiagonal())
+ debug(s"operator A %*% inCoreB-diagonal. #parts=${rddA.partitions.size}.")
+
val rdd = rddA
// Just multiply the blocks
.map {
case (keys, blockA) => keys -> (blockA %*%: diagv(dg))
}
- new DrmRddInput(blockifiedSrc = Some(rdd))
+ rdd
}
private def rightMultiply_common[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
- val rddA = srcA.toBlockifiedDrmRdd()
+ val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
implicit val sc:DistributedContext = rddA.sparkContext
+ debug(s"operator A %*% inCoreB. #parts=${rddA.partitions.size}.")
+
val bcastB = drmBroadcast(m = op.right)
val rdd = rddA
@@ -46,7 +54,7 @@ object AinCoreB {
case (keys, blockA) => keys -> (blockA %*% bcastB)
}
- new DrmRddInput(blockifiedSrc = Some(rdd))
+ rdd
}
}
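The diagonal fast path above rests on the identity A %*% diag(d) = column-scaling of A
by d, which is why broadcasting just the diagonal vector is enough. A toy sketch of that
identity, with plain arrays standing in for Mahout matrices (DiagRightMultiplySketch is
an invented name):

    object DiagRightMultiplySketch {
      // Multiply an (h x n) block by diag(d): scale column j by d(j).
      def timesDiag(block: Array[Array[Double]], d: Array[Double]): Array[Array[Double]] =
        block.map(row => Array.tabulate(row.length)(j => row(j) * d(j)))

      def main(args: Array[String]): Unit = {
        val a = Array(Array(1.0, 2.0), Array(3.0, 4.0))
        val d = Array(10.0, 0.5)
        timesDiag(a, d).foreach(r => println(r.mkString(" "))) // 10.0 1.0 / 30.0 2.0
      }
    }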
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
index 56de9f4..5789bd2 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
@@ -17,16 +17,20 @@
package org.apache.mahout.sparkbindings.blas
-import org.apache.mahout.sparkbindings.drm.DrmRddInput
+import org.apache.mahout.sparkbindings.drm._
import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.logging._
import RLikeOps._
import org.apache.spark.SparkContext._
import org.apache.mahout.math.{DenseVector, Vector, SequentialAccessSparseVector}
import org.apache.mahout.math.drm.logical.OpAt
+
/** A' algorithms */
object At {
+ private final implicit val log = getLog(At.getClass)
+
def at(
operator: OpAt,
srcA: DrmRddInput[Int]): DrmRddInput[Int] = at_nograph(operator = operator, srcA = srcA)
@@ -39,10 +43,15 @@ object At {
* groups into final rows of the transposed matrix.
*/
private[blas] def at_nograph(operator: OpAt, srcA: DrmRddInput[Int]): DrmRddInput[Int] = {
- val drmRdd = srcA.toBlockifiedDrmRdd()
+
+ debug("operator A'.")
+
+ val drmRdd = srcA.toBlockifiedDrmRdd(operator.A.ncol)
val numPartitions = drmRdd.partitions.size
val ncol = operator.ncol
+ debug(s"A' #parts = $numPartitions.")
+
// Validity of this conversion must be checked at logical operator level.
val nrow = operator.nrow.toInt
val atRdd = drmRdd
@@ -70,7 +79,7 @@ object At {
key -> v
}).densify()
- new DrmRddInput(rowWiseSrc = Some(ncol -> atRdd))
+ atRdd
}
}
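at_nograph above transposes by emitting per-column contributions of each row and
regrouping them into rows of A'. A toy in-memory analogue of that shuffle, with Scala
collections standing in for the RDD (TransposeSketch is an invented name):

    object TransposeSketch {
      def transpose(rows: Map[Int, Array[Double]], nrow: Int): Map[Int, Array[Double]] =
        rows.toSeq
          // Each (rowKey, vector) contributes (colIndex, (rowKey, value)) cells.
          .flatMap { case (r, vec) => vec.zipWithIndex.map { case (x, c) => (c, (r, x)) } }
          // Group cells of one output row together, as the shuffle does.
          .groupBy(_._1)
          .map { case (c, cells) =>
            val v = new Array[Double](nrow)
            cells.foreach { case (_, (r, x)) => v(r) = x }
            c -> v
          }

      def main(args: Array[String]): Unit = {
        val a = Map(0 -> Array(1.0, 2.0), 1 -> Array(3.0, 4.0))
        transpose(a, nrow = 2).toSeq.sortBy(_._1)
          .foreach { case (k, v) => println(s"$k: ${v.mkString(" ")}") } // 0: 1.0 3.0 / 1: 2.0 4.0
      }
    }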
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
index be4f08c..a212878 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
@@ -17,6 +17,7 @@
package org.apache.mahout.sparkbindings.blas
+import org.apache.mahout.logging._
import org.apache.mahout.math._
import org.apache.mahout.sparkbindings._
import org.apache.mahout.sparkbindings.drm._
@@ -34,25 +35,30 @@ import SparkEngine._
*/
object AtA {
- final val log = Logger.getLogger(AtA.getClass)
+ private final implicit val log = getLog(AtA.getClass)
final val PROPERTY_ATA_MAXINMEMNCOL = "mahout.math.AtA.maxInMemNCol"
+ final val PROPERTY_ATA_MMUL_BLOCKHEIGHT = "mahout.math.AtA.blockHeight"
/** Materialize A'A operator */
def at_a(operator: OpAtA[_], srcRdd: DrmRddInput[_]): DrmRddInput[Int] = {
- val maxInMemNCol = System.getProperty(PROPERTY_ATA_MAXINMEMNCOL, "2000").toInt
+ val maxInMemNCol = System.getProperty(PROPERTY_ATA_MAXINMEMNCOL, "200").toInt
maxInMemNCol.ensuring(_ > 0, "Invalid A'A in-memory setting for optimizer")
if (operator.ncol <= maxInMemNCol) {
+
// If we can comfortably fit upper-triangular operator into a map memory, we will run slim
// algorithm with upper-triangular accumulators in maps.
- val inCoreA = at_a_slim(srcRdd = srcRdd, operator = operator)
+ val inCoreA = at_a_slim(srcRdd = srcRdd.toDrmRdd(), operator = operator)
val drmRdd = parallelizeInCore(inCoreA, numPartitions = 1)(sc = srcRdd.sparkContext)
- new DrmRddInput(rowWiseSrc = Some(inCoreA.ncol, drmRdd))
+ drmRdd
+
} else {
+
// Otherwise, we need to run a distributed, big version
- new DrmRddInput(rowWiseSrc = Some(operator.ncol, at_a_nongraph(srcRdd = srcRdd, op = operator)))
+ // new DrmRddInput(rowWiseSrc = Some(operator.ncol, at_a_nongraph(srcRdd = srcRdd, op = operator)))
+ at_a_nongraph_mmul(srcRdd = srcRdd.toBlockifiedDrmRdd(operator.A.ncol), op = operator)
}
}
@@ -64,7 +70,7 @@ object AtA {
*/
def at_a_slim(operator: OpAtA[_], srcRdd: DrmRdd[_]): Matrix = {
- log.debug("Applying slim A'A.")
+ debug("operator slim A'A(Spark)")
val ncol = operator.ncol
// Compute backing vector of tiny-upper-triangular accumulator across all the data.
@@ -73,122 +79,195 @@ object AtA {
val ut = new UpperTriangular(ncol)
// Strategy is to add to an outer product of each row to the upper triangular accumulator.
- pIter.foreach({
- case (k, v) =>
+ pIter.foreach({ case (k, v) =>
- // Use slightly various traversal strategies over dense vs. sparse source.
- if (v.isDense) {
+ // Use slightly different traversal strategies for dense vs. sparse sources.
+ if (v.isDense) {
- // Update upper-triangular pattern only (due to symmetry).
- // Note: Scala for-comprehensions are said to be fairly inefficient this way, but this is
- // such spectacular case they were deesigned for.. Yes I do observe some 20% difference
- // compared to while loops with no other payload, but the other payload is usually much
- // heavier than this overhead, so... I am keeping this as is for the time being.
+ // Update upper-triangular pattern only (due to symmetry).
+ // Note: Scala for-comprehensions are said to be fairly inefficient this way, but this is
+ // such a spectacular case they were designed for. Yes, I do observe some 20% difference
+ // compared to while loops with no other payload, but the other payload is usually much
+ // heavier than this overhead, so... I am keeping this as is for the time being.
- for (row <- 0 until v.length; col <- row until v.length)
- ut(row, col) = ut(row, col) + v(row) * v(col)
+ for (row <- 0 until v.length; col <- row until v.length)
+ ut(row, col) = ut(row, col) + v(row) * v(col)
- } else {
+ } else {
- // Sparse source.
- v.nonZeroes().view
+ // Sparse source.
+ v.nonZeroes().view
- // Outer iterator iterates over rows of outer product.
- .foreach(elrow => {
+ // Outer iterator iterates over rows of outer product.
+ .foreach(elrow => {
- // Inner loop for columns of outer product.
- v.nonZeroes().view
+ // Inner loop for columns of outer product.
+ v.nonZeroes().view
- // Filter out non-upper nonzero elements from the double loop.
- .filter(_.index >= elrow.index)
+ // Filter out non-upper nonzero elements from the double loop.
+ .filter(_.index >= elrow.index)
- // Incrementally update outer product value in the uppper triangular accumulator.
- .foreach(elcol => {
+ // Incrementally update outer product value in the upper triangular accumulator.
+ .foreach(elcol => {
- val row = elrow.index
- val col = elcol.index
- ut(row, col) = ut(row, col) + elrow.get() * elcol.get()
+ val row = elrow.index
+ val col = elcol.index
+ ut(row, col) = ut(row, col) + elrow.get() * elcol.get()
- })
})
+ })
- }
+ }
})
Iterator(dvec(ddata = ut.getData): Vector)
- })
-
- .collect()
- .reduce(_ += _)
+ }).collect().reduce(_ += _)
new DenseSymmetricMatrix(resSym)
}
+ // Version that tries to use groupBy. In practice this is the slowest.
+ def at_a_group(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRddInput[Int] = {
+ debug("operator non-slim A'A(Spark-group).")
+
+ // Determine how many partitions the new matrix would need approximately. We base that on
+ // geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
+ // dense in reality than the source.
+ val m = op.A.nrow
+ val n = op.A.ncol
+ val srcNumParts = srcRdd.partitions.size
+ val finalNumParts = (srcNumParts * n / m).ceil.toInt max 1
+ val numParts = finalNumParts max srcNumParts
+ val ranges = computeEvenSplits(n, numParts)
+
+ var rddAtA = srcRdd
+
+ // Remove key, key is irrelevant
+ .map(_._2)
+
+ // Form partial outer blocks for each partition
+ .flatMap { v =>
+ for (blockKey <- 0 until numParts) yield {
+ blockKey -> v
+ }
+ }
+ // Send to individual partition reducers
+ .groupByKey(numPartitions = numParts)
+
+ // Reduce individual group
+ .map { case (blockKey, iter) =>
+ val range = ranges(blockKey)
+ val mxC: Matrix = new SparseRowMatrix(range.size, n, false)
+ iter.foreach(vec => addOuterProduct(mxC, vec(range), vec))
+
+ // Fix keys
+ val blockStart = range.start
+ val rowKeys = Array.tabulate(mxC.nrow)(blockStart + _)
+ rowKeys -> mxC
+ }
+
+ if (log.isDebugEnabled)
+ log.debug(s"AtA (grouping) #parts: ${rddAtA.partitions.size}.")
+
+ if (finalNumParts < numParts) rddAtA = rddAtA.coalesce(finalNumParts, shuffle = false)
+
+ rddAtA
+ }
+
+
/** The version of A'A that does not use GraphX */
- def at_a_nongraph(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRdd[Int] = {
+ def at_a_nongraph(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRddInput[Int] = {
- log.debug("Applying non-slim non-graph A'A.")
+ debug("Applying non-slim non-graph A'A.")
// Determine how many partitions the new matrix would need approximately. We base that on
// geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
// dense in reality than the source.
-
val m = op.A.nrow
val n = op.A.ncol
-/* possible fix for index out of range for vector range
- val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.round.toInt max 1
+ val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.toInt max 1
val blockHeight = (n - 1) / numParts + 1
-*/
- val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.round.toInt max 1 min n
+ val offsets = (0 until numParts).map(_ * blockHeight)
+ val ranges = offsets.map(offset => offset until (offset + blockHeight min n))
- // Computing evenly split ranges to denote each partition size.
+ val rddAtA = srcRdd
- // Base size.
- val baseSize = n / numParts
+ // Remove key, key is irrelevant
+ .map(_._2)
- // How many partitions needs to be baseSize +1.
- val slack = n - baseSize * numParts
+ // Form partial outer blocks for each partition
+ .flatMap { v =>
+ for (blockKey <- 0 until numParts) yield {
+ blockKey -> (blockKey, v)
+ }
+ }
+ // Combine outer products
+ .combineByKey(// Combiner factory
+ createCombiner = (t: (Int, Vector)) => {
+ val partNo = t._1
+ val vec = t._2
+ val range = ranges(partNo)
+ val mxC = if (vec.isDense) new DenseMatrix(range.size, n) else new SparseRowMatrix(range.size, n)
+ addOuterProduct(mxC, vec(range), vec)
+ },
+
+ // Merge values into existing partition accumulator.
+ mergeValue = (mxC: Matrix, t: (Int, Vector)) => {
+ val partNo = t._1
+ val vec = t._2
+ addOuterProduct(mxC, vec(ranges(partNo)), vec)
+ },
+
+ // Merge combiners
+ mergeCombiners = (mxC1: Matrix, mxC2: Matrix) => mxC1 += mxC2, numPartitions = numParts)
+
+ // Restore proper block keys
+ .map { case (blockKey, block) =>
+ val blockStart = blockKey * blockHeight
+ val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+ rowKeys -> block
+ }
- val ranges =
- // Start with partition offsets... total numParts + 1.
- (0 to numParts).view.map { i => (baseSize + 1) * i - (i - slack max 0)}
- // And convert offsets to ranges.
- .sliding(2).map(s => s(0) until s(1)).toIndexedSeq
+ if (log.isDebugEnabled)
+ log.debug(s"AtA #parts: ${rddAtA.partitions.size}.")
- val rddAtA = srcRdd
+ rddAtA
+ }
- // Remove key, key is irrelevant
- .map(_._2)
-
- // Form partial outer blocks for each partition
- .flatMap {
- v =>
- for (blockKey <- Stream.range(0, numParts)) yield {
-/* patch to fix index out of range for vector access
- val blockStart = blockKey * blockHeight
- val blockEnd = n min (blockStart + blockHeight)
- blockKey -> (v(blockStart until blockEnd) cross v)
-*/
- val range = ranges(blockKey)
- blockKey -> (v(range) cross v)
- }
+ /**
+ * The version of A'A that does not use GraphX. Tries to use blockwise matrix multiply. If an
+ * accelerated matrix backend is available, this might be somewhat faster.
+ */
+ def at_a_nongraph_mmul(op: OpAtA[_], srcRdd: BlockifiedDrmRdd[_]): DrmRddInput[Int] = {
+
+ // Determine how many partitions the new matrix would need approximately. We base that on
+ // geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
+ // dense in reality than the source.
+ val m = op.A.nrow
+ val n = op.A.ncol
+ val aparts = srcRdd.partitions.size
+ val numParts = estimateProductPartitions(anrow = n, ancol = m, bncol = n, aparts = aparts, bparts = aparts)
+ val ranges = computeEvenSplits(n, numParts)
+
+ debug(s"operator mmul-A'A(Spark); #parts = $numParts, #partsA=$aparts.")
+
+ val rddAtA = srcRdd.flatMap { case (keys, block) =>
+ Iterator.tabulate(numParts) { i =>
+ i -> block(::, ranges(i)).t %*% block
+ }
}
- // Combine outer blocks
- .reduceByKey(_ += _)
-
- // Restore proper block keys
- .map {
- case (blockKey, block) =>
-/* patch to fix index out of range for vector access
- val blockStart = blockKey * blockHeight
- val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
-*/
- val range = ranges(blockKey)
- val rowKeys = Array.tabulate(block.nrow)(range.start + _)
- rowKeys -> block
+ // Reduce partial blocks.
+ .reduceByKey(_ += _, numPartitions = numParts)
+
+ // Produce keys
+ .map { case (blockKey, block) =>
+
+ val blockStart = ranges(blockKey).start
+ val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+ rowKeys -> block
}
- new DrmRddInput[Int](blockifiedSrc = Some(rddAtA))
+ rddAtA
}
}
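The mmul variant above leans on the block identity behind A'A: if A is stacked from row
blocks B_1..B_k, then A'A = sum_i B_i'B_i, so every partition can emit partial Gram
matrices that are reduced with +=. A tiny numeric check of that identity, using dense
arrays instead of Mahout matrices (GramSketch is an invented name):

    object GramSketch {
      type M = Array[Array[Double]]

      // B'B for an (h x n) row block.
      def gram(b: M, n: Int): M = {
        val c = Array.fill(n, n)(0.0)
        for (row <- b; i <- 0 until n; j <- 0 until n) c(i)(j) += row(i) * row(j)
        c
      }

      def add(x: M, y: M): M =
        x.zip(y).map { case (rx, ry) => rx.zip(ry).map { case (a, b) => a + b } }

      def main(args: Array[String]): Unit = {
        // A = [[1,2],[3,4]] split into two one-row blocks.
        val ata = add(gram(Array(Array(1.0, 2.0)), 2), gram(Array(Array(3.0, 4.0)), 2))
        ata.foreach(r => println(r.mkString(" "))) // 10.0 14.0 / 14.0 20.0
      }
    }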
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
index 86aadc8..45705a9 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
@@ -17,8 +17,13 @@
package org.apache.mahout.sparkbindings.blas
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm._
+import reflect.ClassTag
+import collection._
+import JavaConversions._
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import drm._
import org.apache.mahout.sparkbindings.drm._
import org.apache.spark.rdd.RDD
import org.apache.mahout.math.scalabindings._
@@ -27,92 +32,330 @@ import org.apache.spark.SparkContext._
import org.apache.log4j.Logger
import org.apache.mahout.math.drm.logical.OpAtB
+import scala.collection.mutable.ArrayBuffer
+
object AtB {
- private val log = Logger.getLogger(AtB.getClass)
+ private final implicit val log = getLog(AtB.getClass)
+ def atb[A: ClassTag](operator: OpAtB[A], srcA: DrmRddInput[A], srcB: DrmRddInput[A]): DrmRddInput[Int] = {
+ atb_nograph_mmul(operator, srcA, srcB, operator.A.partitioningTag == operator.B.partitioningTag)
+ }
/**
* The logic for computing A'B is pretty much map-side generation of partial outer product blocks
* over co-grouped rows of A and B. If A and B are identically partitioned, we can just directly
* zip all the rows. Otherwise, we need to inner-join them first.
+ *
*/
- def atb_nograph[A: ClassTag](
- operator: OpAtB[A],
- srcA: DrmRddInput[A],
- srcB: DrmRddInput[A],
- zippable:Boolean = false
- ): DrmRddInput[Int] = {
+ @deprecated("slow, will remove", since = "0.10.2")
+ def atb_nograph[A: ClassTag](operator: OpAtB[A], srcA: DrmRddInput[A], srcB: DrmRddInput[A],
+ zippable: Boolean = false): DrmRddInput[Int] = {
val rddA = srcA.toDrmRdd()
- val zipped = if ( zippable ) {
+ val rddB = srcB.toDrmRdd()
+
+
+ val prodNCol = operator.ncol
+ val prodNRow = operator.nrow
+ val aNRow = operator.A.nrow
+
+ // Approximate number of final partitions. We take bigger partitions as our guide to number of
+ // elements per partition. TODO: do it better.
+ // Elements per partition, bigger of two operands.
+ val epp = aNRow.toDouble * prodNRow / rddA.partitions.size max aNRow.toDouble * prodNCol /
+ rddB.partitions.size
+
+ // Number of partitions we want to converge to in the product. For now we simply extrapolate that
+ // assuming product density and operand densities being about the same; and using the same element
+ // per partition number in the product as the bigger of two operands.
+ val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+
+ if (log.isDebugEnabled) log.debug(s"AtB: #parts ${numProductPartitions} for $prodNRow x $prodNCol geometry.")
+
+ val zipped = if (zippable) {
log.debug("A and B for A'B are identically distributed, performing row-wise zip.")
- rddA.zip(other = srcB.toDrmRdd())
+ rddA.zip(other = rddB)
} else {
log.debug("A and B for A'B are not identically partitioned, performing inner join.")
- rddA.join(other=srcB.toDrmRdd()).map({
- case (key,(v1,v2) ) => (key -> v1) -> (key -> v2)
+ rddA.join(other = rddB, numPartitions = numProductPartitions).map({ case (key, (v1,
+ v2)) => (key -> v1) -> (key -> v2)
})
}
- val blockHeight = safeToNonNegInt(
- (operator.B.ncol.toDouble/rddA.partitions.size).ceil.round max 1L
- )
-
- computeAtBZipped(
- zipped,
- nrow = operator.nrow,
- ancol = operator.A.ncol,
- bncol = operator.B.ncol,
- blockHeight = blockHeight
- )
+ computeAtBZipped2(zipped, nrow = operator.nrow, ancol = operator.A.ncol, bncol = operator.B.ncol,
+ numPartitions = numProductPartitions)
+ }
+
+ private[sparkbindings] def atb_nograph_mmul[A:ClassTag](operator:OpAtB[A], srcA: DrmRddInput[A], srcB:DrmRddInput[A], zippable:Boolean = false):DrmRddInput[Int] = {
+
+ debug("operator mmul-A'B(Spark)")
+
+ val prodNCol = operator.ncol
+ val prodNRow = safeToNonNegInt(operator.nrow)
+ val aNRow = safeToNonNegInt(operator.A.nrow)
+
+ val rddA = srcA.toDrmRdd()
+ val rddB = srcB.toDrmRdd()
+
+ // Approximate number of final partitions. We take bigger partitions as our guide to number of
+ // elements per partition. TODO: do it better.
+ // Elements per partition, bigger of two operands.
+ val epp = aNRow.toDouble * prodNRow / rddA.partitions.size max aNRow.toDouble * prodNCol /
+ rddB.partitions.size
+
+ // Number of partitions we want to converge to in the product. For now we simply extrapolate that
+ // assuming product density and operand densities being about the same; and using the same element
+ // per partition number in the product as the bigger of two operands.
+ val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt min prodNRow
+
+ if (log.isDebugEnabled) log.debug(s"AtB mmul: #parts ${numProductPartitions} for $prodNRow x $prodNCol geometry.")
+
+ val zipped = if (zippable) {
+
+ debug("mmul-A'B - zip: are identically distributed, performing row-wise zip.")
+
+ val blockdRddA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
+ val blockdRddB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
+
+ blockdRddA
+
+ // Zip
+ .zip(other = blockdRddB)
+
+ // Throw away the keys
+ .map { case ((_, blockA), (_, blockB)) => blockA -> blockB}
+
+ } else {
+
+ debug("mmul-A'B: cogroup for non-identically distributed stuff.")
+
+ // To take same route, we'll join stuff row-wise, blockify it here and then proceed with the
+ // same computation path. Although it is possible we could shave off one shuffle here. TBD.
+
+ rddA
+
+ // Do full join. We can't get away with partial join because it is going to lose some rows
+ // in case we have missing rows on either side.
+ .cogroup(other = rddB, numPartitions = rddA.partitions.size max rddB.partitions.size )
+
+
+ // Merge groups.
+ .mapPartitions{iter =>
+
+ val aRows = new ArrayBuffer[Vector](1000)
+ val bRows = new ArrayBuffer[Vector](1000)
+
+ // Populate hanging row buffs
+ iter.foreach{case (_, (arowbag,browbag)) =>
+
+ // Sum up all vectors, if any, for a row. If we have > 1, that means the original matrix had
+ // non-uniquely keyed rows which is generally a matrix format inconsistency (should not
+ // happen).
+ aRows += (if (arowbag.isEmpty)
+ new SequentialAccessSparseVector(prodNRow)
+ else arowbag.reduce(_ += _))
+
+ bRows += (if (browbag.isEmpty)
+ new SequentialAccessSparseVector(prodNCol)
+ else browbag.reduce(_ += _))
+ }
+
+ // Transform collection of vectors into blocks.
+ val blockNRow = aRows.size
+ assert(blockNRow == bRows.size)
+
+ val aBlock:Matrix = new SparseRowMatrix(blockNRow, prodNRow, aRows.toArray)
+ val bBlock:Matrix = new SparseRowMatrix(blockNRow, prodNCol, bRows.toArray)
+
+ // Form pairwise result
+ Iterator(aBlock -> bBlock)
+ }
+ }
+
+ computeAtBZipped3(pairwiseRdd = zipped, nrow = prodNRow, ancol = prodNRow, bncol = aNRow,
+ numPartitions = numProductPartitions)
+
+ }
+ /**
+ * Compute, combine and accumulate outer products for every key. The incoming tuple structure
+ * is (partNo, (vecA, vecB)), so for every `partNo` we compute an outer product of the form {{{
+ * vecA cross vecB
+ * }}}
+ * @param pairwiseRdd
+ * @return
+ */
+ @deprecated("slow, will remove", since = "0.10.2")
+ private[sparkbindings] def combineOuterProducts(pairwiseRdd: RDD[(Int, (Vector, Vector))], numPartitions: Int) = {
+
+ pairwiseRdd
+
+ // Reduce individual partitions
+ .combineByKey(createCombiner = (t: (Vector, Vector)) => {
+
+ val vecA = t._1
+ val vecB = t._2
+
+ // Create partition accumulator. Generally, summation of outer products probably calls for
+ // dense accumulators. However, let's assume extremely sparse cases are still possible, and
+ // by default assume any sparse case is an extremely sparse case. May need to tweak further.
+ val mxC: Matrix = if (!vecA.isDense && !vecB.isDense)
+ new SparseRowMatrix(vecA.length, vecB.length)
+ else
+ new DenseMatrix(vecA.length, vecB.length)
+
+ // Add outer product of arow and bRowFrag to mxC
+ addOuterProduct(mxC, vecA, vecB)
+
+ }, mergeValue = (mxC: Matrix, t: (Vector, Vector)) => {
+ // Merge of a combiner with another outer product fragment.
+ val vecA = t._1
+ val vecB = t._2
+
+ addOuterProduct(mxC, vecA, vecB)
+
+ }, mergeCombiners = (mxC1: Matrix, mxC2: Matrix) => {
+
+ // Merge of 2 combiners.
+ mxC1 += mxC2
+
+ }, numPartitions = numPartitions)
+ }
+
+ private[sparkbindings] def computeAtBZipped3[A: ClassTag](pairwiseRdd: RDD[(Matrix, Matrix)], nrow: Int,
+ ancol: Int, bncol: Int, numPartitions: Int) = {
+
+ val ranges = computeEvenSplits(nrow, numPartitions)
+
+ val rdd = pairwiseRdd.flatMap{ case (blockA, blockB) ⇒
+
+ // Handling Pat's microscopic cases: slicing doesn't work well on a 0-row matrix. This
+ // probably should be fixed in the in-core matrix implementations.
+ if (blockA.nrow == 0)
+ Iterator.empty
+ else
+ // Output each partial outer product with its correspondent partition #.
+ Iterator.tabulate(numPartitions) {part ⇒
+
+ val mBlock = blockA(::, ranges(part)).t %*% blockB
+ part → mBlock
+ }
+ }
+
+ // Reduce.
+ .reduceByKey(_ += _, numPartitions = numPartitions)
+
+ // Produce keys
+ .map { case (blockKey, block) ⇒ ranges(blockKey).toArray → block }
+
+ debug(s"A'B mmul #parts: ${rdd.partitions.size}.")
+
+ rdd
}
+ private[sparkbindings] def computeAtBZipped2[A: ClassTag](zipped: RDD[(DrmTuple[A], DrmTuple[A])], nrow: Long,
+ ancol: Int, bncol: Int, numPartitions: Int) = {
+
+ // The plan of this approach is to send a_i and parts of b_i to partition reducers, which
+ // do the outer-product sum update locally (instead of sending outer blocks). This should minimize
+ // IO expense; also, an in-place partition block accumulator update should be much more efficient
+ // than forming outer block matrices and performing matrix-on-matrix +.
+ // Figure out the approximate block height per partition of the result.
+ val blockHeight = safeToNonNegInt((nrow - 1) / numPartitions) + 1
-// private[sparkbindings] def atb_nograph()
+ val partitionedRdd = zipped
+
+ // Split B-rows into partitions using blockHeight
+ .mapPartitions { iter =>
+
+ val offsets = (0 until numPartitions).map(_ * blockHeight)
+ val ranges = offsets.map(offs => offs until (offs + blockHeight min ancol))
+
+ // Transform into series of (part -> (arow, part-brow)) tuples (keyed by part #).
+ iter.flatMap { case ((_, arow), (_, brow)) =>
+
+ ranges.view.zipWithIndex.map { case (arange, partNum) =>
+ partNum -> (arow(arange).cloned -> brow)
+ }
+ }
+ }
+
+ val blockRdd = combineOuterProducts(partitionedRdd, numPartitions)
+
+ // Add ordinal row keys.
+ .map { case (blockNum, block) =>
+
+ // Starting key
+ var offset = blockNum * blockHeight
+
+ var keys = Array.tabulate(block.nrow)(offset + _)
+ keys -> block
+
+ }
+
+ blockRdd
+ }
/** Given already zipped, joined rdd of rows of A' and B, compute their product A'B */
- private[sparkbindings] def computeAtBZipped[A: ClassTag](zipped:RDD[(DrmTuple[A], DrmTuple[A])],
- nrow:Long, ancol:Int, bncol:Int, blockHeight: Int) = {
+ @deprecated("slow, will remove", since = "0.10.2")
+ private[sparkbindings] def computeAtBZipped[A: ClassTag](zipped: RDD[(DrmTuple[A], DrmTuple[A])], nrow: Long,
+ ancol: Int, bncol: Int, numPartitions: Int) = {
// Since Q and A are partitioned same way,we can just zip their rows and proceed from there by
// forming outer products. Our optimizer lacks this primitive, so we will implement it using RDDs
// directly. We try to compile B' = A'Q now by collecting outer products of rows of A and Q. At
// this point we need to split the n-range of B' into a suitable number of partitions.
- val btNumParts = safeToNonNegInt((nrow - 1) / blockHeight + 1)
+ if (log.isDebugEnabled) {
+ log.debug(s"AtBZipped:zipped #parts ${zipped.partitions.size}")
+ log.debug(s"AtBZipped:Targeted #parts ${numPartitions}")
+ }
+
+ // Figure out the approximate block height per partition of the result.
+ val blockHeight = safeToNonNegInt((nrow - 1) / numPartitions) + 1
val rddBt = zipped
- // Produce outer product blocks
- .flatMap {
- case ((aKey, aRow), (qKey, qRow)) =>
- for (blockKey <- Stream.range(0, btNumParts)) yield {
- val blockStart = blockKey * blockHeight
- val blockEnd = ancol min (blockStart + blockHeight)
+ // Produce outer product blocks
+ .flatMap { case ((aKey, aRow), (qKey, qRow)) =>
+ for (blockKey <- Stream.range(0, numPartitions)) yield {
+ val blockStart = blockKey * blockHeight
+ val blockEnd = ancol min (blockStart + blockHeight)
- // Create block by cross product of proper slice of aRow and qRow
- blockKey -> (aRow(blockStart until blockEnd) cross qRow)
- }
- }
- // Combine blocks by just summing them up
- .reduceByKey {
- case (block1, block2) => block1 += block2
+ // Create block by cross product of proper slice of aRow and qRow
+ blockKey -> (aRow(blockStart until blockEnd) cross qRow)
+
+ // TODO: computing tons of cross product matrices seems to be pretty inefficient here. More
+ // likely single streaming algorithm of updates will perform much better here. So rewrite
+ // this using mapPartitions with numPartitions block accumulators.
+
+ }
}
+ // .combineByKey(
+ // createCombiner = (mx:Matrix) => mx,
+ // mergeValue = (c:Matrix,mx:Matrix) => c += mx,
+ // mergeCombiners = (c1:Matrix,c2:Matrix) => c1 += c2,
+ // numPartitions = numPartitions
+ // )
+ // Use of combineByKey doesn't look like it produces any better results than reduceByKey, so keeping
+ // reduceByKey for simplicity. Having explicit combiners doesn't mean reduceByKey doesn't combine map-side.
+ // Combine blocks by just summing them up
+ .reduceByKey((block1, block2) => block1 += block2, numPartitions)
- // Throw away block key, generate row keys instead.
- .map {
- case (blockKey, block) =>
- val blockStart = blockKey * blockHeight
- val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
- rowKeys -> block
+ // Throw away block key, generate row keys instead.
+ .map { case (blockKey, block) =>
+ val blockStart = blockKey * blockHeight
+ val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+ rowKeys -> block
}
- new DrmRddInput[Int](blockifiedSrc = Some(rddBt))
+ if (log.isDebugEnabled) log.debug(s"AtBZipped #parts ${rddBt.partitions.size}")
+
+ rddBt
}
}
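The partition sizing used in both A'B paths above follows one heuristic: take the
elements-per-partition of the bigger operand as the budget and size the product so its
partitions carry about the same element count. A pure-function sketch mirroring the
arithmetic in atb_nograph_mmul (all names invented):

    object AtBPartitionHeuristicSketch {
      def productPartitions(aNRow: Long, prodNRow: Long, prodNCol: Long,
                            aParts: Int, bParts: Int): Int = {
        // Elements per partition, bigger of the two operands.
        val epp = (aNRow.toDouble * prodNRow / aParts) max (aNRow.toDouble * prodNCol / bParts)
        // Product partitions at the same element budget, capped by product row count.
        (((prodNCol.toDouble * prodNRow / epp).ceil.toInt) min prodNRow.toInt) max 1
      }

      def main(args: Array[String]): Unit =
        // 1M x 100 A with 1M x 50 B on 10 partitions each: the 100 x 50 product is tiny.
        println(productPartitions(1000000L, 100L, 50L, aParts = 10, bParts = 10)) // 1
    }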
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
index 94c3f06..629accd 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
@@ -15,22 +15,22 @@ object Ax {
def ax_with_broadcast[K: ClassTag](op: OpAx[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
- val rddA = srcA.toBlockifiedDrmRdd()
- implicit val sc:DistributedContext = rddA.sparkContext
+ val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
+ implicit val sc: DistributedContext = rddA.sparkContext
val bcastX = drmBroadcast(op.x)
- val rdd = rddA
- // Just multiply the blocks
- .map({
- case (keys, blockA) => keys -> (blockA %*% bcastX).toColMatrix
- })
+ val rdd: BlockifiedDrmRdd[K] = rddA
+
+ // Just multiply the blocks
+ .map { case (keys, blockA) ⇒ keys → (blockA %*% bcastX).toColMatrix }
- new DrmRddInput(blockifiedSrc = Some(rdd))
+ new DrmRddInput(Right(rdd))
}
def atx_with_broadcast(op: OpAtx, srcA: DrmRddInput[Int]): DrmRddInput[Int] = {
- val rddA = srcA.toBlockifiedDrmRdd()
+
+ val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
implicit val dc:DistributedContext = rddA.sparkContext
val bcastX = drmBroadcast(op.x)
@@ -52,10 +52,10 @@ object Ax {
// It is ridiculous, but in this scheme we will have to re-parallelize it again in order to plug
// it back as drm blockified rdd
- val rdd = dc.parallelize(Seq(inCoreM), numSlices = 1)
- .map(block => Array.tabulate(block.nrow)(i => i) -> block)
+ val rdd:BlockifiedDrmRdd[Int] = dc.parallelize(Seq(inCoreM), numSlices = 1)
+ .map{block ⇒ Array.tabulate(block.nrow)(i ⇒ i) -> block}
- new DrmRddInput(blockifiedSrc = Some(rdd))
+ rdd
}
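For Ax the broadcast pattern above means x ships to each executor once and every row
block is multiplied locally, with no shuffle at all. A toy per-block map with plain
arrays in place of Mahout types (AxSketch is an invented name):

    object AxSketch {
      // One blockified map step: (h x n) block times length-n vector.
      def blockTimes(block: Array[Array[Double]], x: Array[Double]): Array[Double] =
        block.map(row => row.zip(x).map { case (a, b) => a * b }.sum)

      def main(args: Array[String]): Unit = {
        val block = Array(Array(1.0, 2.0), Array(3.0, 4.0))
        println(blockTimes(block, Array(1.0, 1.0)).mkString(" ")) // 3.0 7.0
      }
    }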
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
index ea10ccb..4a379ec 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
@@ -18,12 +18,13 @@
package org.apache.mahout.sparkbindings.blas
import org.apache.log4j.Logger
-import scala.reflect.ClassTag
+import org.apache.mahout.sparkbindings.DrmRdd
+import reflect._
import org.apache.mahout.sparkbindings.drm.DrmRddInput
import org.apache.mahout.math._
import scalabindings._
import RLikeOps._
-import org.apache.mahout.math.drm.logical.OpCbind
+import org.apache.mahout.math.drm.logical.{OpCbindScalar, OpCbind}
import org.apache.spark.SparkContext._
/** Physical cbind */
@@ -31,6 +32,34 @@ object CbindAB {
private val log = Logger.getLogger(CbindAB.getClass)
+ def cbindAScalar[K:ClassTag](op: OpCbindScalar[K], srcA:DrmRddInput[K]) : DrmRddInput[K] = {
+ val srcRdd = srcA.toDrmRdd()
+
+ val ncol = op.A.ncol
+ val x = op.x
+
+ val fixedRdd = if (classTag[K] == ClassTag.Int && x != 0.0)
+ fixIntConsistency(op.asInstanceOf[OpCbindScalar[Int]],
+ src = srcRdd.asInstanceOf[DrmRdd[Int]]).asInstanceOf[DrmRdd[K]]
+ else srcRdd
+
+ val left = op.leftBind
+
+ val resultRdd = fixedRdd.map { case (key, vec) =>
+ val newVec = vec.like(ncol + 1)
+ if (left) {
+ newVec(1 to ncol) := vec
+ newVec(0) = x
+ } else {
+ newVec(0 until ncol) := vec
+ newVec(ncol) = x
+ }
+ key -> newVec
+ }
+
+ resultRdd
+ }
+
def cbindAB_nograph[K: ClassTag](op: OpCbind[K], srcA: DrmRddInput[K], srcB: DrmRddInput[K]): DrmRddInput[K] = {
val a = srcA.toDrmRdd()
@@ -88,7 +117,7 @@ object CbindAB {
}
}
- new DrmRddInput(rowWiseSrc = Some(op.ncol -> rdd))
+ rdd
}
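cbindAScalar above reduces to appending or prepending one constant column per row
vector; the fixIntConsistency detour only matters for int-keyed matrices where a
nonzero x would otherwise never materialize in missing rows. A toy version of the
column bind itself (CbindScalarSketch is an invented name):

    object CbindScalarSketch {
      def cbindScalar(row: Array[Double], x: Double, leftBind: Boolean): Array[Double] =
        if (leftBind) x +: row else row :+ x

      def main(args: Array[String]): Unit = {
        println(cbindScalar(Array(1.0, 2.0), x = 1.0, leftBind = true).mkString(" "))  // 1.0 1.0 2.0
        println(cbindScalar(Array(1.0, 2.0), x = 1.0, leftBind = false).mkString(" ")) // 1.0 2.0 1.0
      }
    }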
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
index a3caac7..4cd9a74 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
@@ -25,12 +25,14 @@ import org.apache.mahout.sparkbindings.DrmRdd
class DrmRddOps[K: ClassTag](private[blas] val rdd: DrmRdd[K]) {
+ /** Turn RDD into dense row-wise vectors if density threshold is exceeded. */
def densify(threshold: Double = 0.80): DrmRdd[K] = rdd.map({
case (key, v) =>
val vd = if (!v.isDense && v.getNumNonZeroElements > threshold * v.length) new DenseVector(v) else v
key -> vd
})
+ /** Turn rdd into sparse RDD if density threshold is underrun. */
def sparsify(threshold: Double = 0.80): DrmRdd[K] = rdd.map({
case (key, v) =>
val vs = if (v.isDense() && v.getNumNonZeroElements <= threshold * v.length)
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
index 4c68c9a..2933ddc 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
@@ -17,6 +17,7 @@
package org.apache.mahout.sparkbindings.blas
+import org.apache.mahout.math.drm.logical.OpMapBlock
import org.apache.mahout.sparkbindings.drm.DrmRddInput
import org.apache.mahout.math.drm.BlockMapFunc
import org.apache.mahout.math.scalabindings.RLikeOps._
@@ -24,12 +25,13 @@ import scala.reflect.ClassTag
object MapBlock {
- def exec[S, R:ClassTag](src: DrmRddInput[S], ncol:Int, bmf:BlockMapFunc[S,R]): DrmRddInput[R] = {
+ def exec[S, R:ClassTag](src: DrmRddInput[S], operator:OpMapBlock[S,R]): DrmRddInput[R] = {
- // We can't use attributes to avoid putting the whole this into closure.
-
- val rdd = src.toBlockifiedDrmRdd()
- .map(blockTuple => {
+ // We can't use the operator's attributes directly in the closure; that would pull the
+ // whole object into the closure.
+ val bmf = operator.bmf
+ val ncol = operator.ncol
+ val rdd = src.toBlockifiedDrmRdd(operator.A.ncol).map(blockTuple => {
val out = bmf(blockTuple)
assert(out._2.nrow == blockTuple._2.nrow, "block mapping must return same number of rows.")
@@ -37,7 +39,8 @@ object MapBlock {
out
})
- new DrmRddInput(blockifiedSrc = Some(rdd))
+
+ rdd
}
}
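The contract asserted above is that a block map function preserves the row count of
each block, so row keys keep lining up with block rows. A self-contained sketch of that
contract, with an array pair standing in for the (keys, Matrix) block tuple
(MapBlockSketch is an invented name):

    object MapBlockSketch {
      type Block[K] = (Array[K], Array[Array[Double]])

      def mapBlock[K](in: Block[K])(f: Block[K] => Block[K]): Block[K] = {
        val out = f(in)
        // Same check as above: the mapped block must keep the row count.
        assert(out._2.length == in._2.length, "block mapping must return same number of rows.")
        out
      }

      def main(args: Array[String]): Unit = {
        val block: Block[Int] = (Array(0, 1), Array(Array(1.0, 2.0), Array(3.0, 4.0)))
        val (keys, m) = mapBlock(block) { case (k, b) => k -> b.map(_.map(_ + 1.0)) }
        println(keys.mkString(",") + " / " + m.map(_.mkString(" ")).mkString("; "))
      }
    }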
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
index e73376d..0434a72 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
@@ -1,50 +1,58 @@
package org.apache.mahout.sparkbindings.blas
+import org.apache.mahout.sparkbindings.drm
+
import scala.reflect.ClassTag
import org.apache.mahout.sparkbindings.drm.DrmRddInput
import org.apache.mahout.math.drm.logical.OpPar
import org.apache.spark.rdd.RDD
+import scala.math._
+
+import org.apache.mahout.logging._
/** Physical adjustment of parallelism */
object Par {
+ private final implicit val log = getLog(Par.getClass)
+
def exec[K: ClassTag](op: OpPar[K], src: DrmRddInput[K]): DrmRddInput[K] = {
- def adjust[T](rdd: RDD[T]): RDD[T] =
- if (op.minSplits > 0) {
- if (rdd.partitions.size < op.minSplits)
- rdd.coalesce(op.minSplits, shuffle = true)
- else rdd.coalesce(rdd.partitions.size)
- } else if (op.exactSplits > 0) {
- if (op.exactSplits < rdd.partitions.size)
- rdd.coalesce(numPartitions = op.exactSplits, shuffle = false)
- else if (op.exactSplits > rdd.partitions.size)
- rdd.coalesce(numPartitions = op.exactSplits, shuffle = true)
- else
- rdd.coalesce(rdd.partitions.size)
- } else if (op.exactSplits == -1 && op.minSplits == -1) {
-
- // auto adjustment, try to scale up to either x1Size or x2Size.
- val clusterSize = rdd.context.getConf.get("spark.default.parallelism", "1").toInt
-
- val x1Size = (clusterSize * .95).ceil.toInt
- val x2Size = (clusterSize * 1.9).ceil.toInt
-
- if (rdd.partitions.size <= x1Size)
- rdd.coalesce(numPartitions = x1Size, shuffle = true)
- else if (rdd.partitions.size <= x2Size)
- rdd.coalesce(numPartitions = x2Size, shuffle = true)
- else
- rdd.coalesce(numPartitions = rdd.partitions.size)
- } else rdd.coalesce(rdd.partitions.size)
-
- if (src.isBlockified) {
- val rdd = src.toBlockifiedDrmRdd()
- new DrmRddInput[K](blockifiedSrc = Some(adjust(rdd)))
+ val srcBlockified = src.isBlockified
+
+ val srcRdd = if (srcBlockified) src.toBlockifiedDrmRdd(op.ncol) else src.toDrmRdd()
+ val srcNParts = srcRdd.partitions.size
+
+ // To what size?
+ val targetParts = if (op.minSplits > 0) srcNParts max op.minSplits
+ else if (op.exactSplits > 0) op.exactSplits
+ else /* auto adjustment */ {
+ val stdParallelism = srcRdd.context.getConf.get("spark.default.parallelism", "1").toInt
+ val x1 = 0.95 * stdParallelism
+ if (srcNParts <= ceil(x1)) ceil(x1).toInt else ceil(2 * x1).toInt
+ }
+
+ debug(s"par $srcNParts => $targetParts.")
+
+ if (targetParts > srcNParts) {
+
+ // Expanding. Always requires deblockified stuff. May require re-shuffling.
+ val rdd = src.toDrmRdd().repartition(numPartitions = targetParts)
+
+ rdd
+
+ } else if (targetParts < srcNParts) {
+ // Shrinking.
+
+ if (srcBlockified) {
+ drm.rbind(src.toBlockifiedDrmRdd(op.ncol).coalesce(numPartitions = targetParts))
+ } else {
+ src.toDrmRdd().coalesce(numPartitions = targetParts)
+ }
} else {
- val rdd = src.toDrmRdd()
- new DrmRddInput[K](rowWiseSrc = Some(op.ncol -> adjust(rdd)))
+ // no adjustment required.
+ src
}
+
}
}
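The rewritten Par.exec boils down to a small sizing policy: grow to at least minSplits,
honor exactSplits verbatim, or auto-scale toward roughly 0.95x / 1.9x of
spark.default.parallelism. A pure-function restatement of that policy for a quick check
(ParPolicySketch is an invented name):

    object ParPolicySketch {
      def targetParts(srcParts: Int, minSplits: Int, exactSplits: Int,
                      defaultParallelism: Int): Int =
        if (minSplits > 0) srcParts max minSplits
        else if (exactSplits > 0) exactSplits
        else {
          // Auto adjustment: scale up to ~0.95x of the cluster, or ~1.9x if already bigger.
          val x1 = 0.95 * defaultParallelism
          if (srcParts <= math.ceil(x1)) math.ceil(x1).toInt else math.ceil(2 * x1).toInt
        }

      def main(args: Array[String]): Unit = {
        println(targetParts(srcParts = 4,  minSplits = -1, exactSplits = -1, defaultParallelism = 16)) // 16
        println(targetParts(srcParts = 20, minSplits = -1, exactSplits = -1, defaultParallelism = 16)) // 31
      }
    }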
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
index 5037d68..62abba6 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
@@ -31,11 +31,11 @@ object RbindAB {
// If any of the inputs is blockified, use blockified inputs
if (srcA.isBlockified || srcB.isBlockified) {
- val a = srcA.toBlockifiedDrmRdd()
- val b = srcB.toBlockifiedDrmRdd()
+ val a = srcA.toBlockifiedDrmRdd(op.A.ncol)
+ val b = srcB.toBlockifiedDrmRdd(op.B.ncol)
// Union seems to be fine; it indeed just does partition-level unionization, no shuffles.
- new DrmRddInput(blockifiedSrc = Some(a ++ b))
+ a ++ b
} else {
@@ -43,7 +43,7 @@ object RbindAB {
val a = srcA.toDrmRdd()
val b = srcB.toDrmRdd()
- new DrmRddInput(rowWiseSrc = Some(op.ncol -> (a ++ b)))
+ a ++ b
}
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
index d0a50b5..0284ba2 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
@@ -22,6 +22,6 @@ object Slicing {
// TODO: we probably need to re-shuffle result or at least cut down the partitions of 0 size
- new DrmRddInput(rowWiseSrc = Some(ncol -> rdd))
+ rdd
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
index 9a50afa..6b8513f 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
@@ -17,13 +17,17 @@
package org.apache.mahout.sparkbindings
+import org.apache.mahout.sparkbindings
+import org.apache.spark.rdd.RDD
+
import scala.reflect.ClassTag
-import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, DrmRddInput}
import org.apache.spark.SparkContext._
import org.apache.mahout.math._
import org.apache.mahout.math.drm._
import scalabindings._
import RLikeOps._
+import collection._
+import JavaConversions._
/**
* This validation contains distributed algorithms that distributed matrix expression optimizer picks
@@ -31,8 +35,81 @@ import RLikeOps._
*/
package object blas {
- implicit def drmRdd2ops[K:ClassTag](rdd:DrmRdd[K]):DrmRddOps[K] = new DrmRddOps[K](rdd)
+ implicit def drmRdd2ops[K: ClassTag](rdd: DrmRdd[K]): DrmRddOps[K] = new DrmRddOps[K](rdd)
+
+
+ /**
+ * Rekey matrix dataset keys to consecutive int keys.
+ * @param rdd incoming matrix row-wise dataset
+ *
+ * @param computeMap if true, also compute mapping between old and new keys
+ * @tparam K existing key parameter
+ * @return
+ */
+ private[mahout] def rekeySeqInts[K: ClassTag](rdd: DrmRdd[K], computeMap: Boolean = true): (DrmRdd[Int],
+ Option[RDD[(K, Int)]]) = {
+
+ // Spark context please.
+ val sctx = rdd.context
+ import sctx._
+
+ // First, compute partition sizes.
+ val partSizes = rdd.mapPartitionsWithIndex((part, iter) => Iterator(part -> iter.size))
+
+ // Collect in-core
+ .collect()
+
+ // Starting indices
+ var startInd = new Array[Int](rdd.partitions.size)
+
+ // Save counts
+ for (pc <- partSizes) startInd(pc._1) = pc._2
+
+ // compute cumulative sum
+ val siBcast = broadcast(startInd.scanLeft(0)(_ + _).init)
+
+ // Compute key -> int index map:
+ val keyMap = if (computeMap) {
+ Some(rdd
+
+ // Process individual partition with index, output `key -> index` tuple
+ .mapPartitionsWithIndex((part, iter) => {
+
+ // Start index for this partition
+ val si = siBcast.value(part)
+ iter.zipWithIndex.map { case ((key, _), index) => key -> (index + si)}
+ })) // Some
+
+ } else {
+
+ // We were not asked to compute the key mapping.
+ None
+ }
+
+ // Finally, do the transform
+ val intRdd = rdd
+
+ // Re-number each partition
+ .mapPartitionsWithIndex((part, iter) => {
+ // Start index for this partition
+ val si = siBcast.value(part)
+
+ // Iterate over data by producing sequential row index and retaining vector value.
+ iter.zipWithIndex.map { case ((_, vec), ind) => si + ind -> vec}
+ })
+
+ // Finally, return drm -> keymap result
+
+ intRdd -> keyMap
+
+ }
+
+
+ /**
+ * Fills in missing rows in an Int-indexed matrix by putting in empty row vectors for the missing
+ * keys.
+ */
private[mahout] def fixIntConsistency(op: DrmLike[Int], src: DrmRdd[Int]): DrmRdd[Int] = {
if (op.canHaveMissingRows) {
@@ -45,20 +122,20 @@ package object blas {
// Compute the fix.
sc
- // Bootstrap full key set
- .parallelize(0 until dueRows, numSlices = rdd.partitions.size max 1)
+ // Bootstrap full key set
+ .parallelize(0 until dueRows, numSlices = rdd.partitions.size max 1)
- // Enable PairedFunctions
- .map(_ -> Unit)
+ // Enable PairedFunctions
+ .map(_ -> Unit)
- // Cogroup with all rows
- .cogroup(other = rdd)
+ // Cogroup with all rows
+ .cogroup(other = rdd)
- // Filter out out-of-bounds
- .filter { case (key, _) => key >= 0 && key < dueRows}
+ // Filter out out-of-bounds
+ .filter { case (key, _) => key >= 0 && key < dueRows}
- // Coalesce and output RHS
- .map { case (key, (seqUnit, seqVec)) =>
+ // Coalesce and output RHS
+ .map { case (key, (seqUnit, seqVec)) =>
val acc = seqVec.headOption.getOrElse(new SequentialAccessSparseVector(dueCols))
val vec = if (seqVec.size > 0) (acc /: seqVec.tail)(_ + _) else acc
key -> vec
@@ -68,4 +145,77 @@ package object blas {
}
+ /** Method to do `mxC += a cross b` in-place, a bit more efficiently than this expression does. */
+ def addOuterProduct(mxC: Matrix, a: Vector, b: Vector): Matrix = {
+
+ // Try to pay attention to density a bit here when computing and adding the outer product of
+ // arow and brow fragment.
+ if (b.isDense)
+ for (ela <- a.nonZeroes) mxC(ela.index, ::) := { (i, x) => x + ela * b(i)}
+ else
+ for (ela <- a.nonZeroes; elb <- b.nonZeroes()) mxC(ela.index, elb.index) += ela * elb
+
+ mxC
+ }
+
+ /**
+ * Compute ranges of more or less even splits of the total `nrow` rows.
+ *
+ * @param nrow
+ * @param numSplits
+ * @return
+ */
+ @inline
+ private[blas] def computeEvenSplits(nrow: Long, numSplits: Int): IndexedSeq[Range] = {
+ require(numSplits <= nrow, "Requested number of splits is greater than the number of data points.")
+ require(nrow >= 1)
+ require(numSplits >= 1)
+
+ // Base split -- what is our base split size?
+ val baseSplit = safeToNonNegInt(nrow / numSplits)
+
+ // Slack -- how many splits will have to be incremented by 1 though?
+ val slack = safeToNonNegInt(nrow % numSplits)
+
+ // Compute ranges. We need to set ranges so that numSplits - slack splits have size of baseSplit;
+ // and `slack` splits have size baseSplit + 1. Here is how we do it: First, we compute the range
+ // offsets:
+ val offsets = (0 to numSplits).map(i => i * (baseSplit + 1) - (0 max i - slack))
+ // And then we connect the ranges using gaps between offsets:
+ offsets.sliding(2).map(offs => offs(0) until offs(1)).toIndexedSeq
+ }
+
+ /**
+ * Estimate number of partitions for the product of A %*% B.
+ *
+ * We take the average per-partition element count of the product as the higher of those of A and B
+ * (preferring larger partitions of the operands).
+ *
+ * @param anrow A.nrow
+ * @param ancol A.ncol
+ * @param bncol B.ncol
+ * @param aparts partitions in A
+ * @param bparts partitions in B
+ * @return recommended partitions
+ */
+ private[blas] def estimateProductPartitions(anrow:Long, ancol:Long, bncol:Long, aparts:Int, bparts:Int):Int = {
+
+ // Compute per-partition element density in A
+ val eppA = anrow.toDouble * ancol / aparts
+
+ // Compute per-partition element density in B
+ val eppB = ancol.toDouble * bncol / bparts
+
+ // Take the maximum element density into account. Is it good enough?
+ val epp = eppA max eppB
+
+ // product partitions
+ val prodParts = anrow * bncol / epp
+
+ val nparts = math.round(prodParts).toInt max 1
+
+ // Constrain nparts to maximum of anrow to prevent guaranteed empty partitions.
+ if (nparts > anrow) anrow.toInt else nparts
+ }
+
}
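computeEvenSplits above replaces the ad-hoc base-size/slack arithmetic that used to be
duplicated inside the A'A and A'B operators. A self-contained restatement with a worked
example, useful as a sanity check (EvenSplitsSketch is an invented name; math.max is
spelled out where the original relies on Scala operator precedence):

    object EvenSplitsSketch {
      def computeEvenSplits(nrow: Long, numSplits: Int): IndexedSeq[Range] = {
        val baseSplit = (nrow / numSplits).toInt // base split size
        val slack = (nrow % numSplits).toInt     // how many splits get one extra row
        // Offsets place `slack` splits of size baseSplit + 1 first, then the base-size ones.
        val offsets = (0 to numSplits).map(i => i * (baseSplit + 1) - math.max(0, i - slack))
        offsets.sliding(2).map(offs => offs(0) until offs(1)).toIndexedSeq
      }

      def main(args: Array[String]): Unit =
        // 10 rows over 4 splits: sizes 3, 3, 2, 2.
        println(computeEvenSplits(10, 4).map(_.size).mkString(","))
    }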
[4/4] mahout git commit: MAHOUT-1660 MAHOUT-1713 MAHOUT-1714
MAHOUT-1715 MAHOUT-1716 MAHOUT-1717 MAHOUT-1718 MAHOUT-1719 MAHOUT-1720
MAHOUT-1721 MAHOUT-1722 MAHOUT-1723 MAHOUT-1724 MAHOUT-1725 MAHOUT-1726
MAHOUT-1727 MAHOUT-1728 MAHOUT-1729 MAHOUT-1730 M
Posted by dl...@apache.org.
MAHOUT-1660 MAHOUT-1713 MAHOUT-1714 MAHOUT-1715 MAHOUT-1716 MAHOUT-1717 MAHOUT-1718 MAHOUT-1719 MAHOUT-1720 MAHOUT-1721 MAHOUT-1722 MAHOUT-1723 MAHOUT-1724 MAHOUT-1725 MAHOUT-1726 MAHOUT-1727 MAHOUT-1728 MAHOUT-1729 MAHOUT-1730 MAHOUT-1731 MAHOUT-1732
Cumulative patch for the above issues. Closes apache/mahout#135
Squashed commit of the following:
commit c59bf8a21e1ad77dee80730772d2184b3f28a495
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 8 18:11:57 2015 -0700
handling degenerate matrix cases for rbind, cbind, and serialization (0 columns or rows)
commit 56b735e137355e174facffd409d6456360c2f8e7
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 8 16:58:34 2015 -0700
Inserting back the testing framework artifact being built. Need this as a dependency
in subordinate projects that do method testing as well.
commit 7e6ce766d06c5a2337dd9b08df7c9fa37bd9a9c8
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 8 10:22:53 2015 -0700
adding "final" for logger per comment on public PR
commit e42bcedf8521b89c7583f8e7e299c2be0c2a8de2
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Tue Jun 2 12:24:30 2015 -0700
final fixes in h2o.
fixing @deprecated warnings in atb
commit 00fb618ad0ef0e5b8aac30c88b23d2e9325ea8f8
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Tue Jun 2 12:08:13 2015 -0700
h2o stuff
commit f4e15506ed2497bc2e179e3ded9ca399fd826d15
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Tue Jun 2 11:55:30 2015 -0700
restoring merge errors in h2o module, nothing is touched here.
commit 1b892de589bccf03c41c6b2e49493472e6bd1d52
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 1 18:44:21 2015 -0700
Picking up missing changes on both sides in spark module.
TODO: Pat's similarity driver tests fail, it seems, on some degenerate splitting in the optimizer. Need to take a look.
commit 3422046b94c03d43a91f091e38532339cf890351
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 1 18:13:03 2015 -0700
Adding missing change. uncommenting performance in-core tests.
commit 7aa5de5431ad01c37e8069956287194c97b37b06
Author: Dmitriy Lyubimov <dl...@apache.org>
Date: Mon Jun 1 17:54:17 2015 -0700
Initial merge with ora private-review branch. Stuff compiles up to h2o (which still needs some unimplemented pieces added), and ssvd tests
are failing in the math-scala module due to lack of matrix flavor on mmul. They are not failing in the private branch though -- some changes still
have not been merged?
Most changes i care about seems to be there though.
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/8a6b805a
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/8a6b805a
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/8a6b805a
Branch: refs/heads/mahout-0.10.x
Commit: 8a6b805a3c15080f28be050a83b1ad26a60f21e6
Parents: e6d24b9
Author: Dmitriy Lyubimov <dl...@apache.org>
Authored: Wed Jun 10 17:08:37 2015 -0700
Committer: Dmitriy Lyubimov <dl...@apache.org>
Committed: Wed Jun 10 17:08:37 2015 -0700
----------------------------------------------------------------------
CHANGELOG | 41 +++
bin/mahout | 4 +-
.../apache/mahout/h2obindings/drm/H2OBCast.java | 12 +
.../apache/mahout/h2obindings/H2OEngine.scala | 66 ++--
.../org/apache/mahout/logging/package.scala | 73 ++++
.../apache/mahout/math/decompositions/DQR.scala | 9 +-
.../mahout/math/decompositions/DSSVD.scala | 19 +-
.../mahout/math/decompositions/SSVD.scala | 2 +-
.../org/apache/mahout/math/drm/BCast.scala | 3 +-
.../mahout/math/drm/CheckpointedOps.scala | 7 +
.../mahout/math/drm/DistributedEngine.scala | 125 ++++---
.../mahout/math/drm/DrmDoubleScalarOps.scala | 8 +-
.../org/apache/mahout/math/drm/DrmLikeOps.scala | 7 +-
.../apache/mahout/math/drm/RLikeDrmOps.scala | 55 ++-
.../math/drm/logical/AbstractUnaryOp.scala | 2 +-
.../math/drm/logical/CheckpointAction.scala | 3 +-
.../mahout/math/drm/logical/OpAewScalar.scala | 6 +-
.../math/drm/logical/OpAewUnaryFunc.scala | 47 +++
.../math/drm/logical/OpAewUnaryFuncFusion.scala | 62 ++++
.../mahout/math/drm/logical/OpCbind.scala | 2 +-
.../mahout/math/drm/logical/OpCbindScalar.scala | 37 ++
.../mahout/math/drm/logical/OpMapBlock.scala | 2 +-
.../mahout/math/drm/logical/TEwFunc.scala | 37 ++
.../org/apache/mahout/math/drm/package.scala | 50 ++-
.../math/scalabindings/DoubleScalarOps.scala | 42 ---
.../apache/mahout/math/scalabindings/MMul.scala | 295 ++++++++++++++++
.../mahout/math/scalabindings/MatrixOps.scala | 87 ++++-
.../scalabindings/RLikeDoubleScalarOps.scala | 63 ++++
.../math/scalabindings/RLikeMatrixOps.scala | 77 ++++-
.../mahout/math/scalabindings/RLikeOps.scala | 4 +-
.../math/scalabindings/RLikeTimesOps.scala | 28 --
.../math/scalabindings/RLikeVectorOps.scala | 29 +-
.../mahout/math/scalabindings/VectorOps.scala | 45 ++-
.../mahout/math/scalabindings/package.scala | 81 ++++-
.../org/apache/mahout/util/IOUtilsScala.scala | 64 ++++
.../mahout/math/drm/DrmLikeOpsSuiteBase.scala | 20 ++
.../mahout/math/drm/DrmLikeSuiteBase.scala | 3 +-
.../mahout/math/drm/RLikeDrmOpsSuiteBase.scala | 94 ++++-
.../math/scalabindings/MatrixOpsSuite.scala | 33 +-
.../scalabindings/RLikeMatrixOpsSuite.scala | 276 +++++++++++++++
.../math/scalabindings/VectorOpsSuite.scala | 19 +-
.../org/apache/mahout/math/AbstractMatrix.java | 24 +-
.../org/apache/mahout/math/ConstantVector.java | 5 +
.../apache/mahout/math/DelegatingVector.java | 5 +
.../org/apache/mahout/math/DenseMatrix.java | 9 +-
.../mahout/math/DenseSymmetricMatrix.java | 2 +
.../org/apache/mahout/math/DenseVector.java | 5 +
.../org/apache/mahout/math/DiagonalMatrix.java | 14 +
.../math/FileBasedSparseBinaryMatrix.java | 5 +
.../mahout/math/FunctionalMatrixView.java | 9 +
.../java/org/apache/mahout/math/Matrices.java | 18 +-
.../java/org/apache/mahout/math/Matrix.java | 7 +
.../apache/mahout/math/MatrixVectorView.java | 5 +
.../java/org/apache/mahout/math/MatrixView.java | 6 +
.../org/apache/mahout/math/NamedVector.java | 5 +
.../apache/mahout/math/PermutedVectorView.java | 5 +
.../mahout/math/RandomAccessSparseVector.java | 5 +
.../math/SequentialAccessSparseVector.java | 7 +
.../apache/mahout/math/SparseColumnMatrix.java | 20 +-
.../org/apache/mahout/math/SparseMatrix.java | 30 +-
.../org/apache/mahout/math/SparseRowMatrix.java | 7 +
.../mahout/math/TransposedMatrixView.java | 147 ++++++++
.../org/apache/mahout/math/UpperTriangular.java | 9 +
.../java/org/apache/mahout/math/Vector.java | 8 +
.../org/apache/mahout/math/VectorIterable.java | 4 +
.../java/org/apache/mahout/math/VectorView.java | 7 +-
.../org/apache/mahout/math/flavor/BackEnum.java | 26 ++
.../apache/mahout/math/flavor/MatrixFlavor.java | 82 +++++
.../math/flavor/TraversingStructureEnum.java | 48 +++
.../org/apache/mahout/math/MatricesTest.java | 4 +-
.../math/hadoop/DistributedRowMatrix.java | 5 +
.../stochasticsvd/qr/GivensThinSolver.java | 5 +
.../sparkbindings/shell/MahoutSparkILoop.scala | 15 +-
spark/pom.xml | 16 +
.../org/apache/mahout/common/DrmMetadata.scala | 17 +
.../org/apache/mahout/common/HDFSUtil.scala | 4 +-
.../apache/mahout/common/Hadoop1HDFSUtil.scala | 18 +-
.../mahout/sparkbindings/SparkEngine.scala | 227 +++++++++----
.../apache/mahout/sparkbindings/blas/ABt.scala | 200 +++++++++--
.../apache/mahout/sparkbindings/blas/AewB.scala | 75 +++-
.../mahout/sparkbindings/blas/AinCoreB.scala | 16 +-
.../apache/mahout/sparkbindings/blas/At.scala | 15 +-
.../apache/mahout/sparkbindings/blas/AtA.scala | 247 +++++++++-----
.../apache/mahout/sparkbindings/blas/AtB.scala | 339 ++++++++++++++++---
.../apache/mahout/sparkbindings/blas/Ax.scala | 24 +-
.../mahout/sparkbindings/blas/CbindAB.scala | 35 +-
.../mahout/sparkbindings/blas/DrmRddOps.scala | 2 +
.../mahout/sparkbindings/blas/MapBlock.scala | 15 +-
.../apache/mahout/sparkbindings/blas/Par.scala | 74 ++--
.../mahout/sparkbindings/blas/RbindAB.scala | 8 +-
.../mahout/sparkbindings/blas/Slicing.scala | 2 +-
.../mahout/sparkbindings/blas/package.scala | 174 +++++++++-
.../drm/CheckpointedDrmSpark.scala | 41 ++-
.../drm/CheckpointedDrmSparkOps.scala | 2 +-
.../mahout/sparkbindings/drm/DrmRddInput.scala | 18 +-
.../mahout/sparkbindings/drm/SparkBCast.scala | 2 +
.../mahout/sparkbindings/drm/package.scala | 23 +-
.../io/GenericMatrixKryoSerializer.scala | 189 +++++++++++
.../io/MahoutKryoRegistrator.scala | 28 +-
.../io/UnsupportedSerializer.scala | 31 ++
.../sparkbindings/io/VectorKryoSerializer.scala | 252 ++++++++++++++
.../apache/mahout/sparkbindings/package.scala | 118 +++----
.../sparkbindings/SparkBindingsSuite.scala | 12 +-
.../mahout/sparkbindings/blas/BlasSuite.scala | 4 +-
.../sparkbindings/drm/DrmLikeOpsSuite.scala | 17 +-
.../sparkbindings/drm/RLikeDrmOpsSuite.scala | 63 ++++
.../mahout/sparkbindings/io/IOSuite.scala | 195 +++++++++++
.../test/DistributedSparkSuite.scala | 15 +-
.../test/LoggerConfiguration.scala | 2 +-
109 files changed, 4310 insertions(+), 702 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 89c2cbc..dd65b0e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,47 @@ Mahout Change Log
Release 0.10.2 - unreleased
+ MAHOUT-1660: Hadoop1HDFSUtil.readDRMHeader should be taking Hadoop conf (dlyubimov)
+
+ MAHOUT-1713: Performance and parallelization improvements for AB', A'B, A'A spark physical operators (dlyubimov)
+
+ MAHOUT-1714: Add MAHOUT_OPTS environment when running Spark shell (dlyubimov)
+
+ MAHOUT-1715: Closeable API for broadcast tensors (dlyubimov)
+
+ MAHOUT-1716: Scala logging style (dlyubimov)
+
+ MAHOUT-1717: allreduceBlock() operator api and Spark implementation (dlyubimov)
+
+ MAHOUT-1718: Support for conversion of any type-keyed DRM into ordinally-keyed DRM (dlyubimov)
+
+ MAHOUT-1719: Unary elementwise function operator and function fusions (dlyubimov)
+
+ MAHOUT-1720: Support 1 cbind X, X cbind 1 etc. for both Matrix and DRM (dlyubimov)
+
+ MAHOUT-1721: rowSumsMap() summary for non-int-keyed DRMs (dlyubimov)
+
+ MAHOUT-1722: DRM row sampling api (dlyubimov)
+
+ MAHOUT-1723: Optional structural "flavor" abstraction for in-core matrices (dlyubimov)
+
+ MAHOUT-1724: Optimizations of matrix-matrix in-core multiplication based on structural flavors (dlyubimov)
+
+ MAHOUT-1725: elementwise power operator ^ (dlyubimov)
+
+ MAHOUT-1726: R-like vector concatenation operator (dlyubimov)
+
+ MAHOUT-1727: Elementwise analogues of scala.math functions for tensor types (dlyubimov)
+
+ MAHOUT-1728: In-core functional assignments (dlyubimov)
+
+ MAHOUT-1729: Straighten out behavior of Matrix.iterator() and iterateNonEmpty() (dlyubimov)
+
+ MAHOUT-1730: New mutable transposition view for in-core matrices (dlyubimov)
+
+ MAHOUT-1731: Deprecate SparseColumnMatrix (dlyubimov)
+
+ MAHOUT-1732: Native support for kryo serialization of tensor types (dlyubimov)
Release 0.10.1 - 2015-05-31
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/bin/mahout
----------------------------------------------------------------------
diff --git a/bin/mahout b/bin/mahout
index ee0b918..24f01ba 100755
--- a/bin/mahout
+++ b/bin/mahout
@@ -254,12 +254,10 @@ fi
# restore ordinary behaviour
unset IFS
-
-
case "$1" in
(spark-shell)
save_stty=$(stty -g 2>/dev/null);
- "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.sparkbindings.shell.Main" $@
+ "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" "org.apache.mahout.sparkbindings.shell.Main" $@
stty sane; stty $save_stty
;;
# Spark CLI drivers go here
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
----------------------------------------------------------------------
diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
index 523a771..ebcc626 100644
--- a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
+++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
@@ -118,4 +118,16 @@ public class H2OBCast<T> implements BCast<T>, Serializable {
}
return ret;
}
+
+ /**
+ * Stop broadcasting when called on the driver side. Release any network resources.
+ *
+ */
+ @Override
+ public void close() throws IOException {
+
+ // TODO: review this. It looks like it is not really a broadcast mechanism but rather just a
+ // serialization wrapper. In which case it doesn't hold any network resources.
+
+ }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
----------------------------------------------------------------------
diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
index 173d5a0..e0ac302 100644
--- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
+++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
@@ -26,9 +26,13 @@ import org.apache.mahout.math.drm.logical._
import org.apache.mahout.h2obindings.ops._
import org.apache.mahout.h2obindings.drm._
import org.apache.mahout.h2o.common.{Hadoop1HDFSUtil, HDFSUtil}
+import org.apache.mahout.logging._
/** H2O specific non-DRM operations */
object H2OEngine extends DistributedEngine {
+
+ private final implicit val log = getLog(H2OEngine.getClass)
+
// By default, use Hadoop 1 utils
var hdfsUtils: HDFSUtil = Hadoop1HDFSUtil
@@ -119,40 +123,64 @@ object H2OEngine extends DistributedEngine {
abstract class IndexedDatasetH2O(val matrix: CheckpointedDrm[Int], val rowIDs: BiDictionary, val columnIDs: BiDictionary)
extends IndexedDataset {}
- /**
- * reads an IndexedDatasetH2O from default text delimited files
+ /**
+ * Reads an IndexedDatasetH2O from default text delimited files
* @todo unimplemented
* @param src a comma separated list of URIs to read from
* @param schema how the text file is formatted
* @return
*/
def indexedDatasetDFSRead(src: String,
- schema: Schema = DefaultIndexedDatasetReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit sc: DistributedContext):
- IndexedDatasetH2O = {
- // should log a warning when this is built but no logger here, can an H2O contributor help with this
- println("Warning: unimplemented indexedDatasetDFSReadElements." )
- throw new UnsupportedOperationException("IndexedDatasetH2O is not implemented so can't be read.")
- null.asInstanceOf[IndexedDatasetH2O]
+ schema: Schema = DefaultIndexedDatasetReadSchema,
+ existingRowIDs: Option[BiDictionary] = None)
+ (implicit sc: DistributedContext):
+ IndexedDatasetH2O = {
+
+ error("Unimplemented indexedDatasetDFSReadElements.")
+
+ ???
}
/**
- * reads an IndexedDatasetH2O from default text delimited files
+ * Reads an IndexedDatasetH2O from default text delimited files
* @todo unimplemented
* @param src a comma separated list of URIs to read from
* @param schema how the text file is formatted
* @return
*/
def indexedDatasetDFSReadElements(src: String,
- schema: Schema = DefaultIndexedDatasetReadSchema,
- existingRowIDs: Option[BiDictionary] = None)
- (implicit sc: DistributedContext):
- IndexedDatasetH2O = {
- // should log a warning when this is built but no logger here, can an H2O contributor help with this
- println("Warning: unimplemented indexedDatasetDFSReadElements." )
- throw new UnsupportedOperationException("IndexedDatasetH2O is not implemented so can't be read by elements.")
- null.asInstanceOf[IndexedDatasetH2O]
+ schema: Schema = DefaultIndexedDatasetReadSchema,
+ existingRowIDs: Option[BiDictionary] = None)
+ (implicit sc: DistributedContext): IndexedDatasetH2O = {
+
+ error("Unimplemented indexedDatasetDFSReadElements.")
+
+ ???
}
+ /**
+ * Optional engine-specific all reduce tensor operation.
+ *
+ * TODO: implement this please.
+ *
+ */
+ override def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc)
+ : Matrix = ???
+
+ /**
+ * TODO: implement this please.
+ */
+ override def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean): Matrix = ???
+
+ /**
+ * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+ * TODO: implement this please.
+ */
+ override def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean): DrmLike[K] = ???
+
+ /**
+ * TODO: implement this please.
+ */
+ override def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean)
+ : (DrmLike[Int], Option[DrmLike[K]]) = ???
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/logging/package.scala b/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
new file mode 100644
index 0000000..15aa909
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout
+
+import org.apache.log4j.{Level, Priority, Logger}
+
+package object logging {
+
+ /** Compute `expr` if debug is on, only */
+ def debugDo[T](expr: => T)(implicit log: Logger): Option[T] = {
+ if (log.isDebugEnabled) Some(expr)
+ else None
+ }
+
+ /** Compute `expr` if trace is on, only */
+ def traceDo[T](expr: => T)(implicit log: Logger): Option[T] = {
+ if (log.isTraceEnabled) Some(expr) else None
+ }
+
+ /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
+ def debug(msg: => AnyRef)(implicit log: Logger) { if (log.isDebugEnabled) log.debug(msg) }
+
+ def debug(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isDebugEnabled()) log.debug(msg, t) }
+
+ /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
+ def trace(msg: => AnyRef)(implicit log: Logger) { if (log.isTraceEnabled) log.trace(msg) }
+
+ def trace(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isTraceEnabled()) log.trace(msg, t) }
+
+ def info(msg: => AnyRef)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg)}
+
+ def info(msg: => AnyRef, t:Throwable)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg,t)}
+
+ def warn(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) log.warn(msg) }
+
+ def warn(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) log.warn(msg, t) }
+
+ def error(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) log.error(msg) }
+
+ def error(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) log.error(msg, t) }
+
+ def fatal(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg) }
+
+ def fatal(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg, t) }
+
+ def getLog(name: String): Logger = Logger.getLogger(name)
+
+ def getLog(clazz: Class[_]): Logger = Logger.getLogger(clazz)
+
+ def mahoutLog :Logger = getLog("org.apache.mahout")
+
+ def setLogLevel(l:Level)(implicit log:Logger) = {
+ log.setLevel(l)
+ }
+
+ def setAdditivity(a:Boolean)(implicit log:Logger) = log.setAdditivity(a)
+
+}
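For illustration, a minimal usage sketch of the lazy logging helpers above (the object name MyJob and the method expensiveState are hypothetical):

    import org.apache.mahout.logging._

    object MyJob {

      // Declare the logger implicit once; the package-level helpers pick it up.
      private final implicit val log = getLog(MyJob.getClass)

      def run(): Unit = {
        // Message arguments are by-name, so this string is only built when DEBUG is enabled.
        debug(s"state dump: ${expensiveState()}")
        warn("something looks off, proceeding anyway")
      }

      private def expensiveState(): String = "..."
    }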
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
index 7caa3dd..866ee34 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
@@ -18,6 +18,7 @@
package org.apache.mahout.math.decompositions
import scala.reflect.ClassTag
+import org.apache.mahout.logging._
import org.apache.mahout.math.Matrix
import org.apache.mahout.math.scalabindings._
import RLikeOps._
@@ -27,7 +28,7 @@ import org.apache.log4j.Logger
object DQR {
- private val log = Logger.getLogger(DQR.getClass)
+ private final implicit val log = getLog(DQR.getClass)
/**
* Distributed _thin_ QR. A'A must fit in memory, i.e. if A is m x n, then n should be pretty
@@ -41,19 +42,19 @@ object DQR {
def dqrThin[K: ClassTag](drmA: DrmLike[K], checkRankDeficiency: Boolean = true): (DrmLike[K], Matrix) = {
if (drmA.ncol > 5000)
- log.warn("A is too fat. A'A must fit in memory and easily broadcasted.")
+ warn("A is too fat. A'A must fit in memory and easily broadcasted.")
implicit val ctx = drmA.context
val AtA = (drmA.t %*% drmA).checkpoint()
val inCoreAtA = AtA.collect
- if (log.isDebugEnabled) log.debug("A'A=\n%s\n".format(inCoreAtA))
+ trace("A'A=\n%s\n".format(inCoreAtA))
val ch = chol(inCoreAtA)
val inCoreR = (ch.getL cloned) t
- if (log.isDebugEnabled) log.debug("R=\n%s\n".format(inCoreR))
+ trace("R=\n%s\n".format(inCoreR))
if (checkRankDeficiency && !ch.isPositiveDefinite)
throw new IllegalArgumentException("R is rank-deficient.")
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
index 1abfb87..cecaec8 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
@@ -7,9 +7,12 @@ import RLikeOps._
import org.apache.mahout.math.drm._
import RLikeDrmOps._
import org.apache.mahout.common.RandomUtils
+import org.apache.mahout.logging._
object DSSVD {
+ private final implicit val log = getLog(DSSVD.getClass)
+
/**
* Distributed Stochastic Singular Value decomposition algorithm.
*
@@ -43,18 +46,22 @@ object DSSVD {
case (keys, blockA) =>
val blockY = blockA %*% Matrices.symmetricUniformView(n, r, omegaSeed)
keys -> blockY
- }
+ }.checkpoint()
- var drmQ = dqrThin(drmY.checkpoint())._1
+ var drmQ = dqrThin(drmY)._1
// Checkpoint Q if last iteration
if (q == 0) drmQ = drmQ.checkpoint()
+ trace(s"dssvd:drmQ=${drmQ.collect}.")
+
// This actually should be optimized as identically partitioned map-side A'B since A and Q should
// still be identically partitioned.
var drmBt = drmAcp.t %*% drmQ
// Checkpoint B' if last iteration
if (q == 0) drmBt = drmBt.checkpoint()
+ trace(s"dssvd:drmB'=${drmBt.collect}.")
+
for (i <- 0 until q) {
drmY = drmAcp %*% drmBt
drmQ = dqrThin(drmY.checkpoint())._1
@@ -62,13 +69,17 @@ object DSSVD {
if (i == q - 1) drmQ = drmQ.checkpoint()
// This on the other hand should be inner-join-and-map A'B optimization since A and Q_i are not
- // identically partitioned anymore.
+ // identically partitioned anymore.
drmBt = drmAcp.t %*% drmQ
// Checkpoint B' if last iteration
if (i == q - 1) drmBt = drmBt.checkpoint()
}
- val (inCoreUHat, d) = eigen(drmBt.t %*% drmBt)
+ val mxBBt:Matrix = drmBt.t %*% drmBt
+
+ trace(s"dssvd: BB'=$mxBBt.")
+
+ val (inCoreUHat, d) = eigen(mxBBt)
val s = d.sqrt
// Since neither drmU nor drmV are actually computed until actually used, we don't need the flags
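For reference, a hedged sketch of invoking the decomposition above, assuming the usual dssvd(drmA, k, p, q) entry point (k = decomposition rank, p = oversampling, q = power iterations; drmA is illustrative):

    // U and V come back as lazily evaluated DRMs; s holds the k singular values.
    val (drmU, drmV, s) = dssvd(drmA, k = 10, p = 15, q = 1)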
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
index 80385a3..e1b2f03 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
@@ -150,7 +150,7 @@ private[math] object SSVD {
val c = s_q cross s_b
// BB' computation becomes
- val bbt = bt.t %*% bt -c - c.t + (s_q cross s_q) * (xi dot xi)
+ val bbt = bt.t %*% bt - c - c.t + (s_q cross s_q) * (xi dot xi)
val (uhat, d) = eigen(bbt)
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
index 850614457..b86e286 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
@@ -18,6 +18,7 @@
package org.apache.mahout.math.drm
/** Broadcast variable abstraction */
-trait BCast[T] {
+trait BCast[T] extends java.io.Closeable {
def value:T
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
index 8c3911f..c43c6c7 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
@@ -20,6 +20,7 @@ package org.apache.mahout.math.drm
import scala.reflect.ClassTag
import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings.RLikeOps._
/**
* Additional experimental operations over CheckpointedDRM implementation. I will possibly move them up to
@@ -38,6 +39,12 @@ class CheckpointedOps[K: ClassTag](val drm: CheckpointedDrm[K]) {
/** Column Means */
def colMeans(): Vector = drm.context.colMeans(drm)
+ /** Optional engine-specific all reduce tensor operation. */
+ def allreduceBlock(bmf: BlockMapFunc2[K], rf: BlockReduceFunc = _ += _): Matrix =
+ drm.context.allreduceBlock(drm, bmf, rf)
+
def norm():Double = drm.context.norm(drm)
}
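A minimal sketch of what the new allreduceBlock affords (drmA is illustrative): each vertical block contributes a partial Gramian, and the default reduce function (_ += _) sums the partial products on the front end, yielding an in-core A'A.

    val inCoreAtA: Matrix = drmA.allreduceBlock { case (keys, block) =>
      // Partial product for this block; summed across blocks by the reduce function.
      block.t %*% block
    }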
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
index bb6772a..519a127 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
@@ -19,16 +19,15 @@ package org.apache.mahout.math.drm
import org.apache.mahout.math.indexeddataset._
-import scala.reflect.ClassTag
import logical._
import org.apache.mahout.math._
import scalabindings._
import RLikeOps._
-import RLikeDrmOps._
import DistributedEngine._
-import org.apache.mahout.math.scalabindings._
import org.apache.log4j.Logger
+import scala.reflect.ClassTag
+
/** Abstraction of optimizer/distributed engine */
trait DistributedEngine {
@@ -37,7 +36,7 @@ trait DistributedEngine {
* introduce logical constructs (including engine-specific ones) that user DSL cannot even produce
* per se.
* <P>
- *
+ *
* A particular physical engine implementation may choose to either use the default rewrites or
* build its own rewriting rules.
* <P>
@@ -50,6 +49,9 @@ trait DistributedEngine {
/** Engine-specific colSums implementation based on a checkpoint. */
def colSums[K: ClassTag](drm: CheckpointedDrm[K]): Vector
+ /** Optional engine-specific all reduce tensor operation. */
+ def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc): Matrix
+
/** Engine-specific numNonZeroElementsPerColumn implementation based on a checkpoint. */
def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector
@@ -73,20 +75,39 @@ trait DistributedEngine {
def drmDfsRead(path: String, parMin: Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_]
/** Parallelize in-core matrix as spark distributed matrix, using row ordinal indices as data set keys. */
- def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext): CheckpointedDrm[Int]
+ def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
+ CheckpointedDrm[Int]
/** Parallelize in-core matrix as spark distributed matrix, using row labels as a data set keys. */
- def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
- (implicit sc: DistributedContext): CheckpointedDrm[String]
+ def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
+ CheckpointedDrm[String]
/** This creates an empty DRM with specified number of partitions and cardinality. */
- def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
- (implicit sc: DistributedContext): CheckpointedDrm[Int]
+ def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
+ CheckpointedDrm[Int]
/** Creates empty DRM with non-trivial height */
- def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
- (implicit sc: DistributedContext): CheckpointedDrm[Long]
+ def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
+ CheckpointedDrm[Long]
+
+ /**
+ * Convert a non-int-keyed matrix to an int-keyed one, optionally computing the mapping from old
+ * keys to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
+ */
+ def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]])
+
+ /**
+ * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+ * @param drmX
+ * @param fraction
+ * @param replacement
+ * @tparam K
+ * @return
+ */
+ def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K]
+
+ def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples:Int, replacement:Boolean = false) : Matrix
+
/**
* Load IndexedDataset from text delimited format.
* @param src comma delimited URIs to read from
@@ -119,38 +140,49 @@ object DistributedEngine {
private def pass1[K: ClassTag](action: DrmLike[K]): DrmLike[K] = {
action match {
- case OpAB(OpAt(a), b) if (a == b) => OpAtA(pass1(a))
- case OpABAnyKey(OpAtAnyKey(a), b) if (a == b) => OpAtA(pass1(a))
+
+ // self element-wise rewrite
+ case OpAewB(a, b, op) if (a == b) => {
+ op match {
+ case "*" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x * x)
+ case "/" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x / x)
+ // Self "+" and "-" don't make a lot of sense, but we do include it for completeness.
+ case "+" ⇒ OpAewUnaryFunc(pass1(a), 2.0 * _)
+ case "-" ⇒ OpAewUnaryFunc(pass1(a), (_) ⇒ 0.0)
+ case _ ⇒
+ require(false, s"Unsupported operator $op")
+ null
+ }
+ }
+ case OpAB(OpAt(a), b) if (a == b) ⇒ OpAtA(pass1(a))
+ case OpABAnyKey(OpAtAnyKey(a), b) if (a == b) ⇒ OpAtA(pass1(a))
// For now, rewrite left-multiply via transpositions, i.e.
// inCoreA %*% B = (B' %*% inCoreA')'
- case op@OpTimesLeftMatrix(a, b) =>
- OpAt(OpTimesRightMatrix(A = OpAt(pass1(b)), right = a.t))
+ case op@OpTimesLeftMatrix(a, b) ⇒
+ OpAt(OpTimesRightMatrix(A = OpAt(pass1(b)), right = a.t))
// Add vertical row index concatenation for rbind() on DrmLike[Int] fragments
- case op@OpRbind(a, b) if (implicitly[ClassTag[K]] == ClassTag.Int) =>
+ case op@OpRbind(a, b) if (implicitly[ClassTag[K]] == ClassTag.Int) ⇒
// Make sure closure sees only local vals, not attributes. We need to do these ugly casts
// around because compiler could not infer that K is the same as Int, based on if() above.
val ma = safeToNonNegInt(a.nrow)
- val bAdjusted = new OpMapBlock[Int, Int](
- A = pass1(b.asInstanceOf[DrmLike[Int]]),
- bmf = {
- case (keys, block) => keys.map(_ + ma) -> block
- },
- identicallyPartitioned = false
- )
+ val bAdjusted = new OpMapBlock[Int, Int](A = pass1(b.asInstanceOf[DrmLike[Int]]), bmf = {
+ case (keys, block) ⇒ keys.map(_ + ma) → block
+ }, identicallyPartitioned = false)
val aAdjusted = a.asInstanceOf[DrmLike[Int]]
OpRbind(pass1(aAdjusted), bAdjusted).asInstanceOf[DrmLike[K]]
// Stop at checkpoints
- case cd: CheckpointedDrm[_] => action
+ case cd: CheckpointedDrm[_] ⇒ action
// For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] =>
+ case uop: AbstractUnaryOp[_, K] ⇒
uop.A = pass1(uop.A)(uop.classTagA)
uop
- case bop: AbstractBinaryOp[_, _, K] =>
+
+ case bop: AbstractBinaryOp[_, _, K] ⇒
bop.A = pass1(bop.A)(bop.classTagA)
bop.B = pass1(bop.B)(bop.classTagB)
bop
@@ -160,17 +192,30 @@ object DistributedEngine {
/** This would remove stuff like A.t.t that previous step may have created */
private def pass2[K: ClassTag](action: DrmLike[K]): DrmLike[K] = {
action match {
+
+ // Fusion of unary funcs into single, like 1 + x * x.
+ // Since we repeat the pass over self after the rewrite, we don't need to descend into the
+ // arguments recursively here.
+ case op1@OpAewUnaryFunc(op2@OpAewUnaryFunc(a, _, _), _, _) ⇒
+ pass2(OpAewUnaryFuncFusion(a, op1 :: op2 :: Nil))
+
+ // Fusion one step further, like 1 + 2 * x * x. All should be rewritten as one UnaryFuncFusion.
+ // Since we repeat the pass over self after the rewrite, we don't need to descend into the
+ // arguments recursively here.
+ case op@OpAewUnaryFuncFusion(op2@OpAewUnaryFunc(a, _, _), _) ⇒
+ pass2(OpAewUnaryFuncFusion(a, op.ff :+ op2))
+
// A.t.t => A
- case OpAt(top@OpAt(a)) => pass2(a)(top.classTagA)
+ case OpAt(top@OpAt(a)) ⇒ pass2(a)(top.classTagA)
// Stop at checkpoints
- case cd: CheckpointedDrm[_] => action
+ case cd: CheckpointedDrm[_] ⇒ action
// For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] =>
+ case uop: AbstractUnaryOp[_, K] ⇒
uop.A = pass2(uop.A)(uop.classTagA)
uop
- case bop: AbstractBinaryOp[_, _, K] =>
+ case bop: AbstractBinaryOp[_, _, K] ⇒
bop.A = pass2(bop.A)(bop.classTagA)
bop.B = pass2(bop.B)(bop.classTagB)
bop
@@ -182,29 +227,29 @@ object DistributedEngine {
action match {
// matrix products.
- case OpAB(a, OpAt(b)) => OpABt(pass3(a), pass3(b))
+ case OpAB(a, OpAt(b)) ⇒ OpABt(pass3(a), pass3(b))
// AtB cases that make sense.
- case OpAB(OpAt(a), b) if (a.partitioningTag == b.partitioningTag) => OpAtB(pass3(a), pass3(b))
- case OpABAnyKey(OpAtAnyKey(a), b) => OpAtB(pass3(a), pass3(b))
+ case OpAB(OpAt(a), b) if (a.partitioningTag == b.partitioningTag) ⇒ OpAtB(pass3(a), pass3(b))
+ case OpABAnyKey(OpAtAnyKey(a), b) ⇒ OpAtB(pass3(a), pass3(b))
// Need some cost to choose between the following.
- case OpAB(OpAt(a), b) => OpAtB(pass3(a), pass3(b))
+ case OpAB(OpAt(a), b) ⇒ OpAtB(pass3(a), pass3(b))
// case OpAB(OpAt(a), b) => OpAt(OpABt(OpAt(pass1(b)), pass1(a)))
- case OpAB(a, b) => OpABt(pass3(a), OpAt(pass3(b)))
+ case OpAB(a, b) ⇒ OpABt(pass3(a), OpAt(pass3(b)))
// Rewrite A'x
- case op@OpAx(op1@OpAt(a), x) => OpAtx(pass3(a)(op1.classTagA), x)
+ case op@OpAx(op1@OpAt(a), x) ⇒ OpAtx(pass3(a)(op1.classTagA), x)
// Stop at checkpoints
- case cd: CheckpointedDrm[_] => action
+ case cd: CheckpointedDrm[_] ⇒ action
// For everything else we just pass-thru the operator arguments to optimizer
- case uop: AbstractUnaryOp[_, K] =>
+ case uop: AbstractUnaryOp[_, K] ⇒
uop.A = pass3(uop.A)(uop.classTagA)
uop
- case bop: AbstractBinaryOp[_, _, K] =>
+ case bop: AbstractBinaryOp[_, _, K] ⇒
bop.A = pass3(bop.A)(bop.classTagA)
bop.B = pass3(bop.B)(bop.classTagB)
bop
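To make the new self elementwise rewrite in pass1 concrete, a sketch of its effect (drmA is illustrative):

    val drmSq = drmA * drmA   // logical plan: OpAewB(A, A, "*")
    drmSq.checkpoint()        // rewritten to OpAewUnaryFunc(A, x => x * x),
                              // avoiding a physical join of A with itself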
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
index e5cf563..96ef893 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
@@ -18,7 +18,11 @@
package org.apache.mahout.math.drm
import RLikeDrmOps._
-import scala.reflect.ClassTag
+import org.apache.mahout.math._
+import org.apache.mahout.math.drm.logical.OpCbindScalar
+import scalabindings._
+import RLikeOps._
+import reflect.ClassTag
class DrmDoubleScalarOps(val x:Double) extends AnyVal{
@@ -30,4 +34,6 @@ class DrmDoubleScalarOps(val x:Double) extends AnyVal{
def /[K:ClassTag](that:DrmLike[K]) = x /: that
+ def cbind[K: ClassTag](that: DrmLike[K]) = OpCbindScalar(A = that, x = x, leftBind = true)
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
index bc937d6..19432d0 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
@@ -49,7 +49,7 @@ class DrmLikeOps[K: ClassTag](protected[drm] val drm: DrmLike[K]) {
* is applied.
*/
def par(min: Int = -1, exact: Int = -1, auto: Boolean = false) = {
- assert(min >= 0 || exact >= 0 || auto, "Invalid argument")
+ require(min > 0 || exact > 0 || auto, "Invalid argument")
OpPar(drm, minSplits = min, exactSplits = exact)
}
@@ -65,16 +65,15 @@ class DrmLikeOps[K: ClassTag](protected[drm] val drm: DrmLike[K]) {
* @tparam R
* @return
*/
- def mapBlock[R: ClassTag](ncol: Int = -1, identicallyParitioned: Boolean = true)
+ def mapBlock[R: ClassTag](ncol: Int = -1, identicallyPartitioned: Boolean = true)
(bmf: BlockMapFunc[K, R]): DrmLike[R] =
new OpMapBlock[K, R](
A = drm,
bmf = bmf,
_ncol = ncol,
- identicallyPartitioned = identicallyParitioned
+ identicallyPartitioned = identicallyPartitioned
)
-
/**
* Slicing the DRM. Should eventually work just like in-core drm (e.g. A(0 until 5, 5 until 15)).<P>
*
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
index 380f4eb..7927e51 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
@@ -18,12 +18,17 @@
package org.apache.mahout.math.drm
import scala.reflect.ClassTag
+import collection._
+import JavaConversions._
import org.apache.mahout.math.{Vector, Matrix}
import org.apache.mahout.math.drm.logical._
+import org.apache.mahout.math.scalabindings._
+import RLikeOps._
class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
import RLikeDrmOps._
+ import org.apache.mahout.math.scalabindings._
def +(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "+")
@@ -33,21 +38,23 @@ class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
def /(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "/")
- def +(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "+")
+ def +(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ + that, evalZeros = true)
- def +:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "+")
+ def +:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that + _, evalZeros = true)
- def -(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "-")
+ def -(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ - that, evalZeros = true)
- def -:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "-:")
+ def -:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that - _, evalZeros = true)
- def *(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "*")
+ def *(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ * that)
- def *:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "*")
+ def *:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that * _)
- def /(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "/")
+ def ^(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = math.pow(_, that))
- def /:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "/:")
+ def /(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ / that, evalZeros = that == 0.0)
+
+ def /:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that / _, evalZeros = true)
def :%*%(that: DrmLike[Int]): DrmLike[K] = OpAB[K](A = this.drm, B = that)
@@ -65,18 +72,36 @@ class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
def t: DrmLike[Int] = OpAtAnyKey(A = drm)
- def cbind(that: DrmLike[K]) = OpCbind(A = this.drm, B = that)
+ def cbind(that: DrmLike[K]): DrmLike[K] = OpCbind(A = this.drm, B = that)
+
+ def cbind(that: Double): DrmLike[K] = OpCbindScalar(A = this.drm, x = that, leftBind = false)
+
+ def rbind(that: DrmLike[K]): DrmLike[K] = OpRbind(A = this.drm, B = that)
- def rbind(that: DrmLike[K]) = OpRbind(A = this.drm, B = that)
+ /**
+ * `rowSums` method for non-int keyed matrices.
+ *
+ * A slight problem here is a limitation of the in-memory representation of Colt's Matrix, which
+ * can only have String row labels. Therefore, internally we call ".toString()" on each key
+ * object and put the result into the [[Matrix]] row label bindings, at which point the keys are
+ * coerced to Strings.
+ *
+ * This is obviously suboptimal behavior, so enhancing `collect` to avoid it remains a TODO.
+ *
+ * @return map of row keys into row sums, front-end collected.
+ */
+ def rowSumsMap(): Map[String, Double] = {
+ val m = drm.mapBlock(ncol = 1) { case (keys, block) =>
+ keys -> dense(block.rowSums).t
+ }.collect
+ m.getRowLabelBindings.map { case (key, idx) => key -> m(idx, 0)}
+ }
}
class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
import org.apache.mahout.math._
import scalabindings._
- import RLikeOps._
import RLikeDrmOps._
- import scala.collection.JavaConversions._
override def t: DrmLike[Int] = OpAt(A = drm)
@@ -108,7 +133,7 @@ class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
// Collect block-wise row means and output them as one-column matrix.
keys -> dense(block.rowMeans).t
}
- .collect(::, 0)
+ .collect(::, 0)
}
/** Return diagonal vector */
@@ -117,14 +142,14 @@ class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
drm.mapBlock(ncol = 1) { case (keys, block) =>
keys -> dense(for (r <- block.view) yield r(keys(r.index))).t
}
- .collect(::, 0)
+ .collect(::, 0)
}
}
object RLikeDrmOps {
- implicit def double2ScalarOps(x:Double) = new DrmDoubleScalarOps(x)
+ implicit def double2ScalarOps(x: Double) = new DrmDoubleScalarOps(x)
implicit def drmInt2RLikeOps(drm: DrmLike[Int]): RLikeDrmIntOps = new RLikeDrmIntOps(drm)
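Taken together, a short sketch of the operators introduced above (drmA: DrmLike[String] is illustrative):

    val drmScaled = drmA * 2.0 + 1.0   // each op is now an OpAewUnaryFunc, fusable by the optimizer
    val drmBiased = 1.0 cbind drmA     // prepend a constant column via OpCbindScalar
    val sums: Map[String, Double] = drmA.rowSumsMap()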
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
index a445f21..60b2c77 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
@@ -24,7 +24,7 @@ import org.apache.mahout.math.drm.{DistributedContext, DrmLike}
abstract class AbstractUnaryOp[A: ClassTag, K: ClassTag]
extends CheckpointAction[K] with DrmLike[K] {
- protected[drm] var A: DrmLike[A]
+ protected[mahout] var A: DrmLike[A]
lazy val context: DistributedContext = A.context
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
index aa3a3b9..a7934a3 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
@@ -37,7 +37,8 @@ abstract class CheckpointAction[K: ClassTag] extends DrmLike[K] {
*/
def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = cp match {
case None =>
- val physPlan = context.toPhysical(context.optimizerRewrite(this), cacheHint)
+ val plan = context.optimizerRewrite(this)
+ val physPlan = context.toPhysical(plan, cacheHint)
cp = Some(physPlan)
physPlan
case Some(cp) => cp
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
index 19a910c..dbcb366 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
@@ -21,7 +21,11 @@ import scala.reflect.ClassTag
import org.apache.mahout.math.drm.DrmLike
import scala.util.Random
-/** Operator denoting expressions like 5.0 - A or A * 5.6 */
+/**
+ * Operator denoting expressions like 5.0 - A or A * 5.6
+ *
+ * @deprecated use [[OpAewUnaryFunc]] instead
+ */
case class OpAewScalar[K: ClassTag](
override var A: DrmLike[K],
val scalar: Double,
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
new file mode 100644
index 0000000..71489ab
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+import scala.util.Random
+
+/**
+ * @author dmitriy
+ */
+case class OpAewUnaryFunc[K: ClassTag](
+ override var A: DrmLike[K],
+ val f: (Double) => Double,
+ val evalZeros:Boolean = false
+ ) extends AbstractUnaryOp[K,K] with TEwFunc {
+
+ override protected[mahout] lazy val partitioningTag: Long =
+ if (A.canHaveMissingRows)
+ Random.nextLong()
+ else A.partitioningTag
+
+ /** Stuff like `A + 1` is always supposed to fix this, so the result has no missing rows. */
+ override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+ /** R-like syntax for number of rows. */
+ def nrow: Long = A.nrow
+
+ /** R-like syntax for number of columns */
+ def ncol: Int = A.ncol
+}
+
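A brief note on evalZeros, sketched with a hypothetical operand drmA: functions that move zero (such as _ + 1.0) must be evaluated at structural zeros, while zero-preserving functions (such as _ * 5.0) need not be, which lets sparse inputs stay sparse.

    val shifted = OpAewUnaryFunc(drmA, _ + 1.0, evalZeros = true)   // 0 + 1 != 0
    val scaled  = OpAewUnaryFunc(drmA, _ * 5.0)                     // 0 * 5 == 0, default is fine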
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
new file mode 100644
index 0000000..ed95f4f
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+import scala.util.Random
+import collection._
+
+/**
+ * Composition of unary elementwise functions.
+ */
+case class OpAewUnaryFuncFusion[K: ClassTag](
+ override var A: DrmLike[K],
+ var ff:List[OpAewUnaryFunc[K]] = Nil
+ ) extends AbstractUnaryOp[K,K] with TEwFunc {
+
+ override protected[mahout] lazy val partitioningTag: Long =
+ if (A.canHaveMissingRows)
+ Random.nextLong()
+ else A.partitioningTag
+
+ /** Stuff like `A + 1` is always supposed to fix this, so the result has no missing rows. */
+ override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+ /** R-like syntax for number of rows. */
+ def nrow: Long = A.nrow
+
+ /** R-like syntax for number of columns */
+ def ncol: Int = A.ncol
+
+ /** Apply to degenerate elements? */
+ override def evalZeros: Boolean = ff.exists(_.evalZeros)
+
+ /** the function itself */
+ override def f: (Double) => Double = {
+
+ // Make sure composed collection becomes an attribute of this closure because we will be sending
+ // it to the backend.
+ val composedFunc = ff.map(_.f)
+
+ // Create functional closure and return.
+ (x: Double) => (composedFunc :\ x) { case (f, xarg) => f(xarg)}
+
+ }
+}
+
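To see what the foldRight in f computes: for ff = List(f1, f2) the composition is f1(f2(x)), i.e. the innermost operator of the fused expression applies first. A self-contained check:

    val ff = List((x: Double) => 1 + x, (x: Double) => 2 * x)
    val fused = (x: Double) => (ff :\ x) { case (g, xarg) => g(xarg) }
    assert(fused(3.0) == 7.0)   // 1 + (2 * 3)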
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
index 1425264..0598551 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
@@ -17,7 +17,7 @@
package org.apache.mahout.math.drm.logical
-import scala.reflect.ClassTag
+import reflect.ClassTag
import org.apache.mahout.math.drm.DrmLike
import scala.util.Random
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
new file mode 100644
index 0000000..5aee518
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.math.drm.logical
+
+import reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+
+case class OpCbindScalar[K:ClassTag](
+ override var A:DrmLike[K],
+ var x:Double,
+ val leftBind:Boolean ) extends AbstractUnaryOp[K,K] {
+
+ override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+ override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
+
+ /** R-like syntax for number of rows. */
+ def nrow: Long = A.nrow
+
+ /** R-like syntax for number of columns */
+ def ncol: Int = A.ncol + 1
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
index 7299d9e..a1cd718 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
@@ -23,7 +23,7 @@ import RLikeOps._
import org.apache.mahout.math.drm.{BlockMapFunc, DrmLike}
import scala.util.Random
-class OpMapBlock[S: ClassTag, R: ClassTag](
+case class OpMapBlock[S: ClassTag, R: ClassTag](
override var A: DrmLike[S],
val bmf: BlockMapFunc[S, R],
val _ncol: Int = -1,
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
new file mode 100644
index 0000000..0eb5f65
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+/**
+ * Trait denoting logical operators providing elementwise operations that work as unary operators
+ * on each element of a matrix.
+ */
+trait TEwFunc {
+
+ /** Apply to degenerate elements? */
+ def evalZeros: Boolean
+
+ /** the function itself */
+ def f: (Double) => Double
+
+ /**
+ * Is self-assignment OK? If yes, it may cause side effects when working off a non-serialized,
+ * cached object tree!
+ */
+ def selfAssignOk: Boolean = false
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
index 1fae831..d865b58 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
@@ -23,6 +23,8 @@ import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.scalabindings._
import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.logical.OpAewUnaryFunc
+import collection._
package object drm {
@@ -34,7 +36,11 @@ package object drm {
/** Block-map func */
- type BlockMapFunc[S, R] = BlockifiedDrmTuple[S] => BlockifiedDrmTuple[R]
+ type BlockMapFunc[S, R] = BlockifiedDrmTuple[S] ⇒ BlockifiedDrmTuple[R]
+
+ type BlockMapFunc2[S] = BlockifiedDrmTuple[S] ⇒ Matrix
+
+ type BlockReduceFunc = (Matrix, Matrix) ⇒ Matrix
/** CacheHint type */
// type CacheHint = CacheHint.CacheHint
@@ -92,7 +98,7 @@ package object drm {
implicit def drm2InCore[K: ClassTag](drm: DrmLike[K]): Matrix = drm.collect
/** Do vertical concatenation of collection of blockified tuples */
- def rbind[K: ClassTag](blocks: Iterable[BlockifiedDrmTuple[K]]): BlockifiedDrmTuple[K] = {
+ private[mahout] def rbind[K: ClassTag](blocks: Iterable[BlockifiedDrmTuple[K]]): BlockifiedDrmTuple[K] = {
assert(blocks.nonEmpty, "rbind: 0 blocks passed in")
if (blocks.size == 1) {
// No coalescing required.
@@ -115,6 +121,46 @@ package object drm {
}
}
+ /**
+ * Convert an arbitrarily-keyed matrix to an int-keyed matrix. Some algebra accepts only
+ * int-keyed row matrices, so this method helps with the conversion.
+ *
+ * @param drmX input to be transcoded
+ * @param computeMap collect `old key -> int key` map to front-end?
+ * @tparam K key type
+ * @return Sequentially keyed matrix + (optionally) map from non-int key to [[Int]] key. If the
+ * key type is actually Int, then we just return the argument with None for the map,
+ * regardless of the computeMap parameter.
+ */
+ def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) =
+ drmX.context.engine.drm2IntKeyed(drmX, computeMap)
+
+ /**
+ * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+ * @param drmX
+ * @param fraction
+ * @param replacement
+ * @tparam K
+ * @return samples
+ */
+ def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K] =
+ drmX.context.engine.drmSampleRows(drmX, fraction, replacement)
+
+ def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean = false): Matrix =
+ drmX.context.engine.drmSampleKRows(drmX, numSamples, replacement)
+
+ ///////////////////////////////////////////////////////////
+ // Elementwise unary functions on distributed operands.
+ def dexp[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.exp, true)
+
+ def dlog[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.log, true)
+
+ def dabs[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.abs)
+
+ def dsqrt[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.sqrt)
+
+ def dsignum[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.signum)
+
}
package object indexeddataset {
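A quick sketch tying the new package-level helpers together (drmA is illustrative):

    val drmLogA = dlog(drmA + 1.0)                        // elementwise log(x + 1)
    val drmThinned = drmSampleRows(drmA, fraction = 0.1)  // Spark-consistent row sampling
    val mxHead: Matrix = drmSampleKRows(drmA, numSamples = 20)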
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala
deleted file mode 100644
index 9fdd6e5..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math._
-
-class DoubleScalarOps(val x:Double) extends AnyVal{
-
- import RLikeOps._
-
- def +(that:Matrix) = that + x
-
- def +(that:Vector) = that + x
-
- def *(that:Matrix) = that * x
-
- def *(that:Vector) = that * x
-
- def -(that:Matrix) = x -: that
-
- def -(that:Vector) = x -: that
-
- def /(that:Matrix) = x /: that
-
- def /(that:Vector) = x /: that
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
new file mode 100644
index 0000000..d0fd393
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.scalabindings
+
+import org.apache.mahout.math._
+import org.apache.mahout.math.flavor.{BackEnum, TraversingStructureEnum}
+import org.apache.mahout.math.function.Functions
+import RLikeOps._
+import org.apache.mahout.logging._
+
+import scala.collection.JavaConversions._
+import scala.collection._
+
+object MMul extends MMBinaryFunc {
+
+ private final implicit val log = getLog(MMul.getClass)
+
+ override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = {
+
+ require(a.ncol == b.nrow, "Incompatible matrix sizes in matrix multiplication.")
+
+ val (af, bf) = (a.getFlavor, b.getFlavor)
+ val backs = (af.getBacking, bf.getBacking)
+ val sd = (af.getStructure, af.isDense, bf.getStructure, bf.isDense)
+
+ val alg: MMulAlg = backs match {
+
+ // Both operands are jvm memory backs.
+ case (BackEnum.JVMMEM, BackEnum.JVMMEM) ⇒
+
+ sd match {
+
+ // Cases of multiplication by a diagonal matrix.
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.COLWISE, _) if (a
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagCW _
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSECOLWISE, _) if (a
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagCW _
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.ROWWISE, _) if (a
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagRW _
+ case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSEROWWISE, _) if (a
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagRW _
+
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmCWDiag _
+ case (TraversingStructureEnum.SPARSECOLWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmCWDiag _
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmRWDiag _
+ case (TraversingStructureEnum.SPARSEROWWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+ .isInstanceOf[DiagonalMatrix]) ⇒ jvmRWDiag _
+
+ // Dense-dense cases
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if (a eq b.t) ⇒ jvmDRWAAt _
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if (a.t eq b) ⇒ jvmDRWAAt _
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmRWCW
+ case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmRWRW
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmCWCW
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if ( a eq b.t) ⇒ jvmDCWAAt _
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if ( a.t eq b) ⇒ jvmDCWAAt _
+ case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmCWRW
+
+ // Sparse row matrix x sparse row matrix (array of vectors)
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseCWCW
+
+ // Sparse matrix x sparse matrix (hashtable of vectors)
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+ jvmSparseRowRWRW
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+ jvmSparseRowRWCW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+ jvmSparseRowCWRW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+ jvmSparseRowCWCW
+
+ // Sparse matrix x matrix of unlike structure
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowRWRW
+ case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRowRWCW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowCWRW
+ case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRowCWCW
+
+ // Everything else with at least one sparse LHS or RHS argument
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRWRW
+ case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRWCW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseCWRW
+ case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW2flips
+
+ // Sparse methods are only effective if the first argument is sparse, so we need to do a swap.
+ case (_, _, _, false) ⇒ { (a, b, r) ⇒ apply(b.t, a.t, r.map {_.t}).t }
+
+ // Default jvm-jvm case.
+ case _ ⇒ jvmRWCW
+ }
+ }
+
+ alg(a, b, r)
+ }
+
+ type MMulAlg = MMBinaryFunc
+
+ @inline
+ private def jvmRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+ require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == b.ncol))
+ val (m, n) = (a.nrow, b.ncol)
+
+ val mxR = r.getOrElse(if (a.getFlavor.isDense) a.like(m, n) else b.like(m, n))
+
+ for (row ← 0 until mxR.nrow; col ← 0 until mxR.ncol) {
+ // Vector-vector dot product; assumed to be optimized by the underlying vector implementations.
+ mxR(row, col) = a(row, ::) dot b(::, col)
+ }
+ mxR
+ }
+
+
+ @inline
+ private def jvmRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+ // A bit hackish: this currently relies on like() producing a row-wise matrix.
+ val bclone = b.like(b.ncol, b.nrow).t
+ for (brow ← b) bclone(brow.index(), ::) := brow
+
+ require(bclone.getFlavor.getStructure == TraversingStructureEnum.COLWISE || bclone.getFlavor.getStructure ==
+ TraversingStructureEnum.SPARSECOLWISE, "Column-wise conversion assumption for the RHS does not hold; revisit this code.")
+
+ jvmRWCW(a, bclone, r)
+ }
+
+ private def jvmCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ jvmRWRW(b.t, a.t, r.map(_.t)).t
+ }
+
+ private def jvmCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ // This is a primary contender with the outer-product sum algorithm.
+ // Here, we force-reorient both matrices and eventually run RWCW.
+ // A bit hackish: this currently relies on cloned always producing a row-wise matrix.
+ val aclone = a.cloned
+
+ require(aclone.getFlavor.getStructure == TraversingStructureEnum.ROWWISE || aclone.getFlavor.getStructure ==
+ TraversingStructureEnum.SPARSEROWWISE, "Row-wise conversion assumption for the LHS does not hold; revisit this code.")
+
+ jvmRWRW(aclone, b, r)
+ }
+
+ private def jvmSparseRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
+
+ // This is essentially the algorithm from SparseMatrix.times.
+ for (arow ← a; ael ← arow.nonZeroes)
+ mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
+
+ mxR
+ }
+
+ private def jvmSparseRowRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
+ for (arow ← a.iterateNonEmpty(); ael ← arow.vector.nonZeroes)
+ mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
+
+ mxR
+ }
+
+ private def jvmSparseRowCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRowRWRW(b.t, a.t, r.map(_.t)).t
+
+ private def jvmSparseRowCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRowRWRW(a cloned, b cloned, r)
+
+ private def jvmSparseRowRWCW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+ jvmSparseRowRWRW(a, b cloned, r)
+
+
+ private def jvmSparseRowCWRW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+ jvmSparseRowRWRW(a cloned, b, r)
+
+ private def jvmSparseRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRWRW(a, b.cloned, r)
+
+ private def jvmSparseCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRWRW(a cloned, b, r)
+
+ private def jvmSparseCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRWRW(b.t, a.t, r.map(_.t)).t
+
+ private def jvmSparseCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+ jvmSparseRWRW(a cloned, b cloned, r)
+
+ private def jvmDiagRW(diagm:Matrix, b:Matrix, r:Option[Matrix] = None):Matrix = {
+ val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+
+ for (del ← diagm.diagv.nonZeroes())
+ mxR(del.index, ::).assign(b(del.index, ::), Functions.plusMult(del))
+
+ mxR
+ }
+
+ private def jvmDiagCW(diagm: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+ val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+ for (bcol ← b.t) mxR(::, bcol.index()) := bcol * diagm.diagv
+ mxR
+ }
+
+ private def jvmCWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+ jvmDiagRW(diagm, a.t, r.map {_.t}).t
+
+ private def jvmRWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+ jvmDiagCW(diagm, a.t, r.map {_.t}).t
+
+
+ /** Dense column-wise AA' */
+ private def jvmDCWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+ // a.t must be equivalent to b. Cloning must rewrite a to row-wise.
+ jvmDRWAAt(a.cloned, null, r)
+ }
+
+ /** Dense Row-wise AA' */
+ private def jvmDRWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+ // a.t must be equivalent to b.
+
+ debug("AAt computation detected.")
+
+ // Check dimensions if result is supplied.
+ require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == a.nrow))
+
+ val mxR = r.getOrElse(a.like(a.nrow, a.nrow))
+
+ // The result is symmetric: compute each off-diagonal value once and assign it to both triangles.
+ for (row ← 0 until mxR.nrow) {
+ // diagonal value
+ mxR(row, row) = a(row, ::).aggregate(Functions.PLUS, Functions.SQUARE)
+
+ for ( col ← row + 1 until mxR.ncol) {
+ // Vector-vector dot product; assumed to be optimized by the underlying vector implementations.
+ val v = a(row, ::) dot a(col, ::)
+
+ mxR(row, col) = v
+ mxR(col,row) = v
+ }
+ }
+
+ mxR
+ }
+
+ private def jvmOuterProdSum(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+ // The operands may already be laid out for outer-product computation, which may be faster than
+ // reorienting both matrices; this still needs benchmarking.
+ val (m, n) = (a.nrow, b.ncol)
+
+ // Prefer col-wise result iff a is dense and b is sparse. In all other cases default to row-wise.
+ val preferColWiseR = a.getFlavor.isDense && !b.getFlavor.isDense
+
+ val mxR = r.getOrElse {
+ (a.getFlavor.isDense, preferColWiseR) match {
+ case (false, false) ⇒ b.like(m, n)
+ case (false, true) ⇒ b.like(n, m).t
+ case (true, false) ⇒ a.like(m, n)
+ case (true, true) ⇒ a.like(n, m).t
+ }
+ }
+
+ // Loop outer products
+ if (preferColWiseR) {
+ // B is sparse and A is not, so iterate over b's values and update R's columns with +=,
+ // one at a time.
+ for ((acol, brow) ← a.t.zip(b); bel ← brow.nonZeroes) mxR(::, bel.index()) += bel * acol
+ } else {
+ for ((acol, brow) ← a.t.zip(b); ael ← acol.nonZeroes()) mxR(ael.index(), ::) += ael * brow
+ }
+
+ mxR
+ }
+}
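
A minimal usage sketch of the new dispatch entry point, for review purposes (a sketch only, assuming the standard math-scala scalabindings imports; dense, like and the R-like operators are the existing factory and DSL methods):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  // Dense row-wise operands: the (ROWWISE, true, ROWWISE, true) case selects jvmRWRW.
  val a = dense((1.0, 2.0), (3.0, 4.0))
  val b = dense((5.0, 6.0), (7.0, 8.0))
  val ab = MMul(a, b, None)

  // A pre-allocated result can be supplied; the kernels validate its dimensions.
  val r = a.like(2, 2)
  MMul(a, b, Some(r))

  // Self-transpose products are intended to hit the dense AA' fast path
  // (jvmDRWAAt); whether the eq guard fires depends on how the transposed
  // view is constructed.
  val aat = MMul(a, a.t, None)
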
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
index 910035f..3c0ae89 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
@@ -17,8 +17,10 @@
package org.apache.mahout.math.scalabindings
+import org.apache.mahout.math.flavor.TraversingStructureEnum
import org.apache.mahout.math.{Matrices, QRDecomposition, Vector, Matrix}
-import scala.collection.JavaConversions._
+import collection._
+import JavaConversions._
import org.apache.mahout.math.function.{DoubleDoubleFunction, VectorFunction, DoubleFunction, Functions}
import scala.math._
@@ -41,6 +43,10 @@ class MatrixOps(val m: Matrix) {
def +=(that: Matrix) = m.assign(that, Functions.PLUS)
+ def +=:(that:Matrix) = m += that
+
+ def +=:(that:Double) = m += that
+
def -=(that: Matrix) = m.assign(that, Functions.MINUS)
def +=(that: Double) = m.assign(new DoubleFunction {
@@ -70,24 +76,30 @@ class MatrixOps(val m: Matrix) {
def -:(that: Double) = that -=: cloned
-
- def norm = sqrt(m.aggregate(Functions.PLUS, Functions.SQUARE))
+ def norm = math.sqrt(m.aggregate(Functions.PLUS, Functions.SQUARE))
def pnorm(p: Int) = pow(m.aggregate(Functions.PLUS, Functions.chain(Functions.ABS, Functions.pow(p))), 1.0 / p)
def apply(row: Int, col: Int) = m.get(row, col)
- def update(row: Int, col: Int, v: Double): Matrix = {
- m.setQuick(row, col, v);
+ def update(row: Int, col: Int, that: Double): Matrix = {
+ m.setQuick(row, col, that);
m
}
+ def update(rowRange: Range, colRange: Range, that: Double) = apply(rowRange, colRange) := that
+
+ def update(row: Int, colRange: Range, that: Double) = apply(row, colRange) := that
+
+ def update(rowRange: Range, col: Int, that: Double) = apply(rowRange, col) := that
+
def update(rowRange: Range, colRange: Range, that: Matrix) = apply(rowRange, colRange) := that
def update(row: Int, colRange: Range, that: Vector) = apply(row, colRange) := that
def update(rowRange: Range, col: Int, that: Vector) = apply(rowRange, col) := that
-
+
+
def apply(rowRange: Range, colRange: Range): Matrix = {
if (rowRange == :: &&
@@ -140,12 +152,60 @@ class MatrixOps(val m: Matrix) {
})
}
+ def :=(that: Double) = m.assign(that)
+
def :=(f: (Int, Int, Double) => Double): Matrix = {
- for (r <- 0 until nrow; c <- 0 until ncol) m(r, c) = f(r, c, m(r, c))
+ import RLikeOps._
+ m.getFlavor.getStructure match {
+ case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+ for (col <- t; el <- col.all) el := f(el.index, col.index, el)
+ case _ =>
+ for (row <- m; el <- row.all) el := f(row.index, el.index, el)
+ }
+ m
+ }
+
+ /** Functional assign with (Double) => Double */
+ def :=(f: (Double) => Double): Matrix = {
+ import RLikeOps._
+ m.getFlavor.getStructure match {
+ case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+ for (col <- t; el <- col.all) el := f(el)
+ case _ =>
+ for (row <- m; el <- row.all) el := f(el)
+ }
m
}
- def cloned: Matrix = m.like := m
+ /** Sparse assign: iterate and assign over non-zeros only */
+ def ::=(f: (Int, Int, Double) => Double): Matrix = {
+
+ import RLikeOps._
+
+ m.getFlavor.getStructure match {
+ case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+ for (col <- t; el <- col.nonZeroes) el := f(el.index, col.index, el)
+ case _ =>
+ for (row <- m; el <- row.nonZeroes) el := f(row.index, el.index, el)
+ }
+ m
+ }
+
+ /** Sparse function assign: iterate and assign over non-zeros only */
+ def ::=(f: (Double) => Double): Matrix = {
+
+ import RLikeOps._
+
+ m.getFlavor.getStructure match {
+ case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+ for (col <- t; el <- col.nonZeroes) el := f(el)
+ case _ =>
+ for (row <- m; el <- row.nonZeroes) el := f(el)
+ }
+ m
+ }
+
+ def cloned: Matrix = m.like := m
/**
* Ideally, we would probably want to override equals(). But that is not
@@ -155,11 +215,14 @@ class MatrixOps(val m: Matrix) {
* @return
*/
def equiv(that: Matrix) =
+
+ // Warning: TODO: this would actually create empty objects in SparseMatrix. Should really implement a
+ // merge-type comparison strategy using iterateNonEmpty.
that != null &&
- nrow == that.nrow &&
- m.view.zip(that).forall(t => {
- t._1.equiv(t._2)
- })
+ nrow == that.nrow &&
+ m.view.zip(that).forall(t => {
+ t._1.equiv(t._2)
+ })
def nequiv(that: Matrix) = !equiv(that)
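
The new assignment operators above are easiest to see in a short sketch (assuming the same scalabindings imports; dense and the :: all-range marker are the existing package members):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  val m = dense((1.0, 2.0), (0.0, 3.0))

  // New scalar update overloads over slices:
  m(0, ::) = 5.0           // assign 5.0 across the first row
  m(0 until 2, 1) = 0.0    // zero out the second column

  // Functional assign visits every element, honoring the matrix's
  // preferred (row- or column-wise) traversal order:
  m := { (row, col, v) => v + row + col }

  // Sparse functional assign (::=) visits non-zeroes only, preserving
  // sparse structure:
  m ::= { v => v * 2 }

  // The new right-associative scalar form:
  1.0 +=: m                // equivalent to m += 1.0
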
http://git-wip-us.apache.org/repos/asf/mahout/blob/8a6b805a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
new file mode 100644
index 0000000..a1e9377
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.scalabindings
+
+import org.apache.mahout.math._
+
+class RLikeDoubleScalarOps(val x:Double) extends AnyVal{
+
+ import RLikeOps._
+
+ def +(that:Matrix) = that + x
+
+ def +(that:Vector) = that + x
+
+ def *(that:Matrix) = that * x
+
+ def *(that:Vector) = that * x
+
+ def -(that:Matrix) = x -: that
+
+ def -(that:Vector) = x -: that
+
+ def /(that:Matrix) = x /: that
+
+ def /(that:Vector) = x /: that
+
+ def cbind(that:Matrix) = {
+ val mx = that.like(that.nrow, that.ncol + 1)
+ mx(::, 1 until mx.ncol) := that
+ if (x != 0.0) mx(::, 0) := x
+ mx
+ }
+
+ def rbind(that: Matrix) = {
+ val mx = that.like(that.nrow + 1, that.ncol)
+ mx(1 until mx.nrow, ::) := that
+ if (x != 0.0) mx(0, ::) := x
+ mx
+ }
+
+ def c(that: Vector): Vector = {
+ val cv = that.like(that.length + 1)
+ cv(1 until cv.length) := that
+ cv(0) = x
+ cv
+ }
+
+}
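
With DoubleScalarOps deleted above, this class takes over the scalar-on-the-left operations and adds the binding helpers. A short sketch (assuming the implicit Double-to-RLikeDoubleScalarOps conversion is registered in RLikeOps, as the old conversion was):

  import org.apache.mahout.math.scalabindings._
  import RLikeOps._

  val m = dense((1.0, 2.0), (3.0, 4.0))
  val v = dvec(2.0, 4.0)

  // Scalar on the left delegates to the right-operand forms:
  val d = 10.0 - m         // elementwise 10.0 - m(i, j)
  val q = 1.0 / v          // elementwise reciprocal

  // New here: prepending a constant column, row, or element.
  val withBias = 1.0 cbind m   // leading column of ones, then m
  val padded   = 0.0 rbind m   // leading row of zeros; the x != 0.0 guard
                               // skips the redundant assignment
  val cv       = 5.0 c v       // yields (5.0, 2.0, 4.0)
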