You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2015/04/14 08:28:54 UTC
[9/9] mahout git commit: MAHOUT-1681: Renamed mahout-math-scala to
mahout-samsara
MAHOUT-1681: Renamed mahout-math-scala to mahout-samsara
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/f7b69fab
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/f7b69fab
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/f7b69fab
Branch: refs/heads/master
Commit: f7b69fabf1253b5e735e269c9410459d91816cdd
Parents: 63e82ea
Author: Stevo Slavic <ss...@gmail.com>
Authored: Tue Apr 14 08:28:15 2015 +0200
Committer: Stevo Slavic <ss...@gmail.com>
Committed: Tue Apr 14 08:28:15 2015 +0200
----------------------------------------------------------------------
CHANGELOG | 2 +
distribution/pom.xml | 2 +-
distribution/src/main/assembly/bin.xml | 8 +-
h2o/pom.xml | 4 +-
math-scala/pom.xml | 197 -------
.../classifier/naivebayes/NBClassifier.scala | 119 ----
.../mahout/classifier/naivebayes/NBModel.scala | 217 --------
.../classifier/naivebayes/NaiveBayes.scala | 380 -------------
.../classifier/stats/ClassifierStats.scala | 467 ----------------
.../classifier/stats/ConfusionMatrix.scala | 460 ----------------
.../apache/mahout/drivers/MahoutDriver.scala | 44 --
.../mahout/drivers/MahoutOptionParser.scala | 220 --------
.../mahout/math/cf/SimilarityAnalysis.scala | 308 -----------
.../apache/mahout/math/decompositions/ALS.scala | 140 -----
.../apache/mahout/math/decompositions/DQR.scala | 74 ---
.../mahout/math/decompositions/DSPCA.scala | 153 ------
.../mahout/math/decompositions/DSSVD.scala | 82 ---
.../mahout/math/decompositions/SSVD.scala | 165 ------
.../mahout/math/decompositions/package.scala | 141 -----
.../org/apache/mahout/math/drm/BCast.scala | 23 -
.../org/apache/mahout/math/drm/CacheHint.scala | 19 -
.../mahout/math/drm/CheckpointedDrm.scala | 47 --
.../mahout/math/drm/CheckpointedOps.scala | 43 --
.../mahout/math/drm/DistributedContext.scala | 27 -
.../mahout/math/drm/DistributedEngine.scala | 215 --------
.../mahout/math/drm/DrmDoubleScalarOps.scala | 33 --
.../org/apache/mahout/math/drm/DrmLike.scala | 55 --
.../org/apache/mahout/math/drm/DrmLikeOps.scala | 118 ----
.../apache/mahout/math/drm/RLikeDrmOps.scala | 146 -----
.../math/drm/logical/AbstractBinaryOp.scala | 54 --
.../math/drm/logical/AbstractUnaryOp.scala | 37 --
.../math/drm/logical/CheckpointAction.scala | 47 --
.../apache/mahout/math/drm/logical/OpAB.scala | 41 --
.../mahout/math/drm/logical/OpABAnyKey.scala | 41 --
.../apache/mahout/math/drm/logical/OpABt.scala | 42 --
.../apache/mahout/math/drm/logical/OpAewB.scala | 46 --
.../mahout/math/drm/logical/OpAewScalar.scala | 45 --
.../apache/mahout/math/drm/logical/OpAt.scala | 35 --
.../apache/mahout/math/drm/logical/OpAtA.scala | 36 --
.../mahout/math/drm/logical/OpAtAnyKey.scala | 34 --
.../apache/mahout/math/drm/logical/OpAtB.scala | 42 --
.../apache/mahout/math/drm/logical/OpAtx.scala | 41 --
.../apache/mahout/math/drm/logical/OpAx.scala | 42 --
.../mahout/math/drm/logical/OpCbind.scala | 42 --
.../mahout/math/drm/logical/OpMapBlock.scala | 43 --
.../apache/mahout/math/drm/logical/OpPar.scala | 18 -
.../mahout/math/drm/logical/OpRbind.scala | 40 --
.../mahout/math/drm/logical/OpRowRange.scala | 36 --
.../math/drm/logical/OpTimesLeftMatrix.scala | 43 --
.../math/drm/logical/OpTimesRightMatrix.scala | 46 --
.../org/apache/mahout/math/drm/package.scala | 136 -----
.../math/indexeddataset/IndexedDataset.scala | 63 ---
.../math/indexeddataset/ReaderWriter.scala | 117 ----
.../mahout/math/indexeddataset/Schema.scala | 104 ----
.../math/scalabindings/DoubleScalarOps.scala | 42 --
.../scalabindings/MatlabLikeMatrixOps.scala | 66 ---
.../math/scalabindings/MatlabLikeOps.scala | 35 --
.../math/scalabindings/MatlabLikeTimesOps.scala | 28 -
.../scalabindings/MatlabLikeVectorOps.scala | 73 ---
.../mahout/math/scalabindings/MatrixOps.scala | 215 --------
.../math/scalabindings/RLikeMatrixOps.scala | 94 ----
.../mahout/math/scalabindings/RLikeOps.scala | 38 --
.../math/scalabindings/RLikeTimesOps.scala | 28 -
.../math/scalabindings/RLikeVectorOps.scala | 71 ---
.../mahout/math/scalabindings/VectorOps.scala | 141 -----
.../mahout/math/scalabindings/package.scala | 297 ----------
.../org/apache/mahout/nlp/tfidf/TFIDF.scala | 112 ----
.../classifier/naivebayes/NBTestBase.scala | 291 ----------
.../stats/ClassifierStatsTestBase.scala | 257 ---------
.../decompositions/DecompositionsSuite.scala | 113 ----
.../DistributedDecompositionsSuiteBase.scala | 219 --------
.../mahout/math/drm/DrmLikeOpsSuiteBase.scala | 93 ----
.../mahout/math/drm/DrmLikeSuiteBase.scala | 76 ---
.../mahout/math/drm/RLikeDrmOpsSuiteBase.scala | 550 -------------------
.../mahout/math/scalabindings/MathSuite.scala | 214 --------
.../MatlabLikeMatrixOpsSuite.scala | 67 ---
.../math/scalabindings/MatrixOpsSuite.scala | 185 -------
.../scalabindings/RLikeMatrixOpsSuite.scala | 80 ---
.../scalabindings/RLikeVectorOpsSuite.scala | 36 --
.../math/scalabindings/VectorOpsSuite.scala | 82 ---
.../apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 184 -------
.../mahout/test/DistributedMahoutSuite.scala | 28 -
.../mahout/test/LoggerConfiguration.scala | 16 -
.../org/apache/mahout/test/MahoutSuite.scala | 54 --
pom.xml | 6 +-
samsara/pom.xml | 194 +++++++
.../classifier/naivebayes/NBClassifier.scala | 119 ++++
.../mahout/classifier/naivebayes/NBModel.scala | 217 ++++++++
.../classifier/naivebayes/NaiveBayes.scala | 380 +++++++++++++
.../classifier/stats/ClassifierStats.scala | 467 ++++++++++++++++
.../classifier/stats/ConfusionMatrix.scala | 460 ++++++++++++++++
.../apache/mahout/drivers/MahoutDriver.scala | 44 ++
.../mahout/drivers/MahoutOptionParser.scala | 220 ++++++++
.../mahout/math/cf/SimilarityAnalysis.scala | 308 +++++++++++
.../apache/mahout/math/decompositions/ALS.scala | 140 +++++
.../apache/mahout/math/decompositions/DQR.scala | 74 +++
.../mahout/math/decompositions/DSPCA.scala | 153 ++++++
.../mahout/math/decompositions/DSSVD.scala | 82 +++
.../mahout/math/decompositions/SSVD.scala | 165 ++++++
.../mahout/math/decompositions/package.scala | 141 +++++
.../org/apache/mahout/math/drm/BCast.scala | 23 +
.../org/apache/mahout/math/drm/CacheHint.scala | 19 +
.../mahout/math/drm/CheckpointedDrm.scala | 47 ++
.../mahout/math/drm/CheckpointedOps.scala | 43 ++
.../mahout/math/drm/DistributedContext.scala | 27 +
.../mahout/math/drm/DistributedEngine.scala | 215 ++++++++
.../mahout/math/drm/DrmDoubleScalarOps.scala | 33 ++
.../org/apache/mahout/math/drm/DrmLike.scala | 55 ++
.../org/apache/mahout/math/drm/DrmLikeOps.scala | 118 ++++
.../apache/mahout/math/drm/RLikeDrmOps.scala | 146 +++++
.../math/drm/logical/AbstractBinaryOp.scala | 54 ++
.../math/drm/logical/AbstractUnaryOp.scala | 37 ++
.../math/drm/logical/CheckpointAction.scala | 47 ++
.../apache/mahout/math/drm/logical/OpAB.scala | 41 ++
.../mahout/math/drm/logical/OpABAnyKey.scala | 41 ++
.../apache/mahout/math/drm/logical/OpABt.scala | 42 ++
.../apache/mahout/math/drm/logical/OpAewB.scala | 46 ++
.../mahout/math/drm/logical/OpAewScalar.scala | 45 ++
.../apache/mahout/math/drm/logical/OpAt.scala | 35 ++
.../apache/mahout/math/drm/logical/OpAtA.scala | 36 ++
.../mahout/math/drm/logical/OpAtAnyKey.scala | 34 ++
.../apache/mahout/math/drm/logical/OpAtB.scala | 42 ++
.../apache/mahout/math/drm/logical/OpAtx.scala | 41 ++
.../apache/mahout/math/drm/logical/OpAx.scala | 42 ++
.../mahout/math/drm/logical/OpCbind.scala | 42 ++
.../mahout/math/drm/logical/OpMapBlock.scala | 43 ++
.../apache/mahout/math/drm/logical/OpPar.scala | 18 +
.../mahout/math/drm/logical/OpRbind.scala | 40 ++
.../mahout/math/drm/logical/OpRowRange.scala | 36 ++
.../math/drm/logical/OpTimesLeftMatrix.scala | 43 ++
.../math/drm/logical/OpTimesRightMatrix.scala | 46 ++
.../org/apache/mahout/math/drm/package.scala | 136 +++++
.../math/indexeddataset/IndexedDataset.scala | 63 +++
.../math/indexeddataset/ReaderWriter.scala | 117 ++++
.../mahout/math/indexeddataset/Schema.scala | 104 ++++
.../math/scalabindings/DoubleScalarOps.scala | 42 ++
.../scalabindings/MatlabLikeMatrixOps.scala | 66 +++
.../math/scalabindings/MatlabLikeOps.scala | 35 ++
.../math/scalabindings/MatlabLikeTimesOps.scala | 28 +
.../scalabindings/MatlabLikeVectorOps.scala | 73 +++
.../mahout/math/scalabindings/MatrixOps.scala | 215 ++++++++
.../math/scalabindings/RLikeMatrixOps.scala | 94 ++++
.../mahout/math/scalabindings/RLikeOps.scala | 38 ++
.../math/scalabindings/RLikeTimesOps.scala | 28 +
.../math/scalabindings/RLikeVectorOps.scala | 71 +++
.../mahout/math/scalabindings/VectorOps.scala | 141 +++++
.../mahout/math/scalabindings/package.scala | 297 ++++++++++
.../org/apache/mahout/nlp/tfidf/TFIDF.scala | 112 ++++
.../classifier/naivebayes/NBTestBase.scala | 291 ++++++++++
.../stats/ClassifierStatsTestBase.scala | 257 +++++++++
.../decompositions/DecompositionsSuite.scala | 113 ++++
.../DistributedDecompositionsSuiteBase.scala | 219 ++++++++
.../mahout/math/drm/DrmLikeOpsSuiteBase.scala | 93 ++++
.../mahout/math/drm/DrmLikeSuiteBase.scala | 76 +++
.../mahout/math/drm/RLikeDrmOpsSuiteBase.scala | 550 +++++++++++++++++++
.../mahout/math/scalabindings/MathSuite.scala | 214 ++++++++
.../MatlabLikeMatrixOpsSuite.scala | 67 +++
.../math/scalabindings/MatrixOpsSuite.scala | 185 +++++++
.../scalabindings/RLikeMatrixOpsSuite.scala | 80 +++
.../scalabindings/RLikeVectorOpsSuite.scala | 36 ++
.../math/scalabindings/VectorOpsSuite.scala | 82 +++
.../apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 184 +++++++
.../mahout/test/DistributedMahoutSuite.scala | 28 +
.../mahout/test/LoggerConfiguration.scala | 16 +
.../org/apache/mahout/test/MahoutSuite.scala | 54 ++
spark-shell/pom.xml | 2 +-
spark/pom.xml | 4 +-
167 files changed, 8961 insertions(+), 8962 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 777963a..a3e39ac 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.11.0 - unreleased
+ MAHOUT-1681: Renamed mahout-math-scala to mahout-samsara
+
MAHOUT-1680: Renamed mahout-distribution to apache-mahout-distribution
Release 0.10.0 - 2015-04-11
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/distribution/pom.xml
----------------------------------------------------------------------
diff --git a/distribution/pom.xml b/distribution/pom.xml
index bc17a08..3a47e08 100644
--- a/distribution/pom.xml
+++ b/distribution/pom.xml
@@ -115,7 +115,7 @@
</dependency>
<dependency>
<groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
+ <artifactId>mahout-samsara_${scala.compat.version}</artifactId>
</dependency>
</dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/distribution/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/distribution/src/main/assembly/bin.xml b/distribution/src/main/assembly/bin.xml
index c49ddc2..5dd014c 100644
--- a/distribution/src/main/assembly/bin.xml
+++ b/distribution/src/main/assembly/bin.xml
@@ -117,7 +117,7 @@
<outputDirectory/>
</fileSet>
<fileSet>
- <directory>${project.basedir}/../math-scala/target</directory>
+ <directory>${project.basedir}/../samsara/target</directory>
<includes>
<include>mahout-*.jar</include>
<include>mahout-*.job</include>
@@ -193,12 +193,12 @@
<outputDirectory>docs/mahout-examples</outputDirectory>
</fileSet>
<fileSet>
- <directory>${project.basedir}/../math-scala/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-examples</outputDirectory>
+ <directory>${project.basedir}/../samsara/target/site/scaladocs</directory>
+ <outputDirectory>docs/mahout-samsara</outputDirectory>
</fileSet>
<fileSet>
<directory>${project.basedir}/../spark/target/site/scaladocs</directory>
- <outputDirectory>docs/mahout-examples</outputDirectory>
+ <outputDirectory>docs/mahout-spark</outputDirectory>
</fileSet>
<fileSet>
<directory>${project.basedir}/../spark-shell/target/site/scaladocs</directory>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/h2o/pom.xml
----------------------------------------------------------------------
diff --git a/h2o/pom.xml b/h2o/pom.xml
index b9d101a..c0ccdcc 100644
--- a/h2o/pom.xml
+++ b/h2o/pom.xml
@@ -127,7 +127,7 @@
<dependency>
<groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
+ <artifactId>mahout-samsara_${scala.compat.version}</artifactId>
<version>${project.version}</version>
</dependency>
@@ -140,7 +140,7 @@
<dependency>
<groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
+ <artifactId>mahout-samsara_${scala.compat.version}</artifactId>
<classifier>tests</classifier>
<scope>test</scope>
</dependency>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/pom.xml
----------------------------------------------------------------------
diff --git a/math-scala/pom.xml b/math-scala/pom.xml
deleted file mode 100644
index 78331dd..0000000
--- a/math-scala/pom.xml
+++ /dev/null
@@ -1,197 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout</artifactId>
- <version>0.11.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>mahout-math-scala_${scala.compat.version}</artifactId>
- <name>Mahout Math Scala bindings</name>
- <description>High performance scientific and technical computing data structures and methods,
- mostly based on CERN's
- Colt Java API
- </description>
-
- <packaging>jar</packaging>
-
- <build>
- <plugins>
- <!-- create test jar so other modules can reuse the math-scala test utility classes. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- <phase>package</phase>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <artifactId>maven-javadoc-plugin</artifactId>
- </plugin>
-
- <plugin>
- <artifactId>maven-source-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>net.alchim31.maven</groupId>
- <artifactId>scala-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>add-scala-sources</id>
- <phase>initialize</phase>
- <goals>
- <goal>add-source</goal>
- </goals>
- </execution>
- <execution>
- <id>scala-compile</id>
- <phase>process-resources</phase>
- <goals>
- <goal>compile</goal>
- </goals>
- </execution>
- <execution>
- <id>scala-test-compile</id>
- <phase>process-test-resources</phase>
- <goals>
- <goal>testCompile</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!--this is what scalatest recommends to do to enable scala tests -->
-
- <!-- disable surefire -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <configuration>
- <skipTests>true</skipTests>
- </configuration>
- </plugin>
- <!-- enable scalatest -->
- <plugin>
- <groupId>org.scalatest</groupId>
- <artifactId>scalatest-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>test</id>
- <goals>
- <goal>test</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- </plugins>
- </build>
-
- <dependencies>
-
- <dependency>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
-
- <!-- 3rd-party -->
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.github.scopt</groupId>
- <artifactId>scopt_${scala.compat.version}</artifactId>
- <version>3.3.0</version>
- </dependency>
-
- <!-- scala stuff -->
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-compiler</artifactId>
- <version>${scala.version}</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-reflect</artifactId>
- <version>${scala.version}</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-library</artifactId>
- <version>${scala.version}</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-actors</artifactId>
- <version>${scala.version}</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scalap</artifactId>
- <version>${scala.version}</version>
- </dependency>
- <dependency>
- <groupId>org.scalatest</groupId>
- <artifactId>scalatest_${scala.compat.version}</artifactId>
- </dependency>
-
- </dependencies>
-
- <profiles>
- <profile>
- <id>mahout-release</id>
- <build>
- <plugins>
- <plugin>
- <groupId>net.alchim31.maven</groupId>
- <artifactId>scala-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>generate-scaladoc</id>
- <goals>
- <goal>doc</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-scaladoc-jar</id>
- <goals>
- <goal>doc-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-</project>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
deleted file mode 100644
index 5de0733..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBClassifier.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.math.Vector
-import scala.collection.JavaConversions._
-
-/**
- * Abstract Classifier base for Complentary and Standard Classifiers
- * @param nbModel a trained NBModel
- */
-abstract class AbstractNBClassifier(nbModel: NBModel) extends java.io.Serializable {
-
- // Trained Naive Bayes Model
- val model = nbModel
-
- /** scoring method for standard and complementary classifiers */
- protected def getScoreForLabelFeature(label: Int, feature: Int): Double
-
- /** getter for model */
- protected def getModel: NBModel= {
- model
- }
-
- /**
- * Compute the score for a Vector of weighted TF-IDF featured
- * @param label Label to be scored
- * @param instance Vector of weights to be calculate score
- * @return score for this Label
- */
- protected def getScoreForLabelInstance(label: Int, instance: Vector): Double = {
- var result: Double = 0.0
- for (e <- instance.nonZeroes) {
- result += e.get * getScoreForLabelFeature(label, e.index)
- }
- result
- }
-
- /** number of categories the model has been trained on */
- def numCategories: Int = {
- model.numLabels
- }
-
- /**
- * get a scoring vector for a vector of TF of TF-IDF weights
- * @param instance vector of TF of TF-IDF weights to be classified
- * @return a vector of scores.
- */
- def classifyFull(instance: Vector): Vector = {
- classifyFull(model.createScoringVector, instance)
- }
-
- /** helper method for classifyFull(Vector) */
- def classifyFull(r: Vector, instance: Vector): Vector = {
- var label: Int = 0
- for (label <- 0 until model.numLabels) {
- r.setQuick(label, getScoreForLabelInstance(label, instance))
- }
- r
- }
-}
-
-/**
- * Standard Multinomial Naive Bayes Classifier
- * @param nbModel a trained NBModel
- */
-class StandardNBClassifier(nbModel: NBModel) extends AbstractNBClassifier(nbModel: NBModel) with java.io.Serializable{
- override def getScoreForLabelFeature(label: Int, feature: Int): Double = {
- val model: NBModel = getModel
- StandardNBClassifier.computeWeight(model.weight(label, feature), model.labelWeight(label), model.alphaI, model.numFeatures)
- }
-}
-
-/** helper object for StandardNBClassifier */
-object StandardNBClassifier extends java.io.Serializable {
- /** Compute Standard Multinomial Naive Bayes Weights See Rennie et. al. Section 2.1 */
- def computeWeight(featureLabelWeight: Double, labelWeight: Double, alphaI: Double, numFeatures: Double): Double = {
- val numerator: Double = featureLabelWeight + alphaI
- val denominator: Double = labelWeight + alphaI * numFeatures
- return Math.log(numerator / denominator)
- }
-}
-
-/**
- * Complementary Naive Bayes Classifier
- * @param nbModel a trained NBModel
- */
-class ComplementaryNBClassifier(nbModel: NBModel) extends AbstractNBClassifier(nbModel: NBModel) with java.io.Serializable {
- override def getScoreForLabelFeature(label: Int, feature: Int): Double = {
- val model: NBModel = getModel
- val weight: Double = ComplementaryNBClassifier.computeWeight(model.featureWeight(feature), model.weight(label, feature), model.totalWeightSum, model.labelWeight(label), model.alphaI, model.numFeatures)
- return weight / model.thetaNormalizer(label)
- }
-}
-
-/** helper object for ComplementaryNBClassifier */
-object ComplementaryNBClassifier extends java.io.Serializable {
-
- /** Compute Complementary weights See Rennie et. al. Section 3.1 */
- def computeWeight(featureWeight: Double, featureLabelWeight: Double, totalWeight: Double, labelWeight: Double, alphaI: Double, numFeatures: Double): Double = {
- val numerator: Double = featureWeight - featureLabelWeight + alphaI
- val denominator: Double = totalWeight - labelWeight + alphaI * numFeatures
- return -Math.log(numerator / denominator)
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
deleted file mode 100644
index 3ceae96..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NBModel.scala
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.math._
-
-import org.apache.mahout.math.{drm, scalabindings}
-
-import scalabindings._
-import scalabindings.RLikeOps._
-import drm.RLikeDrmOps._
-import drm._
-import scala.collection.JavaConverters._
-import scala.language.asInstanceOf
-import scala.collection._
-import JavaConversions._
-
-/**
- *
- * @param weightsPerLabelAndFeature Aggregated matrix of weights of labels x features
- * @param weightsPerFeature Vector of summation of all feature weights.
- * @param weightsPerLabel Vector of summation of all label weights.
- * @param perlabelThetaNormalizer Vector of weight normalizers per label (used only for complemtary models)
- * @param labelIndex HashMap of labels and their corresponding row in the weightMatrix
- * @param alphaI Laplace smoothing factor.
- * @param isComplementary Whether or not this is a complementary model.
- */
-class NBModel(val weightsPerLabelAndFeature: Matrix = null,
- val weightsPerFeature: Vector = null,
- val weightsPerLabel: Vector = null,
- val perlabelThetaNormalizer: Vector = null,
- val labelIndex: Map[String, Integer] = null,
- val alphaI: Float = 1.0f,
- val isComplementary: Boolean= false) extends java.io.Serializable {
-
-
- val numFeatures: Double = weightsPerFeature.getNumNondefaultElements
- val totalWeightSum: Double = weightsPerLabel.zSum
- val alphaVector: Vector = null
-
- validate()
-
- // todo: Maybe it is a good idea to move the dfsWrite and dfsRead out
- // todo: of the model and into a helper
-
- // TODO: weightsPerLabelAndFeature, a sparse (numFeatures x numLabels) matrix should fit
- // TODO: upfront in memory and should not require a DRM decide if we want this to scale out.
-
-
- /** getter for summed label weights. Used by legacy classifier */
- def labelWeight(label: Int): Double = {
- weightsPerLabel.getQuick(label)
- }
-
- /** getter for weight normalizers. Used by legacy classifier */
- def thetaNormalizer(label: Int): Double = {
- perlabelThetaNormalizer.get(label)
- }
-
- /** getter for summed feature weights. Used by legacy classifier */
- def featureWeight(feature: Int): Double = {
- weightsPerFeature.getQuick(feature)
- }
-
- /** getter for individual aggregated weights. Used by legacy classifier */
- def weight(label: Int, feature: Int): Double = {
- weightsPerLabelAndFeature.getQuick(label, feature)
- }
-
- /** getter for a single empty vector of weights */
- def createScoringVector: Vector = {
- weightsPerLabel.like
- }
-
- /** getter for a the number of labels to consider */
- def numLabels: Int = {
- weightsPerLabel.size
- }
-
- /**
- * Write a trained model to the filesystem as a series of DRMs
- * @param pathToModel Directory to which the model will be written
- */
- def dfsWrite(pathToModel: String)(implicit ctx: DistributedContext): Unit = {
- //todo: write out as smaller partitions or possibly use reader and writers to
- //todo: write something other than a DRM for label Index, is Complementary, alphaI.
-
- // add a directory to put all of the DRMs in
- val fullPathToModel = pathToModel + NBModel.modelBaseDirectory
-
- drmParallelize(weightsPerLabelAndFeature).dfsWrite(fullPathToModel + "/weightsPerLabelAndFeatureDrm.drm")
- drmParallelize(sparse(weightsPerFeature)).dfsWrite(fullPathToModel + "/weightsPerFeatureDrm.drm")
- drmParallelize(sparse(weightsPerLabel)).dfsWrite(fullPathToModel + "/weightsPerLabelDrm.drm")
- drmParallelize(sparse(perlabelThetaNormalizer)).dfsWrite(fullPathToModel + "/perlabelThetaNormalizerDrm.drm")
- drmParallelize(sparse(svec((0,alphaI)::Nil))).dfsWrite(fullPathToModel + "/alphaIDrm.drm")
-
- // isComplementry is true if isComplementaryDrm(0,0) == 1 else false
- val isComplementaryDrm = sparse(0 to 1, 0 to 1)
- if(isComplementary){
- isComplementaryDrm(0,0) = 1.0
- } else {
- isComplementaryDrm(0,0) = 0.0
- }
- drmParallelize(isComplementaryDrm).dfsWrite(fullPathToModel + "/isComplementaryDrm.drm")
-
- // write the label index as a String-Keyed DRM.
- val labelIndexDummyDrm = weightsPerLabelAndFeature.like()
- labelIndexDummyDrm.setRowLabelBindings(labelIndex)
- // get a reverse map of [Integer, String] and set the value of firsr column of the drm
- // to the corresponding row number for it's Label (the rows may not be read back in the same order)
- val revMap = labelIndex.map(x => x._2 -> x._1)
- for(i <- 0 until labelIndexDummyDrm.numRows() ){
- labelIndexDummyDrm.set(labelIndex(revMap(i)), 0, i.toDouble)
- }
-
- drmParallelizeWithRowLabels(labelIndexDummyDrm).dfsWrite(fullPathToModel + "/labelIndex.drm")
- }
-
- /** Model Validation */
- def validate() {
- assert(alphaI > 0, "alphaI has to be greater than 0!")
- assert(numFeatures > 0, "the vocab count has to be greater than 0!")
- assert(totalWeightSum > 0, "the totalWeightSum has to be greater than 0!")
- assert(weightsPerLabel != null, "the number of labels has to be defined!")
- assert(weightsPerLabel.getNumNondefaultElements > 0, "the number of labels has to be greater than 0!")
- assert(weightsPerFeature != null, "the feature sums have to be defined")
- assert(weightsPerFeature.getNumNondefaultElements > 0, "the feature sums have to be greater than 0!")
- if (isComplementary) {
- assert(perlabelThetaNormalizer != null, "the theta normalizers have to be defined")
- assert(perlabelThetaNormalizer.getNumNondefaultElements > 0, "the number of theta normalizers has to be greater than 0!")
- assert(Math.signum(perlabelThetaNormalizer.minValue) == Math.signum(perlabelThetaNormalizer.maxValue), "Theta normalizers do not all have the same sign")
- assert(perlabelThetaNormalizer.getNumNonZeroElements == perlabelThetaNormalizer.size, "Weight normalizers can not have zero value.")
- }
- assert(labelIndex.size == weightsPerLabel.getNumNondefaultElements, "label index must have entries for all labels")
- }
-}
-
-object NBModel extends java.io.Serializable {
-
- val modelBaseDirectory = "/naiveBayesModel"
-
- /**
- * Read a trained model in from from the filesystem.
- * @param pathToModel directory from which to read individual model components
- * @return a valid NBModel
- */
- def dfsRead(pathToModel: String)(implicit ctx: DistributedContext): NBModel = {
- //todo: Takes forever to read we need a more practical method of writing models. Readers/Writers?
-
- // read from a base directory for all drms
- val fullPathToModel = pathToModel + modelBaseDirectory
-
- val weightsPerFeatureDrm = drmDfsRead(fullPathToModel + "/weightsPerFeatureDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerFeature = weightsPerFeatureDrm.collect(0, ::)
- weightsPerFeatureDrm.uncache()
-
- val weightsPerLabelDrm = drmDfsRead(fullPathToModel + "/weightsPerLabelDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerLabel = weightsPerLabelDrm.collect(0, ::)
- weightsPerLabelDrm.uncache()
-
- val alphaIDrm = drmDfsRead(fullPathToModel + "/alphaIDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val alphaI: Float = alphaIDrm.collect(0, 0).toFloat
- alphaIDrm.uncache()
-
- // isComplementry is true if isComplementaryDrm(0,0) == 1 else false
- val isComplementaryDrm = drmDfsRead(fullPathToModel + "/isComplementaryDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val isComplementary = isComplementaryDrm.collect(0, 0).toInt == 1
- isComplementaryDrm.uncache()
-
- var perLabelThetaNormalizer= weightsPerFeature.like()
- if (isComplementary) {
- val perLabelThetaNormalizerDrm = drm.drmDfsRead(fullPathToModel + "/perlabelThetaNormalizerDrm.drm")
- .checkpoint(CacheHint.MEMORY_ONLY)
- perLabelThetaNormalizer = perLabelThetaNormalizerDrm.collect(0, ::)
- }
-
- val dummyLabelDrm= drmDfsRead(fullPathToModel + "/labelIndex.drm")
- .checkpoint(CacheHint.MEMORY_ONLY)
- val labelIndexMap:java.util.Map[String, Integer] = dummyLabelDrm.getRowLabelBindings
- dummyLabelDrm.uncache()
-
- // map the labels to the corresponding row numbers of weightsPerFeatureDrm (values in dummyLabelDrm)
- val scalaLabelIndexMap: mutable.Map[String, Integer] =
- labelIndexMap.map(x => x._1 -> dummyLabelDrm.get(labelIndexMap(x._1), 0)
- .toInt
- .asInstanceOf[Integer])
-
- val weightsPerLabelAndFeatureDrm = drmDfsRead(fullPathToModel + "/weightsPerLabelAndFeatureDrm.drm").checkpoint(CacheHint.MEMORY_ONLY)
- val weightsPerLabelAndFeature = weightsPerLabelAndFeatureDrm.collect
- weightsPerLabelAndFeatureDrm.uncache()
-
- // model validation is triggered automatically by constructor
- val model: NBModel = new NBModel(weightsPerLabelAndFeature,
- weightsPerFeature,
- weightsPerLabel,
- perLabelThetaNormalizer,
- scalaLabelIndexMap,
- alphaI,
- isComplementary)
-
- model
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
deleted file mode 100644
index a15ca09..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/naivebayes/NaiveBayes.scala
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.naivebayes
-
-import org.apache.mahout.classifier.stats.{ResultAnalyzer, ClassifierResult}
-import org.apache.mahout.math._
-import scalabindings._
-import scalabindings.RLikeOps._
-import drm.RLikeDrmOps._
-import drm._
-import scala.reflect.ClassTag
-import scala.language.asInstanceOf
-import collection._
-import scala.collection.JavaConversions._
-
-/**
- * Distributed training of a Naive Bayes model. Follows the approach presented in Rennie et.al.: Tackling the poor
- * assumptions of Naive Bayes Text classifiers, ICML 2003, http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf
- */
-trait NaiveBayes extends java.io.Serializable{
-
- /** Laplace smoothing parameter that `train` falls back to when the caller passes none. */
- def defaultAlphaI: Float = 1.0f
-
- /** A function that derives a category name from the String key of a DRM row. */
- type CategoryParser = String => String
-
- /**
-  * Parser for keys written by seqdirectory/seq2sparse, which have the form
-  * `/Category/document_id`: the key is split on "/" and the second piece is returned.
-  */
- def seq2SparseCategoryParser: CategoryParser = (key: String) => key.split("/")(1)
-
-
- /**
-  * Trains a Naive Bayes model from observations that are already aggregated per label.
-  * Follows the approach presented in Rennie et.al.: Tackling the poor assumptions of
-  * Naive Bayes Text classifiers, ICML 2003,
-  * http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf
-  *
-  * The observation matrix is collected in-core. For a complementary model the theta
-  * normalizer is then trained locally, one label row at a time.
-  *
-  * @param observationsPerLabel a DrmLike[Int] matrix containing term frequency counts for each label.
-  * @param labelIndex map from label name to Integer; handed to the resulting model unchanged
-  * @param trainComplementary whether or not to train a complementary Naive Bayes model
-  * @param alphaI Laplace smoothing parameter
-  * @return trained naive bayes model
-  */
- def train(observationsPerLabel: DrmLike[Int],
-           labelIndex: Map[String, Integer],
-           trainComplementary: Boolean = true,
-           alphaI: Float = defaultAlphaI): NBModel = {
-
-   // column sums (one total per feature) and row sums (one total per label)
-   val featureTotals = observationsPerLabel.colSums
-   val labelTotals = observationsPerLabel.rowSums
-
-   // in-core copy of the observations, one row per label
-   val observations = observationsPerLabel.collect
-
-   val thetaNormalizer =
-     if (trainComplementary) {
-       // complementary model: accumulate the normalizer label by label
-       val trainer = new ComplementaryNBThetaTrainer(featureTotals, labelTotals, alphaI)
-       (0 until observations.nrow).foreach { row =>
-         trainer.train(row, observations(row, ::))
-       }
-       trainer.retrievePerLabelThetaNormalizer
-     } else {
-       // standard model: NBModel still expects a vector, so pass an empty one
-       featureTotals.like()
-     }
-
-   new NBModel(observations,
-     featureTotals,
-     labelTotals,
-     thetaNormalizer,
-     labelIndex,
-     alphaI,
-     trainComplementary)
- }
-
- /**
- * Extract label Keys from raw TF or TF-IDF Matrix generated by seqdirectory/seq2sparse
- * and aggregate TF or TF-IDF values by their label
- * Override this method in engine specific modules to optimize
- *
- * *Note*: this implementation collects the whole observation matrix in-core, copies it
- * row by row into an Int-keyed SparseMatrix and parallelizes that copy again.
- *
- * @param stringKeyedObservations DrmLike matrix; Output from seq2sparse
- * in form K = eg./Category/document_title
- * V = TF or TF-IDF values per term
- * @param cParser a String => String function used to extract categories from
- * Keys of the stringKeyedObservations DRM. The default
- * CategoryParser will extract "Category" from: '/Category/document_id'
- * @param ctx implicit distributed context (presumably consumed by drmParallelize and
- * drmBroadcast - confirm against their signatures)
- * @return (labelIndexMap,aggregatedByLabelObservationDrm)
- * labelIndexMap is a HashMap [String, Integer] K = label
- * V = label index, starting at 0 and assigned in order of first
- * appearance when rows are visited by ascending row index
- * aggregatedByLabelObservationDrm is a DrmLike[Int] of aggregated
- * TF or TF-IDF counts per label
- */
- def extractLabelsAndAggregateObservations[K: ClassTag](stringKeyedObservations: DrmLike[K],
- cParser: CategoryParser = seq2SparseCategoryParser)
- (implicit ctx: DistributedContext):
- (mutable.HashMap[String, Integer], DrmLike[Int])= {
-
- stringKeyedObservations.checkpoint() // NOTE(review): the value returned by checkpoint() is discarded - confirm caching still takes effect
-
- val numDocs=stringKeyedObservations.nrow
- val numFeatures=stringKeyedObservations.ncol // NOTE(review): numDocs and numFeatures are not read again in this method
-
- // Extract categories from labels assigned by seq2sparse
- // Categories are Stored in Drm Keys as eg.: /Category/document_id
-
- // Get a new DRM with a single column so that we don't have to collect the
- // DRM into memory upfront.
- val strippedObeservations= stringKeyedObservations.mapBlock(ncol=1){
- case(keys, block) =>
- val blockB = block.like(keys.size, 1)
- keys -> blockB
- }
-
- // Extract the row label bindings (the String keys) from the slim Drm
- // strip the document_id from the row keys keeping only the category.
- // Sort the (rowIndex, category) pairs by row index into a Vector
- val labelVectorByRowIndex = strippedObeservations
- .getRowLabelBindings
- .map(x => x._2 -> cParser(x._1))
- .toVector.sortWith(_._1 < _._1)
-
- //TODO: add a .toIntKeyed(...) method to DrmLike?
-
- // Copy stringKeyedObservations to an Int-Keyed Drm so that we can compute transpose
- // Copy the Collected Matrices up front for now until we have a distributed way of converting
- val inCoreStringKeyedObservations = stringKeyedObservations.collect
- val inCoreIntKeyedObservations = new SparseMatrix(
- stringKeyedObservations.nrow.toInt,
- stringKeyedObservations.ncol)
- for (i <- 0 until inCoreStringKeyedObservations.nrow.toInt) {
- inCoreIntKeyedObservations(i, ::) = inCoreStringKeyedObservations(i, ::)
- }
-
- val intKeyedObservations= drmParallelize(inCoreIntKeyedObservations)
-
- stringKeyedObservations.uncache()
-
- var labelIndex = 0
- val labelIndexMap = new mutable.HashMap[String, Integer]
- val encodedLabelByRowIndexVector = new DenseVector(labelVectorByRowIndex.size)
-
- // Encode Categories as an Integer (Double) so we can broadcast as a vector
- // where each element is an Int-encoded category whose index corresponds
- // to its row in the Drm
- for (i <- 0 until labelVectorByRowIndex.size) {
- if (!(labelIndexMap.contains(labelVectorByRowIndex(i)._2))) {
- encodedLabelByRowIndexVector(i) = labelIndex.toDouble // redundant: this element is assigned again unconditionally below
- labelIndexMap.put(labelVectorByRowIndex(i)._2, labelIndex)
- labelIndex += 1
- }
- // don't like this casting but need to use a java.lang.Integer when setting rowLabelBindings
- encodedLabelByRowIndexVector(i) = labelIndexMap
- .getOrElse(labelVectorByRowIndex(i)._2, -1)
- .asInstanceOf[Int].toDouble
- }
-
- // "Combiner": Map and aggregate by Category. Do this by broadcasting the encoded
- // category vector and mapping a transposed IntKeyed Drm out so that all categories
- // will be present on all nodes as columns and can be referenced by
- // bcastEncodedCategoryByRowVector. Iteratively sum all categories.
- val nLabels = labelIndex // labelIndex was incremented once per distinct label above
-
- val bcastEncodedCategoryByRowVector = drmBroadcast(encodedLabelByRowIndexVector)
-
- val aggregetedObservationByLabelDrm = intKeyedObservations.t.mapBlock(ncol = nLabels) {
- case (keys, blockA) =>
- val blockB = blockA.like(keys.size, nLabels)
- var label : Int = 0
- for (i <- 0 until keys.size) {
- blockA(i, ::).nonZeroes().foreach { elem =>
- label = bcastEncodedCategoryByRowVector.get(elem.index).toInt // column index of the transposed block is used to look up the encoded label
- blockB(i, label) = blockB(i, label) + blockA(i, elem.index)
- }
- }
- keys -> blockB
- }.t
-
- (labelIndexMap, aggregetedObservationByLabelDrm)
- }
-
- /**
-  * Scores a labeled test set against a trained model, one row after another, and tallies
-  * how often the best-scoring label matches the label parsed from the row key.
-  *
-  * *Note*: this method brings the entire test set into memory upfront. The original
-  * documentation states that an optimized, parallelized version lives in SparkNaiveBayes.
-  *
-  * @param model a trained NBModel
-  * @param testSet a labeled testing set
-  * @param testComplementary test using a complementary or a standard NB classifier
-  * @param cParser a String => String function used to extract categories from
-  *                Keys of the testing set DRM. The default
-  *                CategoryParser will extract "Category" from: '/Category/document_id'
-  * @tparam K implicitly determined Key type of test set DRM: String
-  * @return a result analyzer with confusion matrix and accuracy statistics
-  */
- def test[K: ClassTag](model: NBModel,
-                       testSet: DrmLike[K],
-                       testComplementary: Boolean = false,
-                       cParser: CategoryParser = seq2SparseCategoryParser)
-                      (implicit ctx: DistributedContext): ResultAnalyzer = {
-
-   val indexByLabel = model.labelIndex
-   val labelCount = model.numLabels
-
-   testSet.checkpoint()
-
-   val instanceCount = testSet.nrow.toInt
-
-   // pick the classifier flavour requested by the caller
-   val classifier =
-     if (testComplementary) new ComplementaryNBClassifier(model) with Serializable
-     else new StandardNBClassifier(model) with Serializable
-
-   // complementary scoring is only meaningful on a complementary-trained model
-   assert(!testComplementary || model.isComplementary,
-     "Complementary Label Assignment requires Complementary Training")
-
-   // Since the model can't be broadcast as is, score sequentially in-core for now.
-   val testRows = testSet.collect
-
-   // String row keys of the test set, bound to their row indices
-   val rowKeyBindings = testSet.getRowLabelBindings
-
-   // empty matrix that receives one row of per-label scores per test instance
-   val scores = testSet.like(instanceCount, labelCount)
-
-   testSet.uncache()
-
-   (0 until instanceCount).foreach { row =>
-     scores(row, ::) := classifier.classifyFull(testRows(row, ::))
-   }
-
-   // todo: reverse the labelMaps in training and through the model?
-
-   // row index -> expected category, and label index -> label name
-   val expectedByRow = rowKeyBindings.map(binding => binding._2 -> cParser(binding._1))
-   val labelByIndex = indexByLabel.map(entry => entry._2 -> entry._1)
-
-   val analyzer = new ResultAnalyzer(indexByLabel.keys.toList.sorted, "DEFAULT")
-
-   // winner takes all: the highest-scoring label is the prediction
-   (0 until instanceCount).foreach { row =>
-     val (winner, winnerScore) = argmax(scores(row, ::))
-     val prediction = new ClassifierResult(labelByIndex(winner), winnerScore)
-     analyzer.addInstance(expectedByRow(row), prediction)
-   }
-
-   analyzer
- }
-
- /**
-  * argmax with values as well: finds the largest element of `v` and the index it sits at.
-  *
-  * The running best starts at negative infinity, so every finite score is a candidate,
-  * however negative it is. It used to start at Integer.MIN_VALUE, which made a vector
-  * whose scores were all below -2^31 yield the sentinel index instead of a real one.
-  * On ties the lowest index wins.
-  *
-  * @param v Vector of scores
-  * @return (bestIndex, bestScore); (Integer.MIN_VALUE, Double.NegativeInfinity) when no
-  *         element compares greater than negative infinity (empty vector, or only NaN)
-  */
- def argmax(v: Vector): (Int, Double) = {
-   var bestIdx: Int = Integer.MIN_VALUE
-   var bestScore: Double = Double.NegativeInfinity
-   for (i <- 0 until v.size) {
-     val score = v(i)
-     if (score > bestScore) {
-       bestScore = score
-       bestIdx = i
-     }
-   }
-   (bestIdx, bestScore)
- }
-
-}
-
-object NaiveBayes extends NaiveBayes with java.io.Serializable // singleton exposing the trait's default implementations
-
-/**
- * Trainer for the weight normalization vector used by Transform Weight Normalized Complement
- * Naive Bayes. See: Rennie et.al.: Tackling the poor assumptions of Naive Bayes Text classifiers,
- * ICML 2003, http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf Sec. 3.2.
- *
- * @param weightsPerFeature a Vector of summed TF or TF-IDF weights for each word in dictionary.
- * @param weightsPerLabel a Vector of summed TF or TF-IDF weights for each label.
- * @param alphaI Laplace smoothing factor. Default value of 1.
- */
-class ComplementaryNBThetaTrainer(private val weightsPerFeature: Vector,
-                                  private val weightsPerLabel: Vector,
-                                  private val alphaI: Double = 1.0) {
-
-  // Validate first. These checks used to sit below the field initializers, which had
-  // already dereferenced both vectors, so a null argument never reached them.
-  assert(weightsPerFeature != null, "weightsPerFeature vector can not be null")
-  assert(weightsPerLabel != null, "weightsPerLabel vector can not be null")
-
-  // one accumulator slot per label, created empty
-  private val perLabelThetaNormalizer: Vector = weightsPerLabel.like()
-  private val totalWeightSum: Double = weightsPerLabel.zSum
-  private val numFeatures: Double = weightsPerFeature.getNumNondefaultElements
-
-  /**
-   * Train the weight normalization vector for one label: the weight computed for every
-   * feature is added, by magnitude, to that label's slot of the normalizer.
-   *
-   * @param label index of the label being trained
-   * @param featurePerLabelWeight weights of all features for that label
-   */
-  def train(label: Int, featurePerLabelWeight: Vector): Unit = {
-    val currentLabelWeight = labelWeight(label)
-    // visit every feature, including those with zero word counts for this label
-    for (i <- 0 until featurePerLabelWeight.size) {
-      val weight = ComplementaryNBClassifier.computeWeight(featureWeight(i),
-        featurePerLabelWeight(i),
-        totalWeightSum,
-        currentLabelWeight,
-        alphaI,
-        numFeatures)
-      updatePerLabelThetaNormalizer(label, weight)
-    }
-  }
-
-  /**
-   * getter for summed TF or TF-IDF weights by label
-   * @param label index of label
-   * @return sum of word TF or TF-IDF weights for label
-   */
-  def labelWeight(label: Int): Double = weightsPerLabel(label)
-
-  /**
-   * getter for summed TF or TF-IDF weights by word.
-   * @param feature index of word.
-   * @return sum of TF or TF-IDF weights for word.
-   */
-  def featureWeight(feature: Int): Double = weightsPerFeature(feature)
-
-  /**
-   * add the magnitude of the current weight to the current
-   * label's corresponding Vector element.
-   * @param label index of label to update.
-   * @param weight weight to add.
-   */
-  def updatePerLabelThetaNormalizer(label: Int, weight: Double): Unit = {
-    perLabelThetaNormalizer(label) = perLabelThetaNormalizer(label) + Math.abs(weight)
-  }
-
-  /**
-   * Getter for the weight normalizer vector as indexed by label
-   * @return a copy of the weight normalizer vector.
-   */
-  def retrievePerLabelThetaNormalizer: Vector = perLabelThetaNormalizer.cloned
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
deleted file mode 100644
index 8f1413a..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ClassifierStats.scala
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package org.apache.mahout.classifier.stats
-
-import java.text.{DecimalFormat, NumberFormat}
-import java.util
-import org.apache.mahout.math.stats.OnlineSummarizer
-
-
-/**
- * Outcome of classifying one document: the assigned label, its score (usually a
- * probability) and a log-likelihood that defaults to Integer.MAX_VALUE.
- */
-class ClassifierResult (private var label: String = null,
-                        private var score: Double = 0.0,
-                        private var logLikelihood: Double = Integer.MAX_VALUE.toDouble) {
-
-  def getLabel: String = label
-
-  def setLabel(lbl: String): Unit = {
-    label = lbl
-  }
-
-  def getScore: Double = score
-
-  def setScore(sc: Double): Unit = {
-    score = sc
-  }
-
-  def getLogLikelihood: Double = logLikelihood
-
-  def setLogLikelihood(llh: Double): Unit = {
-    logLikelihood = llh
-  }
-
-  /** Renders label and score only; the log-likelihood is not part of the text. */
-  override def toString: String = s"ClassifierResult{category='$label', score=$score}"
-
-}
-
-/**
- * ResultAnalyzer captures the classification statistics and displays in a tabular manner
- * @param labelSet Set of labels to be considered in classification
- * @param defaultLabel the default label for an unknown classification
- */
-class ResultAnalyzer(private val labelSet: util.Collection[String], defaultLabel: String) {
-
-  val confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel)
-  val summarizer = new OnlineSummarizer
-
-  // running tallies, updated by addInstance
-  private var correctlyClassified: Int = 0
-  private var incorrectlyClassified: Int = 0
-  // set once at least one result carried a log-likelihood other than the default
-  private var hasLL: Boolean = false
-
-  def getConfusionMatrix: ConfusionMatrix = confusionMatrix
-
-  /**
-   * Records one classified instance in the tallies, the confusion matrix and, when the
-   * result carries a log-likelihood other than Integer.MAX_VALUE, the summarizer.
-   *
-   * @param correctLabel
-   * The correct label
-   * @param classifiedResult
-   * The classified result
-   * @return whether the instance was correct or not
-   */
-  def addInstance(correctLabel: String, classifiedResult: ClassifierResult): Boolean = {
-    val isCorrect: Boolean = correctLabel == classifiedResult.getLabel
-    if (isCorrect) correctlyClassified += 1 else incorrectlyClassified += 1
-
-    confusionMatrix.addInstance(correctLabel, classifiedResult)
-
-    val llh = classifiedResult.getLogLikelihood
-    if (llh != Integer.MAX_VALUE.toDouble) {
-      summarizer.add(llh)
-      hasLL = true
-    }
-
-    isCorrect
-  }
-
-  /** Dump the resulting statistics to a string */
-  override def toString: String = {
-    val out: StringBuilder = new StringBuilder
-    val fmt: NumberFormat = new DecimalFormat("0.####")
-
-    val total: Int = correctlyClassified + incorrectlyClassified
-    val pctCorrect: Double = 100.0 * correctlyClassified / total
-    val pctIncorrect: Double = 100.0 * incorrectlyClassified / total
-
-    // --- summary section ---
-    out.append('\n')
-      .append("=======================================================\n")
-      .append("Summary\n")
-      .append("-------------------------------------------------------\n")
-    out.append("Correctly Classified Instances").append(": ")
-      .append(Integer.toString(correctlyClassified)).append('\t')
-      .append(fmt.format(pctCorrect)).append("%\n")
-    out.append("Incorrectly Classified Instances").append(": ")
-      .append(Integer.toString(incorrectlyClassified)).append('\t')
-      .append(fmt.format(pctIncorrect)).append("%\n")
-    out.append("Total Classified Instances").append(": ")
-      .append(Integer.toString(total)).append('\n')
-    out.append('\n')
-
-    // --- confusion matrix, rendered through its own toString ---
-    out.append(confusionMatrix)
-
-    // --- statistics section ---
-    out.append("=======================================================\n")
-      .append("Statistics\n")
-      .append("-------------------------------------------------------\n")
-    val normStats: RunningAverageAndStdDev = confusionMatrix.getNormalizedStats
-    out.append("Kappa: \t").append(fmt.format(confusionMatrix.getKappa)).append('\n')
-    out.append("Accuracy: \t").append(fmt.format(confusionMatrix.getAccuracy)).append("%\n")
-    out.append("Reliability: \t").append(fmt.format(normStats.getAverage * 100.00000001)).append("%\n")
-    out.append("Reliability (std dev): \t").append(fmt.format(normStats.getStandardDeviation)).append('\n')
-    out.append("Weighted precision: \t").append(fmt.format(confusionMatrix.getWeightedPrecision)).append('\n')
-    out.append("Weighted recall: \t").append(fmt.format(confusionMatrix.getWeightedRecall)).append('\n')
-    out.append("Weighted F1 score: \t").append(fmt.format(confusionMatrix.getWeightedF1score)).append('\n')
-
-    // log-likelihood lines appear only when at least one was recorded
-    if (hasLL) {
-      out.append("Log-likelihood: \t").append("mean : \t")
-        .append(fmt.format(summarizer.getMean)).append('\n')
-      out.append("25%-ile : \t").append(fmt.format(summarizer.getQuartile(1))).append('\n')
-      out.append("75%-ile : \t").append(fmt.format(summarizer.getQuartile(3))).append('\n')
-    }
-
-    out.toString()
-  }
-
-}
-
-/**
- * Contract for objects that track the running average of a series of numbers without
- * storing the series itself. Values can be added, removed or adjusted; because only the
- * average is kept, it does not matter whether a removed or changed value was ever added.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.RunningAverage.java
- */
-trait RunningAverage {
-
-  /**
-   * @param datum
-   * new item to add to the running average
-   * @throws IllegalArgumentException
-   * if datum is [[Double.NaN]] (contract carried over from the Java original)
-   */
-  def addDatum(datum: Double): Unit
-
-  /**
-   * @param datum
-   * item to remove from the running average
-   * @throws IllegalArgumentException
-   * if datum is [[Double.NaN]] (contract carried over from the Java original)
-   * @throws IllegalStateException
-   * if count is 0
-   */
-  def removeDatum(datum: Double): Unit
-
-  /**
-   * @param delta
-   * amount by which to change a datum in the running average
-   * @throws IllegalArgumentException
-   * if delta is [[Double.NaN]] (contract carried over from the Java original)
-   * @throws IllegalStateException
-   * if count is 0
-   */
-  def changeDatum(delta: Double): Unit
-
-  /** @return the count reported by the implementation */
-  def getCount: Int
-
-  /** @return the current running average */
-  def getAverage: Double
-
-  /**
-   * @return a (possibly immutable) object whose average is the negative of this object's
-   */
-  def inverse: RunningAverage
-}
-
-/**
- * A [[RunningAverage]] that additionally reports a standard deviation.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev.java
- */
-trait RunningAverageAndStdDev extends RunningAverage {
-
-  /** @return standard deviation of data */
-  def getStandardDeviation: Double
-
-  /**
-   * Narrows the result type of [[RunningAverage.inverse]].
-   *
-   * @return a (possibly immutable) object whose average is the negative of this object's
-   */
-  def inverse: RunningAverageAndStdDev
-}
-
-
-/**
- * Read-only view of another [[RunningAverage]] that reports the negated average.
- * All mutators throw UnsupportedOperationException; `inverse` hands back the delegate.
- */
-class InvertedRunningAverage(private val delegate: RunningAverage) extends RunningAverage {
-
-  override def addDatum(datum: Double): Unit = throw new UnsupportedOperationException
-
-  override def removeDatum(datum: Double): Unit = throw new UnsupportedOperationException
-
-  override def changeDatum(delta: Double): Unit = throw new UnsupportedOperationException
-
-  override def getCount: Int = delegate.getCount
-
-  override def getAverage: Double = -delegate.getAverage
-
-  override def inverse: RunningAverage = delegate
-}
-
-
-/**
- * Keeps the running average of a series of numbers without storing the series: only the
- * element count and the current average are held. Values can be added, removed or
- * adjusted; it does not matter whether a removed or changed value was ever added.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.FullRunningAverage.java
- */
-class FullRunningAverage(private var count: Int = 0,
-                         private var average: Double = Double.NaN ) extends RunningAverage {
-
-  /**
-   * @param datum
-   * new item to add to the running average
-   */
-  override def addDatum(datum: Double): Unit = {
-    count += 1
-    // the first datum is the average; afterwards rescale the old average and add the new share
-    average = if (count == 1) datum else average * (count - 1) / count + datum / count
-  }
-
-  /**
-   * @param datum
-   * item to remove from the running average
-   * @throws IllegalStateException
-   * if count is 0
-   */
-  override def removeDatum(datum: Double): Unit = {
-    if (count == 0) throw new IllegalStateException
-    count -= 1
-    // an empty series has no average
-    average = if (count == 0) Double.NaN else average * (count + 1) / count - datum / count
-  }
-
-  /**
-   * @param delta
-   * amount by which to change a datum in the running average
-   * @throws IllegalStateException
-   * if count is 0
-   */
-  override def changeDatum(delta: Double): Unit = {
-    if (count == 0) throw new IllegalStateException
-    average += delta / count
-  }
-
-  override def getCount: Int = count
-
-  override def getAverage: Double = average
-
-  override def inverse: RunningAverage = new InvertedRunningAverage(this)
-
-  override def toString: String = average.toString
-}
-
-
-/**
- * Extends [[FullRunningAverage]] to add a running standard deviation computation.
- * Uses Welford's method, as described at http://www.johndcook.com/standard_deviation.html
- *
- * `count` and `average` are forwarded to [[FullRunningAverage]]. They used to be stored in
- * private fields of this class only, so `getCount`, `getAverage` and the initial standard
- * deviation ignored them. An empty series (count == 0) still reports an average of NaN.
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev.java
- *
- * @param count number of data already represented by `average`, `mk` and `sk`
- * @param average running average of those data; ignored when `count` is 0
- * @param mk Welford running mean term
- * @param sk Welford running sum of squared differences
- */
-class FullRunningAverageAndStdDev(count: Int = 0,
-                                  average: Double = 0.0,
-                                  private var mk: Double = 0.0,
-                                  private var sk: Double = 0.0)
-  extends FullRunningAverage(count, if (count == 0) Double.NaN else average) with RunningAverageAndStdDev {
-
-  var stdDev: Double = 0.0
-
-  // derive the initial deviation from the supplied count and sk
-  recomputeStdDev()
-
-  def getMk: Double = mk
-
-  def getSk: Double = sk
-
-  override def getStandardDeviation: Double = stdDev
-
-  /** Adds a datum to the average, then updates mk and sk with Welford's recurrence. */
-  override def addDatum(datum: Double): Unit = {
-    super.addDatum(datum)
-    val newCount: Int = getCount
-    if (newCount == 1) {
-      mk = datum
-      sk = 0.0
-    }
-    else {
-      val oldmk: Double = mk
-      val diff: Double = datum - oldmk
-      mk += diff / newCount
-      sk += diff * (datum - mk)
-    }
-    recomputeStdDev()
-  }
-
-  /**
-   * Removes a datum from the average and reverses its contribution to mk and sk.
-   * Removing the last datum divides by zero in floating point, which leaves mk and sk
-   * non-finite until the next addDatum resets them.
-   *
-   * @throws IllegalStateException if count is 0 (raised by the superclass)
-   */
-  override def removeDatum(datum: Double): Unit = {
-    val oldCount: Int = getCount
-    super.removeDatum(datum)
-    val oldmk: Double = mk
-    mk = (oldCount * oldmk - datum) / (oldCount - 1)
-    sk -= (datum - mk) * (datum - oldmk)
-    recomputeStdDev()
-  }
-
-  /**
-   * @throws UnsupportedOperationException
-   */
-  override def changeDatum(delta: Double): Unit = throw new UnsupportedOperationException
-
-  // sample standard deviation; undefined (NaN) for fewer than two data
-  private def recomputeStdDev(): Unit = {
-    val n: Int = getCount
-    stdDev = if (n > 1) Math.sqrt(sk / (n - 1)) else Double.NaN
-  }
-
-  override def inverse: RunningAverageAndStdDev = new InvertedRunningAverageAndStdDev(this)
-
-  override def toString: String = String.valueOf(getAverage) + ',' + stdDev
-
-}
-
-
-/**
- * Read-only view of a [[RunningAverageAndStdDev]] that reports the negated average and the
- * delegate's standard deviation unchanged. All mutators throw
- * UnsupportedOperationException; `inverse` hands back the delegate.
- *
- * @param delegate RunningAverageAndStdDev instance
- *
- * Ported from org.apache.mahout.cf.taste.impl.common.InvertedRunningAverageAndStdDev.java
- */
-class InvertedRunningAverageAndStdDev(private val delegate: RunningAverageAndStdDev) extends RunningAverageAndStdDev {
-
-  /**
-   * @throws UnsupportedOperationException
-   */
-  override def addDatum(datum: Double): Unit = throw new UnsupportedOperationException
-
-  /**
-   * @throws UnsupportedOperationException
-   */
-  override def removeDatum(datum: Double): Unit = throw new UnsupportedOperationException
-
-  /**
-   * @throws UnsupportedOperationException
-   */
-  override def changeDatum(delta: Double): Unit = throw new UnsupportedOperationException
-
-  override def getCount: Int = delegate.getCount
-
-  override def getAverage: Double = -delegate.getAverage
-
-  override def getStandardDeviation: Double = delegate.getStandardDeviation
-
-  override def inverse: RunningAverageAndStdDev = delegate
-}
-
-
-
-
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala b/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
deleted file mode 100644
index 328d27b..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/classifier/stats/ConfusionMatrix.scala
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package org.apache.mahout.classifier.stats
-
-import java.util
-import org.apache.commons.math3.stat.descriptive.moment.Mean // This is brought in by mahout-math
-import org.apache.mahout.math.{DenseMatrix, Matrix}
-import scala.collection.mutable
-import scala.collection.JavaConversions._
-
-/**
- *
- * Ported from org.apache.mahout.classifier.ConfusionMatrix.java
- *
- * The ConfusionMatrix Class stores the result of Classification of a Test Dataset.
- *
- * The fact of whether there is a default is not stored. A row of zeros is the only indicator that there is no default.
- *
- * See http://en.wikipedia.org/wiki/Confusion_matrix for background
- *
- *
- * @param labels The labels to consider for classification
- * @param defaultLabel default unknown label
- */
-class ConfusionMatrix(private var labels: util.Collection[String] = null,
- private var defaultLabel: String = "unknown") {
- /**
- * Matrix Constructor
- * @param m a DenseMatrix with RowLabelBindings
- */
-// def this(m: Matrix) {
-// this()
-// confusionMatrix = Array.ofDim[Int](m.numRows, m.numRows)
-// setMatrix(m)
-// }
-
- // val LOG: Logger = LoggerFactory.getLogger(classOf[ConfusionMatrix])
-
- var confusionMatrix = Array.ofDim[Int](labels.size + 1, labels.size + 1)
-
- val labelMap = new mutable.HashMap[String,Integer]()
-
- var samples: Int = 0
-
- var i: Integer = 0
- for (label <- labels) {
- labelMap.put(label, i)
- i+=1
- }
- labelMap.put(defaultLabel, i)
-
-
- def getConfusionMatrix: Array[Array[Int]] = confusionMatrix
-
- def getLabels = labelMap.keys.toList
-
- def numLabels: Int = labelMap.size
-
- def getAccuracy(label: String): Double = {
- val labelId: Int = labelMap(label)
- var labelTotal: Int = 0
- var correct: Int = 0
- for (i <- 0 until numLabels) {
- labelTotal += confusionMatrix(labelId)(i)
- if (i == labelId) {
- correct += confusionMatrix(labelId)(i)
- }
- }
-
- 100.0 * correct / labelTotal
- }
-
- def getAccuracy: Double = {
- var total: Int = 0
- var correct: Int = 0
- for (i <- 0 until numLabels) {
- for (j <- 0 until numLabels) {
- total += confusionMatrix(i)(j)
- if (i == j) {
- correct += confusionMatrix(i)(j)
- }
- }
- }
-
- 100.0 * correct / total
- }
-
- /** Sum of true positives and false negatives */
- private def getActualNumberOfTestExamplesForClass(label: String): Int = {
- val labelId: Int = labelMap(label)
- var sum: Int = 0
- for (i <- 0 until numLabels) {
- sum += confusionMatrix(labelId)(i)
- }
- sum
- }
-
- def getPrecision(label: String): Double = {
- val labelId: Int = labelMap(label)
- val truePositives: Int = confusionMatrix(labelId)(labelId)
- var falsePositives: Int = 0
-
- for (i <- 0 until numLabels) {
- if (i != labelId) {
- falsePositives += confusionMatrix(i)(labelId)
- }
- }
-
- if (truePositives + falsePositives == 0) {
- 0
- } else {
- (truePositives.asInstanceOf[Double]) / (truePositives + falsePositives)
- }
- }
-
-
- def getWeightedPrecision: Double = {
- val precisions: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- precisions(index) = getPrecision(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(precisions, weights)
- }
-
- def getRecall(label: String): Double = {
- val labelId: Int = labelMap(label)
- val truePositives: Int = confusionMatrix(labelId)(labelId)
- var falseNegatives: Int = 0
- for (i <- 0 until numLabels) {
- if (i != labelId) {
- falseNegatives += confusionMatrix(labelId)(i)
- }
- }
-
- if (truePositives + falseNegatives == 0) {
- 0
- } else {
- (truePositives.asInstanceOf[Double]) / (truePositives + falseNegatives)
- }
- }
-
- def getWeightedRecall: Double = {
- val recalls: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- recalls(index) = getRecall(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(recalls, weights)
- }
-
- def getF1score(label: String): Double = {
- val precision: Double = getPrecision(label)
- val recall: Double = getRecall(label)
- if (precision + recall == 0) {
- 0
- } else {
- 2 * precision * recall / (precision + recall)
- }
- }
-
- def getWeightedF1score: Double = {
- val f1Scores: Array[Double] = new Array[Double](numLabels)
- val weights: Array[Double] = new Array[Double](numLabels)
- var index: Int = 0
- for (label <- labelMap.keys) {
- f1Scores(index) = getF1score(label)
- weights(index) = getActualNumberOfTestExamplesForClass(label)
- index += 1
- }
- new Mean().evaluate(f1Scores, weights)
- }
-
- def getReliability: Double = {
- var count: Int = 0
- var accuracy: Double = 0
- for (label <- labelMap.keys) {
- if (!(label == defaultLabel)) {
- accuracy += getAccuracy(label)
- }
- count += 1
- }
- accuracy / count
- }
-
- /**
- * Accuracy v.s. randomly classifying all samples.
- * kappa() = (totalAccuracy() - randomAccuracy()) / (1 - randomAccuracy())
- * Cohen, Jacob. 1960. A coefficient of agreement for nominal scales.
- * Educational And Psychological Measurement 20:37-46.
- *
- * Formula and variable names from:
- * http://www.yale.edu/ceo/OEFS/Accuracy.pdf
- *
- * @return double
- */
- def getKappa: Double = {
- var a: Double = 0.0
- var b: Double = 0.0
- for (i <- 0 until confusionMatrix.length) {
- a += confusionMatrix(i)(i)
- var br: Int = 0
- for (j <- 0 until confusionMatrix.length) {
- br += confusionMatrix(i)(j)
- }
- var bc: Int = 0
- //TODO: verify this as an iterator
- for (vec <- confusionMatrix) {
- bc += vec(i)
- }
- b += br * bc
- }
- (samples * a - b) / (samples * samples - b)
- }
-
- def getCorrect(label: String): Int = {
- val labelId: Int = labelMap(label)
- confusionMatrix(labelId)(labelId)
- }
-
- def getTotal(label: String): Int = {
- val labelId: Int = labelMap(label)
- var labelTotal: Int = 0
- for (i <- 0 until numLabels) {
- labelTotal += confusionMatrix(labelId)(i)
- }
- labelTotal
- }
-
- /**
- * Standard deviation of normalized producer accuracy
- * Not a standard score
- * @return double
- */
- def getNormalizedStats: RunningAverageAndStdDev = {
- val summer = new FullRunningAverageAndStdDev()
- for (d <- 0 until confusionMatrix.length) {
- var total: Double = 0.0
- for (j <- 0 until confusionMatrix.length) {
- total += confusionMatrix(d)(j)
- }
- summer.addDatum(confusionMatrix(d)(d) / (total + 0.000001))
- }
- summer
- }
-
- def addInstance(correctLabel: String, classifiedResult: ClassifierResult): Unit = {
- samples += 1
- incrementCount(correctLabel, classifiedResult.getLabel)
- }
-
- def addInstance(correctLabel: String, classifiedLabel: String): Unit = {
- samples += 1
- incrementCount(correctLabel, classifiedLabel)
- }
-
- def getCount(correctLabel: String, classifiedLabel: String): Int = {
- if (!labelMap.containsKey(correctLabel)) {
- // LOG.warn("Label {} did not appear in the training examples", correctLabel)
- return 0
- }
- assert(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel)
- val correctId: Int = labelMap(correctLabel)
- val classifiedId: Int = labelMap(classifiedLabel)
- confusionMatrix(correctId)(classifiedId)
- }
-
- def putCount(correctLabel: String, classifiedLabel: String, count: Int): Unit = {
- if (!labelMap.containsKey(correctLabel)) {
- // LOG.warn("Label {} did not appear in the training examples", correctLabel)
- return
- }
- assert(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel)
- val correctId: Int = labelMap(correctLabel)
- val classifiedId: Int = labelMap(classifiedLabel)
- if (confusionMatrix(correctId)(classifiedId) == 0.0 && count != 0) {
- samples += 1
- }
- confusionMatrix(correctId)(classifiedId) = count
- }
-
- def incrementCount(correctLabel: String, classifiedLabel: String, count: Int): Unit = {
- putCount(correctLabel, classifiedLabel, count + getCount(correctLabel, classifiedLabel))
- }
-
- def incrementCount(correctLabel: String, classifiedLabel: String): Unit = {
- incrementCount(correctLabel, classifiedLabel, 1)
- }
-
- def getDefaultLabel: String = {
- defaultLabel
- }
-
- def merge(b: ConfusionMatrix): ConfusionMatrix = {
- assert(labelMap.size == b.getLabels.size, "The label sizes do not match")
- for (correctLabel <- this.labelMap.keys) {
- for (classifiedLabel <- this.labelMap.keys) {
- incrementCount(correctLabel, classifiedLabel, b.getCount(correctLabel, classifiedLabel))
- }
- }
- this
- }
-
- def getMatrix: Matrix = {
- val length: Int = confusionMatrix.length
- val m: Matrix = new DenseMatrix(length, length)
-
- val labels: java.util.HashMap[String, Integer] = new java.util.HashMap()
-
- for (r <- 0 until length) {
- for (c <- 0 until length) {
- m.set(r, c, confusionMatrix(r)(c))
- }
- }
-
- for (entry <- labelMap.entrySet) {
- labels.put(entry.getKey, entry.getValue)
- }
- m.setRowLabelBindings(labels)
- m.setColumnLabelBindings(labels)
-
- m
- }
-
- def setMatrix(m: Matrix) : Unit = {
- val length: Int = confusionMatrix.length
- if (m.numRows != m.numCols) {
- throw new IllegalArgumentException("ConfusionMatrix: matrix(" + m.numRows + ',' + m.numCols + ") must be square")
- }
-
- for (r <- 0 until length) {
- for (c <- 0 until length) {
- confusionMatrix(r)(c) = Math.round(m.get(r, c)).toInt
- }
- }
-
- var labels = m.getRowLabelBindings
- if (labels == null) {
- labels = m.getColumnLabelBindings
- }
-
- if (labels != null) {
- val sorted: Array[String] = sortLabels(labels)
- verifyLabels(length, sorted)
- labelMap.clear
- for (i <- 0 until length) {
- labelMap.put(sorted(i), i)
- }
- }
- }
-
- def verifyLabels(length: Int, sorted: Array[String]): Unit = {
- assert(sorted.length == length, "One label, one row")
- for (i <- 0 until length) {
- if (sorted(i) == null) {
- assert(false, "One label, one row")
- }
- }
- }
-
- def sortLabels(labels: java.util.Map[String, Integer]): Array[String] = {
- val sorted: Array[String] = new Array[String](labels.size)
- for (entry <- labels.entrySet) {
- sorted(entry.getValue) = entry.getKey
- }
-
- sorted
- }
-
- /**
- * This is overloaded. toString() is not a formatted report you print for a manager :)
- * Assume that if there are no default assignments, the default feature was not used
- */
- override def toString: String = {
-
- val returnString: StringBuilder = new StringBuilder(200)
-
- returnString.append("=======================================================").append('\n')
- returnString.append("Confusion Matrix\n")
- returnString.append("-------------------------------------------------------").append('\n')
-
- val unclassified: Int = getTotal(defaultLabel)
-
- for (entry <- this.labelMap.entrySet) {
- if (!((entry.getKey == defaultLabel) && unclassified == 0)) {
- returnString.append(getSmallLabel(entry.getValue) + " ").append('\t')
- }
- }
-
- returnString.append("<--Classified as").append('\n')
-
- for (entry <- this.labelMap.entrySet) {
- if (!((entry.getKey == defaultLabel) && unclassified == 0)) {
- val correctLabel: String = entry.getKey
- var labelTotal: Int = 0
-
- for (classifiedLabel <- this.labelMap.keySet) {
- if (!((classifiedLabel == defaultLabel) && unclassified == 0)) {
- returnString.append(Integer.toString(getCount(correctLabel, classifiedLabel)) + " ")
- .append('\t')
- labelTotal += getCount(correctLabel, classifiedLabel)
- }
- }
- returnString.append(" | ").append(String.valueOf(labelTotal) + " ")
- .append('\t')
- .append(getSmallLabel(entry.getValue) + " ")
- .append(" = ")
- .append(correctLabel)
- .append('\n')
- }
- }
-
- if (unclassified > 0) {
- returnString.append("Default Category: ")
- .append(defaultLabel)
- .append(": ")
- .append(unclassified)
- .append('\n')
- }
- returnString.append('\n')
-
- returnString.toString()
- }
-
-
- def getSmallLabel(i: Int): String = {
- var value: Int = i
- val returnString: StringBuilder = new StringBuilder
- do {
- val n: Int = value % 26
- returnString.insert(0, ('a' + n).asInstanceOf[Char])
- value /= 26
- } while (value > 0)
-
- returnString.toString()
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala b/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
deleted file mode 100644
index 32515f1..0000000
--- a/math-scala/src/main/scala/org/apache/mahout/drivers/MahoutDriver.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.drivers
-
-import org.apache.mahout.math.drm.DistributedContext
-
-/** Extended by a platform specific version of this class to create a Mahout CLI driver. */
-abstract class MahoutDriver {
-
- implicit protected var mc: DistributedContext = _
- implicit protected var parser: MahoutOptionParser = _
-
- var _useExistingContext: Boolean = false // used in the test suite to reuse one context per suite
-
- /** must be overriden to setup the DistributedContext mc*/
- protected def start() : Unit
-
- /** Override (optionally) for special cleanup */
- protected def stop(): Unit = {
- if (!_useExistingContext) mc.close
- }
-
- /** This is where you do the work, call start first, then before exiting call stop */
- protected def process(): Unit
-
- /** Parse command line and call process */
- def main(args: Array[String]): Unit
-
-}