You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2018/04/06 17:10:04 UTC
spark git commit: [SPARK-23859][ML] Initial PR for Instrumentation
improvements: UUID and logging levels
Repository: spark
Updated Branches:
refs/heads/master c926acf71 -> d23a805f9
[SPARK-23859][ML] Initial PR for Instrumentation improvements: UUID and logging levels
## What changes were proposed in this pull request?
Initial PR for Instrumentation improvements: UUID and logging levels.
This PR takes over #20837
Closes #20837
## How was this patch tested?
Manual.
Author: Bago Amirbekian <ba...@databricks.com>
Author: WeichenXu <we...@databricks.com>
Closes #20982 from WeichenXu123/better-instrumentation.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d23a805f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d23a805f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d23a805f
Branch: refs/heads/master
Commit: d23a805f975f209f273db2b52de3f336be17d873
Parents: c926acf
Author: Bago Amirbekian <ba...@databricks.com>
Authored: Fri Apr 6 10:09:55 2018 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Fri Apr 6 10:09:55 2018 -0700
----------------------------------------------------------------------
.../ml/classification/LogisticRegression.scala | 15 +++++---
.../apache/spark/ml/util/Instrumentation.scala | 40 ++++++++++++++++----
2 files changed, 41 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/d23a805f/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3ae4db3..ee4b010 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -517,6 +517,9 @@ class LogisticRegression @Since("1.2.0") (
(new MultivariateOnlineSummarizer, new MultiClassSummarizer)
)(seqOp, combOp, $(aggregationDepth))
}
+ instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
+ instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+ instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
val histogram = labelSummarizer.histogram
val numInvalid = labelSummarizer.countInvalid
@@ -560,15 +563,15 @@ class LogisticRegression @Since("1.2.0") (
if (numInvalid != 0) {
val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " +
s"Found $numInvalid invalid labels."
- logError(msg)
+ instr.logError(msg)
throw new SparkException(msg)
}
val isConstantLabel = histogram.count(_ != 0.0) == 1
if ($(fitIntercept) && isConstantLabel && !usingBoundConstrainedOptimization) {
- logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
- s"will be zeros. Training is not needed.")
+ instr.logWarning(s"All labels are the same value and fitIntercept=true, so the " +
+ s"coefficients will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
@@ -581,7 +584,7 @@ class LogisticRegression @Since("1.2.0") (
(coefMatrix, interceptVec, Array.empty[Double])
} else {
if (!$(fitIntercept) && isConstantLabel) {
- logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
+ instr.logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
s"dangerous ground, so the algorithm may not converge.")
}
@@ -590,7 +593,7 @@ class LogisticRegression @Since("1.2.0") (
if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
- logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
+ instr.logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
"constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
"nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
}
@@ -708,7 +711,7 @@ class LogisticRegression @Since("1.2.0") (
(_initialModel.interceptVector.size == numCoefficientSets) &&
(_initialModel.getFitIntercept == $(fitIntercept))
if (!modelIsValid) {
- logWarning(s"Initial coefficients will be ignored! Its dimensions " +
+ instr.logWarning(s"Initial coefficients will be ignored! Its dimensions " +
s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " +
s"expected size ($numCoefficientSets, $numFeatures)")
}
http://git-wip-us.apache.org/repos/asf/spark/blob/d23a805f/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index 7c46f45..e694bc2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -17,7 +17,7 @@
package org.apache.spark.ml.util
-import java.util.concurrent.atomic.AtomicLong
+import java.util.UUID
import org.json4s._
import org.json4s.JsonDSL._
@@ -42,7 +42,7 @@ import org.apache.spark.sql.Dataset
private[spark] class Instrumentation[E <: Estimator[_]] private (
estimator: E, dataset: RDD[_]) extends Logging {
- private val id = Instrumentation.counter.incrementAndGet()
+ private val id = UUID.randomUUID()
private val prefix = {
val className = estimator.getClass.getSimpleName
s"$className-${estimator.uid}-${dataset.hashCode()}-$id: "
@@ -56,13 +56,32 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
}
/**
- * Logs a message with a prefix that uniquely identifies the training session.
+ * Logs a warning message with a prefix that uniquely identifies the training session.
*/
- def log(msg: String): Unit = {
- logInfo(prefix + msg)
+ override def logWarning(msg: => String): Unit = {
+ super.logWarning(prefix + msg)
}
/**
+ * Logs an error message with a prefix that uniquely identifies the training session.
+ */
+ override def logError(msg: => String): Unit = {
+ super.logError(prefix + msg)
+ }
+
+ /**
+ * Logs an info message with a prefix that uniquely identifies the training session.
+ */
+ override def logInfo(msg: => String): Unit = {
+ super.logInfo(prefix + msg)
+ }
+
+ /**
+ * Alias for logInfo, see above.
+ */
+ def log(msg: String): Unit = logInfo(msg)
+
+ /**
* Logs the value of the given parameters for the estimator being used in this session.
*/
def logParams(params: Param[_]*): Unit = {
@@ -77,11 +96,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
}
def logNumFeatures(num: Long): Unit = {
- log(compact(render("numFeatures" -> num)))
+ logNamedValue(Instrumentation.loggerTags.numFeatures, num)
}
def logNumClasses(num: Long): Unit = {
- log(compact(render("numClasses" -> num)))
+ logNamedValue(Instrumentation.loggerTags.numClasses, num)
}
/**
@@ -107,7 +126,12 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
* Some common methods for logging information about a training session.
*/
private[spark] object Instrumentation {
- private val counter = new AtomicLong(0)
+
+ object loggerTags {
+ val numFeatures = "numFeatures"
+ val numClasses = "numClasses"
+ val numExamples = "numExamples"
+ }
/**
* Creates an instrumentation object for a training session.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org