You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2018/04/06 17:10:04 UTC

spark git commit: [SPARK-23859][ML] Initial PR for Instrumentation improvements: UUID and logging levels

Repository: spark
Updated Branches:
  refs/heads/master c926acf71 -> d23a805f9


[SPARK-23859][ML] Initial PR for Instrumentation improvements: UUID and logging levels

## What changes were proposed in this pull request?

Initial PR for Instrumentation improvements: UUID and logging levels.
This PR takes over #20837

Closes #20837

## How was this patch tested?

Manual.

Author: Bago Amirbekian <ba...@databricks.com>
Author: WeichenXu <we...@databricks.com>

Closes #20982 from WeichenXu123/better-instrumentation.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d23a805f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d23a805f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d23a805f

Branch: refs/heads/master
Commit: d23a805f975f209f273db2b52de3f336be17d873
Parents: c926acf
Author: Bago Amirbekian <ba...@databricks.com>
Authored: Fri Apr 6 10:09:55 2018 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Fri Apr 6 10:09:55 2018 -0700

----------------------------------------------------------------------
 .../ml/classification/LogisticRegression.scala  | 15 +++++---
 .../apache/spark/ml/util/Instrumentation.scala  | 40 ++++++++++++++++----
 2 files changed, 41 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d23a805f/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3ae4db3..ee4b010 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -517,6 +517,9 @@ class LogisticRegression @Since("1.2.0") (
         (new MultivariateOnlineSummarizer, new MultiClassSummarizer)
       )(seqOp, combOp, $(aggregationDepth))
     }
+    instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
+    instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+    instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
 
     val histogram = labelSummarizer.histogram
     val numInvalid = labelSummarizer.countInvalid
@@ -560,15 +563,15 @@ class LogisticRegression @Since("1.2.0") (
       if (numInvalid != 0) {
         val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " +
           s"Found $numInvalid invalid labels."
-        logError(msg)
+        instr.logError(msg)
         throw new SparkException(msg)
       }
 
       val isConstantLabel = histogram.count(_ != 0.0) == 1
 
       if ($(fitIntercept) && isConstantLabel && !usingBoundConstrainedOptimization) {
-        logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
-          s"will be zeros. Training is not needed.")
+        instr.logWarning(s"All labels are the same value and fitIntercept=true, so the " +
+          s"coefficients will be zeros. Training is not needed.")
         val constantLabelIndex = Vectors.dense(histogram).argmax
         val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
           new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
@@ -581,7 +584,7 @@ class LogisticRegression @Since("1.2.0") (
         (coefMatrix, interceptVec, Array.empty[Double])
       } else {
         if (!$(fitIntercept) && isConstantLabel) {
-          logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
+          instr.logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
             s"dangerous ground, so the algorithm may not converge.")
         }
 
@@ -590,7 +593,7 @@ class LogisticRegression @Since("1.2.0") (
 
         if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
           featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
-          logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
+          instr.logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
             "constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
             "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
         }
@@ -708,7 +711,7 @@ class LogisticRegression @Since("1.2.0") (
               (_initialModel.interceptVector.size == numCoefficientSets) &&
               (_initialModel.getFitIntercept == $(fitIntercept))
             if (!modelIsValid) {
-              logWarning(s"Initial coefficients will be ignored! Its dimensions " +
+              instr.logWarning(s"Initial coefficients will be ignored! Its dimensions " +
                 s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " +
                 s"expected size ($numCoefficientSets, $numFeatures)")
             }

http://git-wip-us.apache.org/repos/asf/spark/blob/d23a805f/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index 7c46f45..e694bc2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.util
 
-import java.util.concurrent.atomic.AtomicLong
+import java.util.UUID
 
 import org.json4s._
 import org.json4s.JsonDSL._
@@ -42,7 +42,7 @@ import org.apache.spark.sql.Dataset
 private[spark] class Instrumentation[E <: Estimator[_]] private (
     estimator: E, dataset: RDD[_]) extends Logging {
 
-  private val id = Instrumentation.counter.incrementAndGet()
+  private val id = UUID.randomUUID()
   private val prefix = {
     val className = estimator.getClass.getSimpleName
     s"$className-${estimator.uid}-${dataset.hashCode()}-$id: "
@@ -56,13 +56,32 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   /**
-   * Logs a message with a prefix that uniquely identifies the training session.
+   * Logs a warning message with a prefix that uniquely identifies the training session.
    */
-  def log(msg: String): Unit = {
-    logInfo(prefix + msg)
+  override def logWarning(msg: => String): Unit = {
+    super.logWarning(prefix + msg)
   }
 
   /**
+   * Logs a error message with a prefix that uniquely identifies the training session.
+   */
+  override def logError(msg: => String): Unit = {
+    super.logError(prefix + msg)
+  }
+
+  /**
+   * Logs an info message with a prefix that uniquely identifies the training session.
+   */
+  override def logInfo(msg: => String): Unit = {
+    super.logInfo(prefix + msg)
+  }
+
+  /**
+   * Alias for logInfo, see above.
+   */
+  def log(msg: String): Unit = logInfo(msg)
+
+  /**
    * Logs the value of the given parameters for the estimator being used in this session.
    */
   def logParams(params: Param[_]*): Unit = {
@@ -77,11 +96,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   def logNumFeatures(num: Long): Unit = {
-    log(compact(render("numFeatures" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numFeatures, num)
   }
 
   def logNumClasses(num: Long): Unit = {
-    log(compact(render("numClasses" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numClasses, num)
   }
 
   /**
@@ -107,7 +126,12 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
  * Some common methods for logging information about a training session.
  */
 private[spark] object Instrumentation {
-  private val counter = new AtomicLong(0)
+
+  object loggerTags {
+    val numFeatures = "numFeatures"
+    val numClasses = "numClasses"
+    val numExamples = "numExamples"
+  }
 
   /**
    * Creates an instrumentation object for a training session.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org