You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Peng Meng (JIRA)" <ji...@apache.org> on 2017/08/04 08:45:00 UTC
[jira] [Updated] (SPARK-21638) Warning message of RF is not
accurate
[ https://issues.apache.org/jira/browse/SPARK-21638?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Peng Meng updated SPARK-21638:
------------------------------
Description:
When training an RF (random forest) model, many warning messages like this are logged:
{quote}WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.{quote}
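This warning message is unnecessary, and the numbers it reports are not accurate.
This is because numNodesInGroup and memUsage are increased even when the node is not added to the group: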
{code:java}
while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) {
val (treeIndex, node) = nodeStack.top
// Choose subset of features for node (if subsampling).
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
Some(SamplingUtils.reservoirSampleAndCount(Range(0,
metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
} else {
None
}
// Check if enough memory remains to add this node to the group.
val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
nodeStack.pop()
mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
node
mutableTreeToNodeToIndexInfo
.getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
= new NodeIndexInfo(numNodesInGroup, featureSubset)
}
numNodesInGroup += 1 // NOTE: the node may not have been added to mutableNodesForGroup above, but numNodesInGroup and memUsage are still increased here.
memUsage += nodeMemUsage
}
if (memUsage > maxMemoryUsage) {
// If maxMemoryUsage is 0, we should still allow splitting 1 node.
logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" +
s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This allows splitting" +
s" $numNodesInGroup nodes in this iteration.")
}
{code}
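To avoid this unnecessary warning, the loop should stop as soon as the next node does not fit, and should update numNodesInGroup and memUsage only when a node is actually added to the group: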
{code:java}
while (nodeStack.nonEmpty) {
val (treeIndex, node) = nodeStack.top
// Choose subset of features for node (if subsampling).
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
Some(SamplingUtils.reservoirSampleAndCount(Range(0,
metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
} else {
None
}
// Check if enough memory remains to add this node to the group.
val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
nodeStack.pop()
mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
node
mutableTreeToNodeToIndexInfo
.getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
= new NodeIndexInfo(numNodesInGroup, featureSubset)
numNodesInGroup += 1 // numNodesInGroup and memUsage are now updated only when the node is actually added to mutableNodesForGroup.
memUsage += nodeMemUsage
} else {
break
}
}
{code}
was:
When train RF model, there is many warning message like this:
{quote}WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.{quote}
This warning message is unnecessary and the data is not accuracy.
This is because
{code:java}
while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) {
val (treeIndex, node) = nodeStack.top
// Choose subset of features for node (if subsampling).
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
Some(SamplingUtils.reservoirSampleAndCount(Range(0,
metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
} else {
None
}
// Check if enough memory remains to add this node to the group.
val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
nodeStack.pop()
mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
node
mutableTreeToNodeToIndexInfo
.getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
= new NodeIndexInfo(numNodesInGroup, featureSubset)
}
numNodesInGroup += 1 *//we not add the node to mutableNodesForGroup, but we add memUsage here.*
memUsage += nodeMemUsage
}
if (memUsage > maxMemoryUsage) {
// If maxMemoryUsage is 0, we should still allow splitting 1 node.
logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" +
s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This allows splitting" +
s" $numNodesInGroup nodes in this iteration.")
}
{code}
To avoid this unnecessary warning, we should change the code like this:
{code:java}
while (nodeStack.nonEmpty) {
val (treeIndex, node) = nodeStack.top
// Choose subset of features for node (if subsampling).
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
Some(SamplingUtils.reservoirSampleAndCount(Range(0,
metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
} else {
None
}
// Check if enough memory remains to add this node to the group.
val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
nodeStack.pop()
mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
node
mutableTreeToNodeToIndexInfo
.getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
= new NodeIndexInfo(numNodesInGroup, featureSubset)
numNodesInGroup += 1 //we not add the node to mutableNodesForGroup, but we add memUsage here.
memUsage += nodeMemUsage
} else {
break
}
}
{code}
> Warning message of RF is not accurate
> -------------------------------------
>
> Key: SPARK-21638
> URL: https://issues.apache.org/jira/browse/SPARK-21638
> Project: Spark
> Issue Type: Bug
> Components: ML
> Affects Versions: 2.3.0
> Environment:
> Reporter: Peng Meng
> Priority: Minor
>
> When training an RF (random forest) model, many warning messages like this are logged:
> {quote}WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.{quote}
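> This warning message is unnecessary, and the numbers it reports are not accurate.
> This is because numNodesInGroup and memUsage are increased even when the node is not added to the group: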
> {code:java}
> while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) {
> val (treeIndex, node) = nodeStack.top
> // Choose subset of features for node (if subsampling).
> val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
> Some(SamplingUtils.reservoirSampleAndCount(Range(0,
> metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
> } else {
> None
> }
> // Check if enough memory remains to add this node to the group.
> val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
> if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
> nodeStack.pop()
> mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
> node
> mutableTreeToNodeToIndexInfo
> .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
> = new NodeIndexInfo(numNodesInGroup, featureSubset)
> }
> numNodesInGroup += 1 // NOTE: the node may not have been added to mutableNodesForGroup above, but numNodesInGroup and memUsage are still increased here.
> memUsage += nodeMemUsage
> }
> if (memUsage > maxMemoryUsage) {
> // If maxMemoryUsage is 0, we should still allow splitting 1 node.
> logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" +
> s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This allows splitting" +
> s" $numNodesInGroup nodes in this iteration.")
> }
> {code}
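> To avoid this unnecessary warning, the loop should stop as soon as the next node does not fit, and should update numNodesInGroup and memUsage only when a node is actually added to the group: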
> {code:java}
> while (nodeStack.nonEmpty) {
> val (treeIndex, node) = nodeStack.top
> // Choose subset of features for node (if subsampling).
> val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
> Some(SamplingUtils.reservoirSampleAndCount(Range(0,
> metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
> } else {
> None
> }
> // Check if enough memory remains to add this node to the group.
> val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
> if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
> nodeStack.pop()
> mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
> node
> mutableTreeToNodeToIndexInfo
> .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
> = new NodeIndexInfo(numNodesInGroup, featureSubset)
> numNodesInGroup += 1 // numNodesInGroup and memUsage are now updated only when the node is actually added to mutableNodesForGroup.
> memUsage += nodeMemUsage
> } else {
> break
> }
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org