You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2016/01/11 23:48:41 UTC
spark git commit: [SPARK-12685][MLLIB] word2vec trainWordsCount gets overflow
Repository: spark
Updated Branches:
refs/heads/master ee4ee02b8 -> 4f8eefa36
[SPARK-12685][MLLIB] word2vec trainWordsCount gets overflow
jira: https://issues.apache.org/jira/browse/SPARK-12685
The log of `word2vec` reports a negative value, for example
trainWordsCount = -785727483
during computation over a large dataset, because the integer counter overflows.
The priority of this issue is updated because the overflowed value affects the computation process, for example:
`alpha = learningRate * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1))`
Author: Yuhao Yang <hh...@gmail.com>
Closes #10627 from hhbyyh/w2voverflow.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f8eefa3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f8eefa3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f8eefa3
Branch: refs/heads/master
Commit: 4f8eefa36bb90812aac61ac7a762c9452de666bf
Parents: ee4ee02
Author: Yuhao Yang <hh...@gmail.com>
Authored: Mon Jan 11 14:48:35 2016 -0800
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Mon Jan 11 14:48:35 2016 -0800
----------------------------------------------------------------------
.../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/4f8eefa3/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index a7e1b76..dc5d070 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -151,7 +151,7 @@ class Word2Vec extends Serializable with Logging {
/** context words from [-window, window] */
private var window = 5
- private var trainWordsCount = 0
+ private var trainWordsCount = 0L
private var vocabSize = 0
@transient private var vocab: Array[VocabWord] = null
@transient private var vocabHash = mutable.HashMap.empty[String, Int]
@@ -159,13 +159,13 @@ class Word2Vec extends Serializable with Logging {
private def learnVocab(words: RDD[String]): Unit = {
vocab = words.map(w => (w, 1))
.reduceByKey(_ + _)
+ .filter(_._2 >= minCount)
.map(x => VocabWord(
x._1,
x._2,
new Array[Int](MAX_CODE_LENGTH),
new Array[Int](MAX_CODE_LENGTH),
0))
- .filter(_.cn >= minCount)
.collect()
.sortWith((a, b) => a.cn > b.cn)
@@ -179,7 +179,7 @@ class Word2Vec extends Serializable with Logging {
trainWordsCount += vocab(a).cn
a += 1
}
- logInfo("trainWordsCount = " + trainWordsCount)
+ logInfo(s"vocabSize = $vocabSize, trainWordsCount = $trainWordsCount")
}
private def createExpTable(): Array[Float] = {
@@ -332,7 +332,7 @@ class Word2Vec extends Serializable with Logging {
val random = new XORShiftRandom(seed ^ ((idx + 1) << 16) ^ ((-k - 1) << 8))
val syn0Modify = new Array[Int](vocabSize)
val syn1Modify = new Array[Int](vocabSize)
- val model = iter.foldLeft((bcSyn0Global.value, bcSyn1Global.value, 0, 0)) {
+ val model = iter.foldLeft((bcSyn0Global.value, bcSyn1Global.value, 0L, 0L)) {
case ((syn0, syn1, lastWordCount, wordCount), sentence) =>
var lwc = lastWordCount
var wc = wordCount
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org