You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Florian Verhein (JIRA)" <ji...@apache.org> on 2015/01/21 03:52:34 UTC
[jira] [Updated] (SPARK-5331) Spark workers can't find tachyon master as spark-ec2 doesn't set spark.tachyonStore.url
[ https://issues.apache.org/jira/browse/SPARK-5331?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Florian Verhein updated SPARK-5331:
-----------------------------------
Component/s: EC2
Description:
ps -ef | grep Tachyon
shows Tachyon running on the master (and the slave) node with the correct setting:
-Dtachyon.master.hostname=ec2-54-252-156-187.ap-southeast-2.compute.amazonaws.com
However, from the stderr log on the worker running the SparkTachyonPi example:
15/01/20 06:00:56 INFO CacheManager: Partition rdd_0_0 not found, computing it
15/01/20 06:00:56 INFO : Trying to connect master @ localhost/127.0.0.1:19998
15/01/20 06:00:56 ERROR : Failed to connect (1) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:57 ERROR : Failed to connect (2) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:58 ERROR : Failed to connect (3) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:59 ERROR : Failed to connect (4) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:01:00 ERROR : Failed to connect (5) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:01:01 WARN TachyonBlockManager: Attempt 1 to create tachyon dir null failed
java.io.IOException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
at tachyon.client.TachyonFS.connect(TachyonFS.java:293)
at tachyon.client.TachyonFS.getFileId(TachyonFS.java:1011)
at tachyon.client.TachyonFS.exist(TachyonFS.java:633)
at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:117)
at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:106)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
at org.apache.spark.storage.TachyonBlockManager.createTachyonDirs(TachyonBlockManager.scala:106)
at org.apache.spark.storage.TachyonBlockManager.<init>(TachyonBlockManager.scala:57)
at org.apache.spark.storage.BlockManager.tachyonStore$lzycompute(BlockManager.scala:94)
at org.apache.spark.storage.BlockManager.tachyonStore(BlockManager.scala:88)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:773)
at org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:638)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:145)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:228)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:56)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: tachyon.org.apache.thrift.TException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
at tachyon.master.MasterClient.connect(MasterClient.java:178)
at tachyon.client.TachyonFS.connect(TachyonFS.java:290)
... 28 more
Caused by: tachyon.org.apache.thrift.transport.TTransportException: java.net.ConnectException: Connection refused
at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:185)
at tachyon.org.apache.thrift.transport.TFramedTransport.open(TFramedTransport.java:81)
at tachyon.master.MasterClient.connect(MasterClient.java:156)
... 29 more
Caused by: java.net.ConnectException: Connection refused
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:339)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:200)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:182)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:579)
at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:180)
... 31 more
15/01/20 06:01:01 ERROR TachyonBlockManager: Failed 10 attempts to create tachyon dir in /tmp_spark_tachyon/spark-f1c7257e-b79e-4fa2-955a-e3734d80dbc6/1
was:
ps -ef | grep Tachyon
shows Tachyon running on the master (and the slave) node with the correct setting:
-Dtachyon.master.hostname=ec2-54-252-156-187.ap-southeast-2.compute.amazonaws.com
However, from the stderr log on the worker running the SparkTachyonPi example:
15/01/20 06:00:56 INFO CacheManager: Partition rdd_0_0 not found, computing it
15/01/20 06:00:56 INFO : Trying to connect master @ localhost/127.0.0.1:19998
15/01/20 06:00:56 ERROR : Failed to connect (1) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:57 ERROR : Failed to connect (2) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:58 ERROR : Failed to connect (3) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:00:59 ERROR : Failed to connect (4) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:01:00 ERROR : Failed to connect (5) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
15/01/20 06:01:01 WARN TachyonBlockManager: Attempt 1 to create tachyon dir null failed
java.io.IOException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
at tachyon.client.TachyonFS.connect(TachyonFS.java:293)
at tachyon.client.TachyonFS.getFileId(TachyonFS.java:1011)
at tachyon.client.TachyonFS.exist(TachyonFS.java:633)
at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:117)
at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:106)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
at org.apache.spark.storage.TachyonBlockManager.createTachyonDirs(TachyonBlockManager.scala:106)
at org.apache.spark.storage.TachyonBlockManager.<init>(TachyonBlockManager.scala:57)
at org.apache.spark.storage.BlockManager.tachyonStore$lzycompute(BlockManager.scala:94)
at org.apache.spark.storage.BlockManager.tachyonStore(BlockManager.scala:88)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:773)
at org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:638)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:145)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:228)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:56)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: tachyon.org.apache.thrift.TException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
at tachyon.master.MasterClient.connect(MasterClient.java:178)
at tachyon.client.TachyonFS.connect(TachyonFS.java:290)
... 28 more
Caused by: tachyon.org.apache.thrift.transport.TTransportException: java.net.ConnectException: Connection refused
at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:185)
at tachyon.org.apache.thrift.transport.TFramedTransport.open(TFramedTransport.java:81)
at tachyon.master.MasterClient.connect(MasterClient.java:156)
... 29 more
Caused by: java.net.ConnectException: Connection refused
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:339)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:200)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:182)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:579)
at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:180)
... 31 more
15/01/20 06:01:01 ERROR TachyonBlockManager: Failed 10 attempts to create tachyon dir in /tmp_spark_tachyon/spark-f1c7257e-b79e-4fa2-955a-e3734d80dbc6/1
Summary: Spark workers can't find tachyon master as spark-ec2 doesn't set spark.tachyonStore.url (was: Tachyon workers seem to ignore tachyon.master.hostname and use localhost instead)
> Spark workers can't find tachyon master as spark-ec2 doesn't set spark.tachyonStore.url
> ---------------------------------------------------------------------------------------
>
> Key: SPARK-5331
> URL: https://issues.apache.org/jira/browse/SPARK-5331
> Project: Spark
> Issue Type: Bug
> Components: EC2
> Environment: Running on EC2 via modified spark-ec2 scripts (to get dependencies right so tachyon starts)
> Using tachyon 0.5.0 built against hadoop 2.4.1
> Spark 1.2.0 built against tachyon 0.5.0 and hadoop 2.4.1
> Tachyon configured using the template in 0.5.0 but updated with the slave list, master variables, etc.
> Reporter: Florian Verhein
>
> ps -ef | grep Tachyon
> shows Tachyon running on the master (and the slave) node with the correct setting:
> -Dtachyon.master.hostname=ec2-54-252-156-187.ap-southeast-2.compute.amazonaws.com
> However, from the stderr log on the worker running the SparkTachyonPi example:
> 15/01/20 06:00:56 INFO CacheManager: Partition rdd_0_0 not found, computing it
> 15/01/20 06:00:56 INFO : Trying to connect master @ localhost/127.0.0.1:19998
> 15/01/20 06:00:56 ERROR : Failed to connect (1) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
> 15/01/20 06:00:57 ERROR : Failed to connect (2) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
> 15/01/20 06:00:58 ERROR : Failed to connect (3) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
> 15/01/20 06:00:59 ERROR : Failed to connect (4) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
> 15/01/20 06:01:00 ERROR : Failed to connect (5) to master localhost/127.0.0.1:19998 : java.net.ConnectException: Connection refused
> 15/01/20 06:01:01 WARN TachyonBlockManager: Attempt 1 to create tachyon dir null failed
> java.io.IOException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
> at tachyon.client.TachyonFS.connect(TachyonFS.java:293)
> at tachyon.client.TachyonFS.getFileId(TachyonFS.java:1011)
> at tachyon.client.TachyonFS.exist(TachyonFS.java:633)
> at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:117)
> at org.apache.spark.storage.TachyonBlockManager$$anonfun$createTachyonDirs$2.apply(TachyonBlockManager.scala:106)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
> at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
> at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
> at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
> at org.apache.spark.storage.TachyonBlockManager.createTachyonDirs(TachyonBlockManager.scala:106)
> at org.apache.spark.storage.TachyonBlockManager.<init>(TachyonBlockManager.scala:57)
> at org.apache.spark.storage.BlockManager.tachyonStore$lzycompute(BlockManager.scala:94)
> at org.apache.spark.storage.BlockManager.tachyonStore(BlockManager.scala:88)
> at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:773)
> at org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:638)
> at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:145)
> at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:228)
> at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
> at org.apache.spark.scheduler.Task.run(Task.scala:56)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: tachyon.org.apache.thrift.TException: Failed to connect to master localhost/127.0.0.1:19998 after 5 attempts
> at tachyon.master.MasterClient.connect(MasterClient.java:178)
> at tachyon.client.TachyonFS.connect(TachyonFS.java:290)
> ... 28 more
> Caused by: tachyon.org.apache.thrift.transport.TTransportException: java.net.ConnectException: Connection refused
> at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:185)
> at tachyon.org.apache.thrift.transport.TFramedTransport.open(TFramedTransport.java:81)
> at tachyon.master.MasterClient.connect(MasterClient.java:156)
> ... 29 more
> Caused by: java.net.ConnectException: Connection refused
> at java.net.PlainSocketImpl.socketConnect(Native Method)
> at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:339)
> at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:200)
> at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:182)
> at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
> at java.net.Socket.connect(Socket.java:579)
> at tachyon.org.apache.thrift.transport.TSocket.open(TSocket.java:180)
> ... 31 more
> 15/01/20 06:01:01 ERROR TachyonBlockManager: Failed 10 attempts to create tachyon dir in /tmp_spark_tachyon/spark-f1c7257e-b79e-4fa2-955a-e3734d80dbc6/1
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org