Posted to mapreduce-user@hadoop.apache.org by ch huang <ju...@gmail.com> on 2013/07/26 07:31:03 UTC

problem about starting datanode

I configured NameNode HA, but when I start a DataNode I find the error
below in its log. Here is my hdfs-site.xml file:

<configuration>
<!--
  <property>
     <name>dfs.name.dir</name>
     <value>/var/lib/hadoop-hdfs/cache/hdfs/dfs/name</value>
  </property>
-->
<property>
 <name>dfs.permissions.superusergroup</name>
 <value>hadoop</value>
</property>
<!--
<property>
        <name>dfs.namenode.name.dir</name>
        <value>/data/hadoopnamespace</value>
</property>
-->
<property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/hadoopdataspace</value>
</property>
<property>
        <name>dfs.datanode.failed.volumes.tolerated</name>
        <value>3</value>
</property>
<!-- HA -->
<property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
</property>
<property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
</property>
<property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>node1:8020</value>
</property>
<property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>node2:8020</value>
</property>
<property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>node1:50070</value>
</property>
<property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>node2:50070</value>
</property>
<property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
</property>
<property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/data/1/dfs/jn</value>
</property>
<property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- HA end -->
<!-- ssh fence method -->
<property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
</property>
<property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/nodefence/.ssh/id_rsa</value>
</property>
<property>
  <name>dfs.ha.fencing.ssh.connect-timeout</name>
  <value>30000</value>
  <description>
    SSH connection timeout, in milliseconds, to use with the builtin
    sshfence fencer.
  </description>
</property>
<!-- enable web HDFS -->
<property>
  <name>dfs.webhdfs.enabled</name>
  <value>true</value>
</property>
</configuration>



2013-07-26 21:20:18,850 INFO
org.apache.hadoop.hdfs.server.datanode.DataNode: Setting up storage:
nsid=291409768;bpid=BP-771660648-192.168.142.129-1374837820241;lv=-40;nsInfo=lv=-40;cid=CID-28365f0e-e4f1-45b0-a86a-bb37794b6672;nsid=291409768;c=0;bpid=BP-771660648-192.168.142.129-1374837820241
2013-07-26 21:20:18,870 INFO
org.apache.hadoop.hdfs.server.datanode.DataNode: Block pool
BP-771660648-192.168.142.129-1374837820241 (storage id
DS-713465905-192.168.142.131-50010-1374844418641) service to node1/192.168.142.129:8020 beginning handshake
with NN
2013-07-26 21:20:18,873 FATAL
org.apache.hadoop.hdfs.server.datanode.DataNode: Initialization failed for
block pool Block pool BP-771660648-192.168.142.129-1374837820241 (storage
id DS-713465905-192.168.142.131-50010-1374844418641) service to node2/
192.168.142.130:8020
org.apache.hadoop.util.DiskChecker$DiskErrorException: Invalid volume
failure  config value: 3
        at
org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.<init>(FsDatasetImpl.java:183)
        at
org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory.newInstance(FsDatasetFactory.java:34)
        at
org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory.newInstance(FsDatasetFactory.java:30)
        at
org.apache.hadoop.hdfs.server.datanode.DataNode.initStorage(DataNode.java:920)
        at
org.apache.hadoop.hdfs.server.datanode.DataNode.initBlockPool(DataNode.java:882)
        at
org.apache.hadoop.hdfs.server.datanode.BPOfferService.verifyAndSetNamespaceInfo(BPOfferService.java:308)
        at
org.apache.hadoop.hdfs.server.datanode.BPServiceActor.connectToNNAndHandshake(BPServiceActor.java:218)
        at
org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:660)
        at java.lang.Thread.run(Thread.java:722)
2013-07-26 21:20:18,874 WARN
org.apache.hadoop.hdfs.server.datanode.DataNode: Ending block pool
service for: Block pool
BP-771660648-192.168.142.129-1374837820241 (storage id
DS-713465905-192.168.142.131-50010-1374844418641) service to node2/
192.168.142.130:8020
2013-07-26 21:20:18,886 INFO
org.apache.hadoop.hdfs.server.datanode.DataNode: Block pool Block pool
BP-771660648-192.168.142.129-1374837820241 (storage id
DS-713465905-192.168.142.131-50010-1374844418641) service to node1/
192.168.142.129:8020 successfully registered with NN
2013-07-26 21:20:18,887 INFO
org.apache.hadoop.hdfs.server.datanode.DataNode: For namenode node1/
192.168.142.129:8020 using DELETEREPORT_INTERVAL of 300000 msec
BLOCKREPORT_INTERVAL of 21600000msec Initial delay: 0msec;
heartBeatInterval=3000
2013-07-26 21:20:18,887 ERROR
org.apache.hadoop.hdfs.server.datanode.DataNode: Exception in
BPOfferService for Block pool BP-771660648-192.168.142.129-1374837820241
(storage id DS-713465905-192.168.142.131-50010-1374844418641) service to
node1/192.168.142.129:8020
java.lang.NullPointerException
        at
org.apache.hadoop.hdfs.server.datanode.BPServiceActor.sendHeartBeat(BPServiceActor.java:435)
        at
org.apache.hadoop.hdfs.server.datanode.BPServiceActor.offerService(BPServiceActor.java:521)
        at
org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:673)
        at java.lang.Thread.run(Thread.java:722)

Re: problem about starting datanode

Posted by ch huang <ju...@gmail.com>.
That was no use. I found this problem on the two NameNodes:

.web.resources, pathSpec=/webhdfs/v1/*
2013-07-26 22:55:30,772 INFO org.apache.hadoop.http.HttpServer: Jetty bound
to port 50070
2013-07-26 22:55:30,772 INFO org.mortbay.log: jetty-6.1.26.cloudera.2
2013-07-26 22:55:31,001 WARN
org.apache.hadoop.security.authentication.server.AuthenticationFilter:
'signature.secret' configuration not set, using a random value as secret
2013-07-26 22:55:31,054 INFO org.mortbay.log: Started
SelectChannelConnector@node1:50070
2013-07-26 22:55:31,054 INFO
org.apache.hadoop.hdfs.server.namenode.NameNode: Web-server up at:
node1:50070
2013-07-26 22:55:31,057 INFO org.apache.hadoop.ipc.Server: IPC Server
Responder: starting
2013-07-26 22:55:31,058 INFO org.apache.hadoop.ipc.Server: IPC Server
listener on 8020: starting
2013-07-26 22:55:31,072 INFO
org.apache.hadoop.hdfs.server.namenode.NameNode: NameNode up at: node1/
192.168.142.129:8020
2013-07-26 22:55:31,072 INFO
org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Starting services
required for standby state
2013-07-26 22:55:31,074 INFO
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Will roll logs on
active node at node2/192.168.142.130:8020 every 120 seconds.
2013-07-26 22:55:31,084 INFO
org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer: Starting
standby checkpoint thread...
Checkpointing active NN at node2:50070
Serving checkpoints at node1/192.168.142.129:50070
2013-07-26 22:57:31,112 INFO
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Triggering log
roll on remote NameNode node2/192.168.142.130:8020
2013-07-26 22:57:31,329 WARN
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Unable to trigger
a roll of the active NN
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException):
Operation category JOURNAL is not supported in state standby
        at
org.apache.hadoop.hdfs.server.namenode.ha.StandbyState.checkOperation(StandbyState.java:87)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode$NameNodeHAContext.checkOperation(NameNode.java:1401)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkOperation(FSNamesystem.java:871)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.rollEditLog(FSNamesystem.java:4577)
        at
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.rollEditLog(NameNodeRpcServer.java:756)
        at
org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolServerSideTranslatorPB.rollEditLog(NamenodeProtocolServerSideTranslatorPB.java:129)
        at
org.apache.hadoop.hdfs.protocol.proto.NamenodeProtocolProtos$NamenodeProtocolService$2.callBlockingMethod(NamenodeProtocolProtos.java:8762)
        at
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1701)

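Both NameNodes appear to come up in standby here. Since this
configuration has no automatic failover (there are no ZKFC / ZooKeeper
settings in the hdfs-site.xml above), one NameNode has to be
transitioned to active manually after startup. Assuming the nn1/nn2 ids
from that config, something like:

hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
hdfs haadmin -transitionToActive nn1

should make nn1 active, after which the standby's log-roll requests have
a real active NN to talk to.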

On Fri, Jul 26, 2013 at 1:48 PM, Harsh J <ha...@cloudera.com> wrote:

> You have one datanode data volume (dfs.datanode.data.dir) configured,
> but you've specified tolerated failures as 3. The state of 3 > 1 is
> invalid and hence the error. You cannot enable disk failure toleration
> of a DN with just one volume, so remove the toleration config and your
> problem will be resolved.
>
> --
> Harsh J
>

Re: problem about starting datanode

Posted by Harsh J <ha...@cloudera.com>.
You have one datanode data volume (dfs.datanode.data.dir) configured,
but you've specified tolerated failures as 3. The state of 3 > 1 is
invalid and hence the error. You cannot enable disk failure toleration
of a DN with just one volume, so remove the toleration config and your
problem will be resolved.
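
The check behind that error requires dfs.datanode.failed.volumes.tolerated
to be non-negative and strictly smaller than the number of directories
listed in dfs.datanode.data.dir. With the single volume configured above,
a minimal sketch of the fix is to drop the property or set it to 0 (the
default, meaning any volume failure shuts the DataNode down):

<property>
        <name>dfs.datanode.failed.volumes.tolerated</name>
        <value>0</value>
</property>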


-- 
Harsh J

Re: problem about starting datanode

Posted by ch huang <ju...@gmail.com>.
Here is the info from the NameNode log:

WithFixedSleep(maxRetries=10, sleepTime=1 SECONDS)
2013-07-26 21:52:12,210 INFO org.apache.hadoop.ipc.Client: Retrying connect
to server: node2/192.168.142.130:8020. Already tried 5 time(s); retry
policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1
SECONDS)
2013-07-26 21:52:13,224 INFO org.apache.hadoop.ipc.Client: Retrying connect
to server: node2/192.168.142.130:8020. Already tried 6 time(s); retry
policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1
SECONDS)
2013-07-26 21:52:14,238 INFO org.apache.hadoop.ipc.Client: Retrying connect
to server: node2/192.168.142.130:8020. Already tried 7 time(s); retry
policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1
SECONDS)
2013-07-26 21:52:15,252 INFO org.apache.hadoop.ipc.Client: Retrying connect
to server: node2/192.168.142.130:8020. Already tried 8 time(s); retry
policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1
SECONDS)
2013-07-26 21:52:16,266 INFO org.apache.hadoop.ipc.Client: Retrying connect
to server: node2/192.168.142.130:8020. Already tried 9 time(s); retry
policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1
SECONDS)
2013-07-26 21:52:16,269 WARN
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Unable to trigger
a roll of the active NN
java.net.ConnectException: Call From node1/192.168.142.129 to node2:8020
failed on connection exception: java.net.ConnectException: Connection
refused; For more details see:
http://wiki.apache.org/hadoop/ConnectionRefused
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native
Method)
        at
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
        at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:525)
        at org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:782)
        at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:729)
        at org.apache.hadoop.ipc.Client.call(Client.java:1229)
        at
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:202)
        at com.sun.proxy.$Proxy12.rollEditLog(Unknown Source)
        at
org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB.rollEditLog(NamenodeProtocolTranslatorPB.java:137)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer.triggerActiveLogRoll(EditLogTailer.java:268)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer.access$600(EditLogTailer.java:61)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.doWork(EditLogTailer.java:310)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.access$200(EditLogTailer.java:279)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread$1.run(EditLogTailer.java:296)
        at
org.apache.hadoop.security.SecurityUtil.doAsLoginUserOrFatal(SecurityUtil.java:452)
        at
org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.run(EditLogTailer.java:292)
Caused by: java.net.ConnectException: Connection refused
        at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
        at
sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:692)
        at
org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:207)
        at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:528)
        at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:492)
        at
org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:499)
        at
org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:593)
        at
org.apache.hadoop.ipc.Client$Connection.access$2000(Client.java:241)
        at org.apache.hadoop.ipc.Client.getConnection(Client.java:1278)
        at org.apache.hadoop.ipc.Client.call(Client.java:1196)
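
This one is a plain "Connection refused": nothing was listening on
node2:8020, so the NameNode process on node2 was most likely down or not
yet started at that point. Assuming shell access to both nodes, a quick
check is:

# on node2, as the user running HDFS: is a NameNode JVM listed?
jps

# from node1: is anything accepting connections on node2:8020?
nc -z node2 8020 && echo open

If the process is gone, node2's own NameNode log should show why it
exited.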



Re: problem about starting datanode

Posted by Harsh J <ha...@cloudera.com>.
You have only one DataNode data volume (dfs.datanode.data.dir) configured,
but you've set dfs.datanode.failed.volumes.tolerated to 3. Tolerating 3
failed volumes when only 1 volume exists is invalid, hence the
DiskErrorException. You cannot enable disk-failure tolerance on a DN with
just one volume, so remove the tolerance property (its default is 0) and
your problem will be resolved.
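
For reference, a minimal sketch of the corrected hdfs-site.xml fragment,
assuming the single /data/hadoopdataspace volume from the config above (with
one volume, 0 is the only accepted tolerance value, and since 0 is already
the default the property can simply be dropped):

<property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/hadoopdataspace</value>
</property>
<!-- dfs.datanode.failed.volumes.tolerated dropped: it defaults to 0, and
     with a single volume any value above 0 fails the DataNode's startup
     check with "Invalid volume failure config value" -->

After removing the property, restart the DataNode; block-pool initialization
should then get past the FsDatasetImpl check shown in the trace.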




-- 
Harsh J
