You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@ozone.apache.org by "Arpit Agarwal (Jira)" <ji...@apache.org> on 2020/02/28 21:20:00 UTC

[jira] [Updated] (HDDS-741) all containers are in 'CLOSING' state after service restart

     [ https://issues.apache.org/jira/browse/HDDS-741?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Arpit Agarwal updated HDDS-741:
-------------------------------
    Target Version/s: 0.6.0  (was: 0.5.0)

> all containers are in 'CLOSING' state after service restart
> -----------------------------------------------------------
>
>                 Key: HDDS-741
>                 URL: https://issues.apache.org/jira/browse/HDDS-741
>             Project: Hadoop Distributed Data Store
>          Issue Type: Bug
>          Components: SCM
>    Affects Versions: 0.3.0
>            Reporter: Nilotpal Nandi
>            Assignee: Nanda kumar
>            Priority: Blocker
>              Labels: pushed-to-craterlake, test-badlands
>         Attachments: all-node-ozone-logs-1540556458.tar.gz
>
>
> All containers are in the CLOSING state after a service restart. No writes succeed after the restart.
> The cluster contains 11 live datanodes.
>
> {noformat}
> [
>  {
>  "nodeType": "OM",
>  "hostname": "ctr-e138-1518143905142-544443-01-000008.hwx.site",
>  "ports": {
>  "RPC": 9889,
>  "HTTP": 9874
>  }
>  },
>  {
>  "nodeType": "SCM",
>  "hostname": "ctr-e138-1518143905142-544443-01-000003.hwx.site",
>  "ports": {
>  "RPC": 9860
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-541661-01-000003.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-541661-01-000007.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000003.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-541661-01-000004.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000004.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000008.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-541661-01-000002.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000005.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-541661-01-000006.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000007.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  },
>  {
>  "nodeType": "DATANODE",
>  "hostname": "ctr-e138-1518143905142-544443-01-000006.hwx.site",
>  "ports": {
>  "HTTP": 9880
>  }
>  }
> ]{noformat}
> Error thrown during a write:
>  
> {noformat}
> [root@ctr-e138-1518143905142-541661-01-000007 test_files]# ozone fs -put /etc/passwd /testdir5/
> 2018-10-26 12:09:43,822 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
> 2018-10-26 12:09:47,882 ERROR io.ChunkGroupOutputStream: Try to allocate more blocks for write failed, already allocated 0 blocks for this write.
> put: Allocate block failed, error:INTERNAL_ERROR{noformat}
>  
>  
> Pipelines in the cluster:
>  
> {noformat}
> [root@ctr-e138-1518143905142-541661-01-000007 test_files]# ozone scmcli listPipelines
> Pipeline[ Id: 29b68cc2-2d18-4db0-a11a-587ae4abc715, Nodes: e3d89961-fe38-4ed0-8a32-cd1849c58e0c{ip: 172.27.20.96, host: ctr-e138-1518143905142-544443-01-000008.hwx.site}b33a30d9-f1e2-448e-aabb-61a970445cea{ip: 172.27.85.64, host: ctr-e138-1518143905142-541661-01-000007.hwx.site}, Type:RATIS, Factor:THREE, State:CLOSING]
> Pipeline[ Id: 05061f87-4c68-443b-ae27-984da2d0a2cd, Nodes: dc002a73-fc63-4e76-be3e-3c6d16ede5f6{ip: 172.27.38.9, host: ctr-e138-1518143905142-544443-01-000004.hwx.site}4e6bd2a2-6802-4e67-9710-612a2cdb9dc1{ip: 172.27.24.90, host: ctr-e138-1518143905142-544443-01-000005.hwx.site}be3f0db4-3a19-44a5-bd6e-0da47d2ed92e{ip: 172.27.20.91, host: ctr-e138-1518143905142-544443-01-000003.hwx.site}, Type:RATIS, Factor:THREE, State:CLOSING]
> Pipeline[ Id: 80893f87-5e73-49a2-8f38-2adb2b13140a, Nodes: 63833540-bf93-410c-b081-243a56f93c88{ip: 172.27.10.199, host: ctr-e138-1518143905142-544443-01-000007.hwx.site}6e8b7129-8615-45fe-81e0-848a2e0ba520{ip: 172.27.15.139, host: ctr-e138-1518143905142-544443-01-000006.hwx.site}aab1f2e5-1cf0-430d-b1bf-04be8630a8ee{ip: 172.27.57.0, host: ctr-e138-1518143905142-541661-01-000003.hwx.site}, Type:RATIS, Factor:THREE, State:CLOSING]
> Pipeline[ Id: f0a14cb9-d37a-4c7c-b3e6-b7e3830dfd5f, Nodes: 61e271bf-68ad-435e-8a6e-582be90ebb6f{ip: 172.27.19.74, host: ctr-e138-1518143905142-541661-01-000006.hwx.site}3622352c-b136-4c74-b952-34e938cbda94{ip: 172.27.15.131, host: ctr-e138-1518143905142-541661-01-000002.hwx.site}cb2b1e95-e803-48d3-bdf2-bf878cae62cf{ip: 172.27.23.139, host: ctr-e138-1518143905142-541661-01-000004.hwx.site}, Type:RATIS, Factor:THREE, State:CLOSING]
> {noformat}
>  
> datanode.log:
> -----------------------
> {noformat}
> 2018-10-26 12:17:23,697 INFO org.apache.ratis.server.impl.LeaderElection: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: Election REJECTED; received 1 response(s) [e3d89961-fe38-4ed0-8a32-cd1849c58e0c<-b33a30d9-f1e2-448e-aabb-61a970445cea#0:FAIL-t1019] and 1 exception(s); e3d89961-fe38-4ed0-8a32-cd1849c58e0c:t1019, leader=null, voted=e3d89961-fe38-4ed0-8a32-cd1849c58e0c, raftlog=e3d89961-fe38-4ed0-8a32-cd1849c58e0c-SegmentedRaftLog:OPENED, conf=531: [e3d89961-fe38-4ed0-8a32-cd1849c58e0c:172.27.20.96:9858, b33a30d9-f1e2-448e-aabb-61a970445cea:172.27.85.64:9858, 0d7f5327-df16-40fe-ac88-7ed06e76a20f:172.27.68.65:9858], old=null
> 2018-10-26 12:17:23,697 INFO org.apache.ratis.server.impl.LeaderElection: 0: java.util.concurrent.ExecutionException: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io exception
> 2018-10-26 12:17:23,697 INFO org.apache.ratis.server.impl.RaftServerImpl: e3d89961-fe38-4ed0-8a32-cd1849c58e0c changes role from CANDIDATE to FOLLOWER at term 1019 for changeToFollower
> 2018-10-26 12:17:23,697 INFO org.apache.ratis.server.impl.RoleInfo: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: shutdown LeaderElection
> 2018-10-26 12:17:23,698 INFO org.apache.ratis.server.impl.RoleInfo: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: start FollowerState
> 2018-10-26 12:17:24,403 WARN org.apache.ratis.grpc.server.GrpcLogAppender: GrpcLogAppender(e3d89961-fe38-4ed0-8a32-cd1849c58e0c -> b33a30d9-f1e2-448e-aabb-61a970445cea): appendEntries Timeout, request=e3d89961-fe38-4ed0-8a32-cd1849c58e0c->b33a30d9-f1e2-448e-aabb-61a970445cea#0
> 2018-10-26 12:17:24,859 INFO org.apache.ratis.server.impl.FollowerState: e3d89961-fe38-4ed0-8a32-cd1849c58e0c changes to CANDIDATE, lastRpcTime:1161, electionTimeout:1161ms
> 2018-10-26 12:17:24,860 INFO org.apache.ratis.server.impl.RoleInfo: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: shutdown FollowerState
> 2018-10-26 12:17:24,860 INFO org.apache.ratis.server.impl.RaftServerImpl: e3d89961-fe38-4ed0-8a32-cd1849c58e0c changes role from FOLLOWER to CANDIDATE at term 1020 for changeToCandidate
> 2018-10-26 12:17:24,860 INFO org.apache.ratis.server.impl.RoleInfo: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: start LeaderElection
> 2018-10-26 12:17:24,864 INFO org.apache.ratis.server.impl.LeaderElection: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: begin an election in Term 1021
> 2018-10-26 12:17:24,869 INFO org.apache.ratis.server.impl.LeaderElection: e3d89961-fe38-4ed0-8a32-cd1849c58e0c got exception when requesting votes: {}
> java.util.concurrent.ExecutionException: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io exception
>  at java.util.concurrent.FutureTask.report(FutureTask.java:122)
>  at java.util.concurrent.FutureTask.get(FutureTask.java:192)
>  at org.apache.ratis.server.impl.LeaderElection.waitForResults(LeaderElection.java:214)
>  at org.apache.ratis.server.impl.LeaderElection.askForVotes(LeaderElection.java:146)
>  at org.apache.ratis.server.impl.LeaderElection.run(LeaderElection.java:102)
> Caused by: org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNAVAILABLE: io exception
>  at org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls.toStatusRuntimeException(ClientCalls.java:222)
>  at org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls.getUnchecked(ClientCalls.java:203)
>  at org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls.blockingUnaryCall(ClientCalls.java:132)
>  at org.apache.ratis.proto.grpc.RaftServerProtocolServiceGrpc$RaftServerProtocolServiceBlockingStub.requestVote(RaftServerProtocolServiceGrpc.java:265)
>  at org.apache.ratis.grpc.server.GrpcServerProtocolClient.requestVote(GrpcServerProtocolClient.java:61)
>  at org.apache.ratis.grpc.server.GrpcService.requestVote(GrpcService.java:150)
>  at org.apache.ratis.server.impl.LeaderElection.lambda$submitRequests$0(LeaderElection.java:188)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>  at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
>  at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>  at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.ratis.thirdparty.io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /172.27.68.65:9858
>  at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>  at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
>  at org.apache.ratis.thirdparty.io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:325)
>  at org.apache.ratis.thirdparty.io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:340)
>  at org.apache.ratis.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:634)
>  at org.apache.ratis.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:581)
>  at org.apache.ratis.thirdparty.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:498)
>  at org.apache.ratis.thirdparty.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:460)
>  at org.apache.ratis.thirdparty.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:884)
>  at org.apache.ratis.thirdparty.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
>  ... 1 more
> Caused by: java.net.ConnectException: Connection refused
>  ... 11 more
> 2018-10-26 12:17:24,882 INFO org.apache.ratis.server.impl.LeaderElection: e3d89961-fe38-4ed0-8a32-cd1849c58e0c: Election PASSED; received 1 response(s) [e3d89961-fe38-4ed0-8a32-cd1849c58e0c<-b33a30d9-f1e2-448e-aabb-61a970445cea#0:OK-t1021] and 1 exception(s); e3d89961-fe38-4ed0-8a32-cd1849c58e0c:t1021, leader=null, voted=e3d89961-fe38-4ed0-8a32-cd1849c58e0c, raftlog=e3d89961-fe38-4ed0-8a32-cd1849c58e0c-SegmentedRaftLog:OPENED, conf=531: [e3d89961-fe38-4ed0-8a32-cd1849c58e0c:172.27.20.96:9858, b33a30d9-f1e2-448e-aabb-61a970445cea:172.27.85.64:9858, 0d7f5327-df16-40fe-ac88-7ed06e76a20f:172.27.68.65:9858], old=null{noformat}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-issues-unsubscribe@hadoop.apache.org
For additional commands, e-mail: ozone-issues-help@hadoop.apache.org