You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kyuubi.apache.org by fe...@apache.org on 2023/03/25 03:07:12 UTC
[kyuubi] branch master updated: [KYUUBI #4480] Engine alive probe should close thrift connection on engine lost
This is an automated email from the ASF dual-hosted git repository.
feiwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kyuubi.git
The following commit(s) were added to refs/heads/master by this push:
new cee2f00f9 [KYUUBI #4480] Engine alive probe should close thrift connection on engine lost
cee2f00f9 is described below
commit cee2f00f91b6da567cffd7f241066291fac72711
Author: Xieming LI <ri...@gmail.com>
AuthorDate: Sat Mar 25 11:07:02 2023 +0800
[KYUUBI #4480] Engine alive probe should close thrift connection on engine lost
### _Why are the changes needed?_
As described in https://github.com/apache/kyuubi/issues/4457, when the backend engine becomes unresponsive,
the beeline will hang because `client.getOperationStatus(_remoteOpHandle)` in `ExecuteStatement #waitStatementComplete()` would never receive any response.
https://github.com/apache/kyuubi/blob/43309b86f1997b028e8fde5cb4e6449d818f4f73/kyuubi-server/src/main/scala/org/apache/kyuubi/operation/ExecuteStatement.scala#L101-L105
While the EngineAliveProbe can identify engine failure, it does not resolve the underlying issue of the thrift client waiting for a response, causing the beeline to remain unresponsive. The only way to resolve this state of suspension is by interrupting the thrift client thread.
https://github.com/apache/kyuubi/blob/3d65f2711faa5dc9173130557e2d33adab04b5c7/kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala#L84
This pull request introduces a new logic to the EngineAliveProbe feature, whereby upon detecting an engine failure, it deliberately closes the thrift connection. This creates an exception that can then be handled by the error handling mechanism introduced in apache#646, thereby allowing for a graceful shutdown of the session handle.
### _How was this patch tested?_
- [ ] Add some test cases that check the changes thoroughly including negative and positive cases if possible
I am very happy to add a test case but I am not aware of how to simulate a state of unresponsive.
Otherwise, I can implement test like this:
https://github.com/apache/kyuubi/blob/43309b86f1997b028e8fde5cb4e6449d818f4f73/kyuubi-server/src/test/scala/org/apache/kyuubi/operation/KyuubiOperationPerUserSuite.scala#L169
- [x] Add screenshots for manual tests if appropriate
1. Execute a query with beeline
```
$ beeline -u 'jdbc:hive2://<KyuubiServer>:10009/default?socketTimeout=60000;#spark.yarn.queue=shared' -n jpz3032 -p <password> -f test.sql
```
2. Go to ResourceManager and Identify the ApplicationMaster
<img width="2091" alt="Screenshot 2023-03-08 at 12 13 31" src="https://user-images.githubusercontent.com/4378066/223610552-6a35dc6a-891a-415b-b142-a2b104da2c1f.png">
3. ssh into the host and restart the NodeManager
```
[<user><NodeManager> ~]$ sudo -i
[root<NodeManager> :~]# /usr/sbin/reboot
Connection to <NodeManager> closed by remote host.
Connection to <NodeManager> closed.
```
4. Check the kyuubi server log: the session is terminated with error
```
2023-03-14 10:56:10.330 INFO org.apache.kyuubi.operation.ExecuteStatement: Query[4b23657a-cacb-4ead-8abd-50920dd73de9] in RUNNING_STATE
2023-03-14 10:56:15.332 INFO org.apache.kyuubi.operation.ExecuteStatement: Query[4b23657a-cacb-4ead-8abd-50920dd73de9] in RUNNING_STATE
2023-03-14 10:56:20.333 INFO org.apache.kyuubi.operation.ExecuteStatement: Query[4b23657a-cacb-4ead-8abd-50920dd73de9] in RUNNING_STATE
```
After 3 consecutive alive probe fails, Engine is marked dead.
```
2023-03-14 10:56:40.089 WARN org.apache.kyuubi.client.KyuubiSyncThriftClient: The engine[Some(application_1676285123186_0447)] alive probe fails
org.apache.thrift.transport.TTransportException: java.net.SocketTimeoutException: Read timed out
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readLength(TSaslTransport.java:376) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readFrame(TSaslTransport.java:453) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.read(TSaslTransport.java:435) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslClientTransport.read(TSaslClientTransport.java:37) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:77) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.recv_GetInfo(TCLIService.java:222) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.GetInfo(TCLIService.java:209) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1(KyuubiSyncThriftClient.scala:93) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1$adapted(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at scala.Option.foreach(Option.scala:407) ~[scala-library-2.12.17.jar:?]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.run(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_362]
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362]
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_362]
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_362]
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read(BufferedInputStream.java:345) ~[?:1.8.0_362]
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127) ~[libthrift-0.9.3.jar:0.9.3]
... 23 more
2023-03-14 10:57:00.100 WARN org.apache.kyuubi.client.KyuubiSyncThriftClient: The engine[Some(application_1676285123186_0447)] alive probe fails
org.apache.thrift.transport.TTransportException: java.net.SocketTimeoutException: Read timed out
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readLength(TSaslTransport.java:376) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readFrame(TSaslTransport.java:453) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.read(TSaslTransport.java:435) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslClientTransport.read(TSaslClientTransport.java:37) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:77) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.recv_GetInfo(TCLIService.java:222) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.GetInfo(TCLIService.java:209) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1(KyuubiSyncThriftClient.scala:93) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1$adapted(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at scala.Option.foreach(Option.scala:407) ~[scala-library-2.12.17.jar:?]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.run(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_362]
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362]
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_362]
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_362]
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read(BufferedInputStream.java:345) ~[?:1.8.0_362]
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127) ~[libthrift-0.9.3.jar:0.9.3]
... 23 more
2023-03-14 10:57:20.111 WARN org.apache.kyuubi.client.KyuubiSyncThriftClient: The engine[Some(application_1676285123186_0447)] alive probe fails
org.apache.thrift.transport.TTransportException: java.net.SocketTimeoutException: Read timed out
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readLength(TSaslTransport.java:376) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.readFrame(TSaslTransport.java:453) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslTransport.read(TSaslTransport.java:435) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TSaslClientTransport.read(TSaslClientTransport.java:37) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:77) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.recv_GetInfo(TCLIService.java:222) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Client.GetInfo(TCLIService.java:209) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1(KyuubiSyncThriftClient.scala:93) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.$anonfun$run$1$adapted(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at scala.Option.foreach(Option.scala:407) ~[scala-library-2.12.17.jar:?]
at org.apache.kyuubi.client.KyuubiSyncThriftClient$$anon$1.run(KyuubiSyncThriftClient.scala:87) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_362]
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) ~[?:1.8.0_362]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362]
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_362]
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_362]
at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_362]
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) ~[?:1.8.0_362]
at java.io.BufferedInputStream.read(BufferedInputStream.java:345) ~[?:1.8.0_362]
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127) ~[libthrift-0.9.3.jar:0.9.3]
... 23 more
2023-03-14 10:57:20.112 ERROR org.apache.kyuubi.client.KyuubiSyncThriftClient: Mark the engine[Some(application_1676285123186_0447)] not alive with no recent alive probe success: 60034 ms exceeds timeout 60000 ms
```
Now force closing the thrift client to generate an exception.
```
2023-03-14 10:58:10.112 WARN org.apache.kyuubi.client.KyuubiSyncThriftClient: Removing Clients for TSessionHandle(sessionId:THandleIdentifier(guid:10 94 8A 85 00 DD 43 5D 85 58 91 84 61 81 A5 A4, secret:C2 EE 5B 97 3E A0 41 FC AC 16 9B D7 08 ED 8F 38))
2023-03-14 10:58:10.114 INFO org.apache.kyuubi.operation.ExecuteStatement: Processing jpz3032's query[4b23657a-cacb-4ead-8abd-50920dd73de9]: RUNNING_STATE -> ERROR_STATE, time taken: 139.861 seconds
2023-03-14 10:58:10.119 WARN org.apache.kyuubi.operation.ExecuteStatement: Error closing THandleIdentifier(guid:65 95 B2 11 0C F1 45 7E B4 0B F8 D3 21 26 C1 05, secret:C2 EE 5B 97 3E A0 41 FC AC 16 9B D7 08 ED 8F 38): connection does not exist
org.apache.kyuubi.KyuubiSQLException: connection does not exist
at org.apache.kyuubi.KyuubiSQLException$.connectionDoesNotExist(KyuubiSQLException.scala:90) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient.withLockAcquired(KyuubiSyncThriftClient.scala:139) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient.withLockAcquiredAsyncRequest(KyuubiSyncThriftClient.scala:145) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.client.KyuubiSyncThriftClient.closeOperation(KyuubiSyncThriftClient.scala:393) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.operation.KyuubiOperation.liftedTree3$1(KyuubiOperation.scala:136) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.operation.KyuubiOperation.close(KyuubiOperation.scala:135) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.operation.OperationManager.closeOperation(OperationManager.scala:126) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.session.AbstractSession.$anonfun$closeOperation$1(AbstractSession.scala:224) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.17.jar:?]
at org.apache.kyuubi.session.AbstractSession.withAcquireRelease(AbstractSession.scala:82) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.session.AbstractSession.closeOperation(AbstractSession.scala:222) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.service.AbstractBackendService.closeOperation(AbstractBackendService.scala:188) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.server.KyuubiServer$$anon$1.org$apache$kyuubi$server$BackendServiceMetric$$super$closeOperation(KyuubiServer.scala:138) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.server.BackendServiceMetric.$anonfun$closeOperation$1(BackendServiceMetric.scala:169) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.17.jar:?]
at org.apache.kyuubi.metrics.MetricsSystem$.timerTracing(MetricsSystem.scala:111) ~[kyuubi-metrics_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.server.BackendServiceMetric.closeOperation(BackendServiceMetric.scala:169) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.server.BackendServiceMetric.closeOperation$(BackendServiceMetric.scala:167) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.server.KyuubiServer$$anon$1.closeOperation(KyuubiServer.scala:138) ~[kyuubi-server_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.kyuubi.service.TFrontendService.CloseOperation(TFrontendService.scala:498) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.hive.service.rpc.thrift.TCLIService$Processor$CloseOperation.getResult(TCLIService.java:1797) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.hive.service.rpc.thrift.TCLIService$Processor$CloseOperation.getResult(TCLIService.java:1782) ~[hive-service-rpc-3.1.3.jar:3.1.3]
at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:39) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:39) ~[libthrift-0.9.3.jar:0.9.3]
at org.apache.kyuubi.service.authentication.TSetIpAddressProcessor.process(TSetIpAddressProcessor.scala:36) ~[kyuubi-common_2.12-1.7.0-SNAPSHOT.jar:1.7.0-SNAPSHOT]
at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:286) ~[libthrift-0.9.3.jar:0.9.3]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362]
2023-03-14 10:58:10.130 INFO org.apache.kyuubi.server.KyuubiTBinaryFrontendService: Received request of closing SessionHandle [7c5bd429-49a2-46c2-ba50-02825140e4c6]
2023-03-14 10:58:10.130 INFO org.apache.kyuubi.session.KyuubiSessionManager: jpz3032's session with SessionHandle [7c5bd429-49a2-46c2-ba50-02825140e4c6] is closed, current opening sessions 0
2023-03-14 10:58:10.131 INFO org.apache.kyuubi.server.KyuubiTBinaryFrontendService: Finished closing SessionHandle [7c5bd429-49a2-46c2-ba50-02825140e4c6]
```
5. Beeline Logs
Beeline finished with errors (as expected)
```
2023-03-14 10:56:15.332 INFO org.apache.kyuubi.operation.ExecuteStatement: Query[4b23657a-cacb-4ead-8abd-50920dd73de9] in RUNNING_STATE
2023-03-14 10:56:20.333 INFO org.apache.kyuubi.operation.ExecuteStatement: Query[4b23657a-cacb-4ead-8abd-50920dd73de9] in RUNNING_STATE
2023-03-14 10:58:10.114 INFO org.apache.kyuubi.operation.ExecuteStatement: Processing jpz3032's query[4b23657a-cacb-4ead-8abd-50920dd73de9]: RUNNING_STATE -> ERROR_STATE, time taken: 139.861 seconds
Error: org.apache.kyuubi.KyuubiSQLException: Error operating ExecuteStatement: org.apache.thrift.transport.TTransportException: java.net.SocketException: Socket closed
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129)
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at org.apache.thrift.transport.TSaslTransport.readLength(TSaslTransport.java:376)
at org.apache.thrift.transport.TSaslTransport.readFrame(TSaslTransport.java:453)
at org.apache.thrift.transport.TSaslTransport.read(TSaslTransport.java:435)
at org.apache.thrift.transport.TSaslClientTransport.read(TSaslClientTransport.java:37)
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429)
at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318)
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:77)
at org.apache.hive.service.rpc.thrift.TCLIService$Client.recv_GetOperationStatus(TCLIService.java:475)
at org.apache.hive.service.rpc.thrift.TCLIService$Client.GetOperationStatus(TCLIService.java:462)
at org.apache.kyuubi.client.KyuubiSyncThriftClient.$anonfun$getOperationStatus$1(KyuubiSyncThriftClient.scala:377)
at org.apache.kyuubi.client.KyuubiSyncThriftClient.$anonfun$withLockAcquiredAsyncRequest$2(KyuubiSyncThriftClient.scala:151)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.net.SocketException: Socket closed
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127)
... 20 more
at org.apache.kyuubi.KyuubiSQLException$.apply(KyuubiSQLException.scala:69)
at org.apache.kyuubi.operation.KyuubiOperation$$anonfun$onError$1.applyOrElse(KyuubiOperation.scala:76)
at org.apache.kyuubi.operation.KyuubiOperation$$anonfun$onError$1.applyOrElse(KyuubiOperation.scala:57)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
at org.apache.kyuubi.operation.ExecuteStatement.waitStatementComplete(ExecuteStatement.scala:144)
at org.apache.kyuubi.operation.ExecuteStatement.$anonfun$runInternal$1(ExecuteStatement.scala:161)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.thrift.transport.TTransportException: java.net.SocketException: Socket closed
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:129)
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at org.apache.thrift.transport.TSaslTransport.readLength(TSaslTransport.java:376)
at org.apache.thrift.transport.TSaslTransport.readFrame(TSaslTransport.java:453)
at org.apache.thrift.transport.TSaslTransport.read(TSaslTransport.java:435)
at org.apache.thrift.transport.TSaslClientTransport.read(TSaslClientTransport.java:37)
at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429)
at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318)
at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:77)
at org.apache.hive.service.rpc.thrift.TCLIService$Client.recv_GetOperationStatus(TCLIService.java:475)
at org.apache.hive.service.rpc.thrift.TCLIService$Client.GetOperationStatus(TCLIService.java:462)
at org.apache.kyuubi.client.KyuubiSyncThriftClient.$anonfun$getOperationStatus$1(KyuubiSyncThriftClient.scala:377)
at org.apache.kyuubi.client.KyuubiSyncThriftClient.$anonfun$withLockAcquiredAsyncRequest$2(KyuubiSyncThriftClient.scala:151)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
... 3 more
Caused by: java.net.SocketException: Socket closed
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127)
... 20 more (state=,code=0)
Closing: 0: jdbc:hive2://lndcndevms1510.nhnjp.ism:10009/default?socketTimeout=60000;#spark.yarn.queue=shared
```
- [ ] [Run test](https://kyuubi.readthedocs.io/en/master/develop_tools/testing.html#running-tests) locally before make a pull request
The kyuubi-server suite wasn't working even before this PR.
Closes #4480 from risyomei/feature/kill-when-engine-fail.
Closes #4480
f61102274 [risyomei] Fixed Test messages(3)
3ed6886f1 [risyomei] Fixed Test messages(2)
30675d088 [risyomei] Fixed Test messages
d80cfcb45 [Xieming LI] Merge remote-tracking branch 'origin/master' into feature/kill-when-engine-fail
3b320e1e2 [Xieming LI] Make sure thread is shut down
5ee9ddf5e [Xieming LI] Commit for work
8c854cc6d [Cheng Pan] Update kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala
564e09cf6 [Xieming LI] Fixed based on Comment
ae1a02552 [Xieming LI] Fixed unused import
1b50a5986 [Xieming LI] rebase origin/master
d9f9e3c93 [Xieming LI] Optimized logic based on PR comment
a5422855f [Xieming LI] Fixe based on comment
04b9ff3a3 [Xieming LI] Added an option to kill the thrift connection when Engine is considered dead
3571d68fd [Xieming LI] Fixe based on comment
e40fcfcd4 [Xieming LI] Added an option to kill the thrift connection when Engine is considered dead
Lead-authored-by: Xieming LI <ri...@gmail.com>
Co-authored-by: risyomei <ri...@gmail.com>
Co-authored-by: Cheng Pan <pa...@gmail.com>
Signed-off-by: fwang12 <fw...@ebay.com>
---
.../org/apache/kyuubi/client/KyuubiSyncThriftClient.scala | 14 ++++++++++++++
.../kyuubi/operation/KyuubiOperationPerUserSuite.scala | 6 ++++--
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala b/kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala
index 12a4c824c..8b8561fa9 100644
--- a/kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala
+++ b/kyuubi-server/src/main/scala/org/apache/kyuubi/client/KyuubiSyncThriftClient.scala
@@ -59,6 +59,7 @@ class KyuubiSyncThriftClient private (
@volatile private var _aliveProbeSessionHandle: TSessionHandle = _
@volatile private var remoteEngineBroken: Boolean = false
+ @volatile private var clientClosedOnEngineBroken: Boolean = false
private val engineAliveProbeClient = engineAliveProbeProtocol.map(new TCLIService.Client(_))
private var engineAliveThreadPool: ScheduledExecutorService = _
@volatile private var engineLastAlive: Long = _
@@ -109,6 +110,18 @@ class KyuubiSyncThriftClient private (
}
} else {
shutdownAsyncRequestExecutor()
+ warn(s"Removing Clients for ${_remoteSessionHandle}")
+ Seq(protocol).union(engineAliveProbeProtocol.toSeq).foreach { tProtocol =>
+ Utils.tryLogNonFatalError {
+ if (tProtocol.getTransport.isOpen) {
+ tProtocol.getTransport.close()
+ }
+ }
+ clientClosedOnEngineBroken = true
+ Option(engineAliveThreadPool).foreach { pool =>
+ ThreadUtils.shutdown(pool, Duration(engineAliveProbeInterval, TimeUnit.MILLISECONDS))
+ }
+ }
}
}
}
@@ -199,6 +212,7 @@ class KyuubiSyncThriftClient private (
}
def closeSession(): Unit = {
+ if (clientClosedOnEngineBroken) return
try {
if (_remoteSessionHandle != null) {
val req = new TCloseSessionReq(_remoteSessionHandle)
diff --git a/kyuubi-server/src/test/scala/org/apache/kyuubi/operation/KyuubiOperationPerUserSuite.scala b/kyuubi-server/src/test/scala/org/apache/kyuubi/operation/KyuubiOperationPerUserSuite.scala
index 40bb165b8..21bf56b4f 100644
--- a/kyuubi-server/src/test/scala/org/apache/kyuubi/operation/KyuubiOperationPerUserSuite.scala
+++ b/kyuubi-server/src/test/scala/org/apache/kyuubi/operation/KyuubiOperationPerUserSuite.scala
@@ -200,9 +200,11 @@ class KyuubiOperationPerUserSuite
val executeStmtResp = client.ExecuteStatement(executeStmtReq)
assert(executeStmtResp.getStatus.getStatusCode === TStatusCode.ERROR_STATUS)
assert(executeStmtResp.getStatus.getErrorMessage.contains(
- "java.net.SocketException: Connection reset") ||
+ "java.net.SocketException") ||
executeStmtResp.getStatus.getErrorMessage.contains(
- "Caused by: java.net.SocketException: Broken pipe (Write failed)"))
+ "org.apache.thrift.transport.TTransportException") ||
+ executeStmtResp.getStatus.getErrorMessage.contains(
+ "connection does not exist"))
val elapsedTime = System.currentTimeMillis() - startTime
assert(elapsedTime < 20 * 1000)
assert(session.client.asyncRequestInterrupted)