You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@kylin.apache.org by lxw <lx...@qq.com> on 2016/08/19 05:34:15 UTC

Cube query problem

Hi,experts:

   After creating and building the cube successfully, I used SQL to query it and got this error:

Error while executing SQL "select advid_, sum(pv_) as pv  from lxw1234.t_table  group by advid_   limit 10": Error in coprocessor                 
   
   I use kylin-1.5.3 and hbase-0.98.6-cdh5.2.0, and some other cubes can be queried normally.

   The error log is :

2016-08-19 13:24:55,862 WARN  [kylin-coproc--pool5-t19] ipc.CoprocessorRpcChannel:59 : Call failed on IOException
org.apache.hadoop.hbase.DoNotRetryIOException: org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
        at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(RpcExecutor.java:114)
        at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.java:94)
        at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:328)
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:283)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:322)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:295)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(CubeVisitProtos.java:4353)
        at org.apache.hadoop.hbase.regionserver.HRegion.execService(HRegion.java:5602)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execServiceOnRegion(HRegionServer.java:3416)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execService(HRegionServer.java:3398)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
        ... 4 more

        at sun.reflect.GeneratedConstructorAccessor86.newInstance(Unknown Source)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
        at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:95)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.getRemoteException(ProtobufUtil.java:304)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.execService(ProtobufUtil.java:1627)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(RegionCoprocessorRpcChannel.java:93)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(RegionCoprocessorRpcChannel.java:90)
        at org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:114)
        at org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:90)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.callExecService(RegionCoprocessorRpcChannel.java:96)
        at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(CoprocessorRpcChannel.java:57)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.visitCube(CubeVisitProtos.java:4400)
        at org.apache.kylin.storage.hbase.cube.v2.CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
        at org.apache.kylin.storage.hbase.cube.v2.CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
        at org.apache.hadoop.hbase.client.HTable$16.call(HTable.java:1542)
        at java.util.concurrent.FutureTask.run(FutureTask.java:262)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)
Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
        at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(RpcExecutor.java:114)
        at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.java:94)
        at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:328)
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:283)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:322)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:295)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(CubeVisitProtos.java:4353)
        at org.apache.hadoop.hbase.regionserver.HRegion.execService(HRegion.java:5602)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execServiceOnRegion(HRegionServer.java:3416)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execService(HRegionServer.java:3398)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
        ... 4 more

        at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1457)
        at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(RpcClient.java:1661)
        at org.apache.hadoop.hbase.ipc.RpcClient$BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$BlockingStub.execService(ClientProtos.java:30014)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.execService(ProtobufUtil.java:1623)
        ... 14 more
2016-08-19 13:24:55,864 WARN  [pool-8-thread-9] client.HTable:1556 : Error calling coprocessor service org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService for row \x00\x00
java.util.concurrent.ExecutionException: org.apache.hadoop.hbase.DoNotRetryIOException: org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
        at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(RpcExecutor.java:114)
        at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.java:94)
        at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:328)
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:283)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:322)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:295)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(CubeVisitProtos.java:4353)
        at org.apache.hadoop.hbase.regionserver.HRegion.execService(HRegion.java:5602)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execServiceOnRegion(HRegionServer.java:3416)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execService(HRegionServer.java:3398)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
        ... 4 more

        at java.util.concurrent.FutureTask.report(FutureTask.java:122)
        at java.util.concurrent.FutureTask.get(FutureTask.java:188)
        at org.apache.hadoop.hbase.client.HTable.coprocessorService(HTable.java:1554)
        at org.apache.kylin.storage.hbase.cube.v2.CubeHBaseEndpointRPC$1.run(CubeHBaseEndpointRPC.java:366)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
        at java.util.concurrent.FutureTask.run(FutureTask.java:262)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)
Caused by: org.apache.hadoop.hbase.DoNotRetryIOException: org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
        at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(RpcExecutor.java:114)
        at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.java:94)
        at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:328)
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:283)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:322)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:295)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(CubeVisitProtos.java:4353)
        at org.apache.hadoop.hbase.regionserver.HRegion.execService(HRegion.java:5602)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execServiceOnRegion(HRegionServer.java:3416)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execService(HRegionServer.java:3398)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
        ... 4 more

        at sun.reflect.GeneratedConstructorAccessor86.newInstance(Unknown Source)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
        at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:95)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.getRemoteException(ProtobufUtil.java:304)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.execService(ProtobufUtil.java:1627)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(RegionCoprocessorRpcChannel.java:93)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(RegionCoprocessorRpcChannel.java:90)
        at org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:114)
        at org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:90)
        at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.callExecService(RegionCoprocessorRpcChannel.java:96)
        at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(CoprocessorRpcChannel.java:57)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.visitCube(CubeVisitProtos.java:4400)
        at org.apache.kylin.storage.hbase.cube.v2.CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
        at org.apache.kylin.storage.hbase.cube.v2.CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
        at org.apache.hadoop.hbase.client.HTable$16.call(HTable.java:1542)
        ... 4 more
Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
        at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
        at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(RpcExecutor.java:114)
        at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.java:94)
        at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.kylin.metadata.datatype.DataType
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:328)
        at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.java:283)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:322)
        at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(GTScanRequest.java:295)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
        at org.apache.kylin.storage.hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(CubeVisitProtos.java:4353)
        at org.apache.hadoop.hbase.regionserver.HRegion.execService(HRegion.java:5602)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execServiceOnRegion(HRegionServer.java:3416)
        at org.apache.hadoop.hbase.regionserver.HRegionServer.execService(HRegionServer.java:3398)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
        at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
        ... 4 more

        at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1457)
        at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(RpcClient.java:1661)
        at org.apache.hadoop.hbase.ipc.RpcClient$BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
        at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$BlockingStub.execService(ClientProtos.java:30014)
        at org.apache.hadoop.hbase.protobuf.ProtobufUtil.execService(ProtobufUtil.java:1623)
        ... 14 more
2016-08-19 13:24:55,866 ERROR [pool-8-thread-9] v2.CubeHBaseEndpointRPC:404 : <sub-thread for GTScanRequest 80b872a> Error when visiting cubes by endpoint

Re: Cube query problem

Posted by Alberto Ramón <a....@gmail.com>.
Did you install Kylin 1.5.3 for HBase 1.x or for 0.9x? Note that there are
two versions of Kylin, one for each HBase line.
I don't have more ideas, good luck

El 27/8/2016 15:02, "Li Yang" <li...@apache.org> escribió:

> Haven't tested CDH 5.2, but we know CDH 5.7 works well with Kylin.
>
> On Fri, Aug 19, 2016 at 1:34 PM, lxw <lx...@qq.com> wrote:
>
> > Hi,experts:
> >
> >    After create and build successfully, I use SQL to query the cube and
> > got error:
> >
> > Error while executing SQL "select advid_, sum(pv_) as pv  from
> > lxw1234.t_table  group by advid_   limit 10": Error in coprocessor
> >
> >    I use kylin-1.5.3、hbase-0.98.6-cdh5.2.0,  and some other cubes can be
> > normal query .
> >
> >    The error log is :
> >
> > 2016-08-19 13:24:55,862 WARN  [kylin-coproc--pool5-t19]
> > ipc.CoprocessorRpcChannel:59 : Call failed on IOException
> > org.apache.hadoop.hbase.DoNotRetryIOException: org.apache.hadoop.hbase.
> DoNotRetryIOException:
> > java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2067)
> >         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.
> java:108)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> > RpcExecutor.java:114)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> > java:94)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:328)
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:283)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:322)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:295)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> > CubeVisitProtos.java:4353)
> >         at org.apache.hadoop.hbase.regionserver.HRegion.
> > execService(HRegion.java:5602)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execServiceOnRegion(HRegionServer.java:3416)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execService(HRegionServer.java:3398)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2031)
> >         ... 4 more
> >
> >         at sun.reflect.GeneratedConstructorAccessor86
> .newInstance(Unknown
> > Source)
> >         at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(
> > DelegatingConstructorAccessorImpl.java:45)
> >         at java.lang.reflect.Constructor.newInstance(Constructor.java:
> 526)
> >         at org.apache.hadoop.ipc.RemoteException.instantiateException(
> > RemoteException.java:106)
> >         at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(
> > RemoteException.java:95)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > getRemoteException(ProtobufUtil.java:304)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > execService(ProtobufUtil.java:1627)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.
> call(
> > RegionCoprocessorRpcChannel.java:93)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.
> call(
> > RegionCoprocessorRpcChannel.java:90)
> >         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> > callWithRetries(RpcRetryingCaller.java:114)
> >         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> > callWithRetries(RpcRetryingCaller.java:90)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.
> > callExecService(RegionCoprocessorRpcChannel.java:96)
> >         at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(
> > CoprocessorRpcChannel.java:57)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.
> > visitCube(CubeVisitProtos.java:4400)
> >         at org.apache.kylin.storage.hbase.cube.v2.
> > CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
> >         at org.apache.kylin.storage.hbase.cube.v2.
> > CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
> >         at org.apache.hadoop.hbase.client.HTable$16.call(HTable.
> java:1542)
> >         at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> >         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> > ThreadPoolExecutor.java:1145)
> >         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> > ThreadPoolExecutor.java:615)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.
> > apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.
> DoNotRetryIOException:
> > java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2067)
> >         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.
> java:108)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> > RpcExecutor.java:114)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> > java:94)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:328)
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:283)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:322)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:295)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> > CubeVisitProtos.java:4353)
> >         at org.apache.hadoop.hbase.regionserver.HRegion.
> > execService(HRegion.java:5602)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execServiceOnRegion(HRegionServer.java:3416)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execService(HRegionServer.java:3398)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2031)
> >         ... 4 more
> >
> >         at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:
> 1457)
> >         at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(
> > RpcClient.java:1661)
> >         at org.apache.hadoop.hbase.ipc.RpcClient$
> > BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$BlockingStub.execService(
> > ClientProtos.java:30014)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > execService(ProtobufUtil.java:1623)
> >         ... 14 more
> > 2016-08-19 13:24:55,864 WARN  [pool-8-thread-9] client.HTable:1556 :
> Error
> > calling coprocessor service org.apache.kylin.storage.
> > hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$
> CubeVisitService
> > for row \x00\x00
> > java.util.concurrent.ExecutionException: org.apache.hadoop.hbase.
> DoNotRetryIOException:
> > org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.
> NoClassDefFoundError:
> > Could not initialize class org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2067)
> >         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.
> java:108)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> > RpcExecutor.java:114)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> > java:94)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:328)
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:283)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:322)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:295)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> > CubeVisitProtos.java:4353)
> >         at org.apache.hadoop.hbase.regionserver.HRegion.
> > execService(HRegion.java:5602)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execServiceOnRegion(HRegionServer.java:3416)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execService(HRegionServer.java:3398)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2031)
> >         ... 4 more
> >
> >         at java.util.concurrent.FutureTask.report(FutureTask.java:122)
> >         at java.util.concurrent.FutureTask.get(FutureTask.java:188)
> >         at org.apache.hadoop.hbase.client.HTable.
> > coprocessorService(HTable.java:1554)
> >         at org.apache.kylin.storage.hbase.cube.v2.
> > CubeHBaseEndpointRPC$1.run(CubeHBaseEndpointRPC.java:366)
> >         at java.util.concurrent.Executors$RunnableAdapter.
> > call(Executors.java:471)
> >         at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> >         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> > ThreadPoolExecutor.java:1145)
> >         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> > ThreadPoolExecutor.java:615)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: org.apache.hadoop.hbase.DoNotRetryIOException:
> > org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.
> NoClassDefFoundError:
> > Could not initialize class org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2067)
> >         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.
> java:108)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> > RpcExecutor.java:114)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> > java:94)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:328)
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:283)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:322)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:295)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> > CubeVisitProtos.java:4353)
> >         at org.apache.hadoop.hbase.regionserver.HRegion.
> > execService(HRegion.java:5602)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execServiceOnRegion(HRegionServer.java:3416)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execService(HRegionServer.java:3398)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2031)
> >         ... 4 more
> >
> >         at sun.reflect.GeneratedConstructorAccessor86
> .newInstance(Unknown
> > Source)
> >         at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(
> > DelegatingConstructorAccessorImpl.java:45)
> >         at java.lang.reflect.Constructor.newInstance(Constructor.java:
> 526)
> >         at org.apache.hadoop.ipc.RemoteException.instantiateException(
> > RemoteException.java:106)
> >         at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(
> > RemoteException.java:95)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > getRemoteException(ProtobufUtil.java:304)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > execService(ProtobufUtil.java:1627)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.
> call(
> > RegionCoprocessorRpcChannel.java:93)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.
> call(
> > RegionCoprocessorRpcChannel.java:90)
> >         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> > callWithRetries(RpcRetryingCaller.java:114)
> >         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> > callWithRetries(RpcRetryingCaller.java:90)
> >         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.
> > callExecService(RegionCoprocessorRpcChannel.java:96)
> >         at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(
> > CoprocessorRpcChannel.java:57)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.
> > visitCube(CubeVisitProtos.java:4400)
> >         at org.apache.kylin.storage.hbase.cube.v2.
> > CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
> >         at org.apache.kylin.storage.hbase.cube.v2.
> > CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
> >         at org.apache.hadoop.hbase.client.HTable$16.call(HTable.
> java:1542)
> >         ... 4 more
> > Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.
> > apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.
> DoNotRetryIOException:
> > java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2067)
> >         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.
> java:108)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> > RpcExecutor.java:114)
> >         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> > java:94)
> >         at java.lang.Thread.run(Thread.java:744)
> > Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> > org.apache.kylin.metadata.datatype.DataType
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:328)
> >         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> > java:283)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:322)
> >         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> > GTScanRequest.java:295)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
> >         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> > endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> > CubeVisitProtos.java:4353)
> >         at org.apache.hadoop.hbase.regionserver.HRegion.
> > execService(HRegion.java:5602)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execServiceOnRegion(HRegionServer.java:3416)
> >         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> > execService(HRegionServer.java:3398)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
> >         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:
> 2031)
> >         ... 4 more
> >
> >         at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:
> 1457)
> >         at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(
> > RpcClient.java:1661)
> >         at org.apache.hadoop.hbase.ipc.RpcClient$
> > BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
> >         at org.apache.hadoop.hbase.protobuf.generated.
> > ClientProtos$ClientService$BlockingStub.execService(
> > ClientProtos.java:30014)
> >         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> > execService(ProtobufUtil.java:1623)
> >         ... 14 more
> > 2016-08-19 13:24:55,866 ERROR [pool-8-thread-9]
> > v2.CubeHBaseEndpointRPC:404 : <sub-thread for GTScanRequest 80b872a>
> Error
> > when visiting cubes by endpoint
>

Re: Cube query problem

Posted by Li Yang <li...@apache.org>.
Haven't tested CDH 5.2, but we know CDH 5.7 works well with Kylin.

On Fri, Aug 19, 2016 at 1:34 PM, lxw <lx...@qq.com> wrote:

> Hi,experts:
>
>    After create and build successfully, I use SQL to query the cube and
> got error:
>
> Error while executing SQL "select advid_, sum(pv_) as pv  from
> lxw1234.t_table  group by advid_   limit 10": Error in coprocessor
>
>    I use kylin-1.5.3、hbase-0.98.6-cdh5.2.0,  and some other cubes can be
> normal query .
>
>    The error log is :
>
> 2016-08-19 13:24:55,862 WARN  [kylin-coproc--pool5-t19]
> ipc.CoprocessorRpcChannel:59 : Call failed on IOException
> org.apache.hadoop.hbase.DoNotRetryIOException: org.apache.hadoop.hbase.DoNotRetryIOException:
> java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> RpcExecutor.java:114)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> java:94)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:328)
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:283)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:322)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:295)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> CubeVisitProtos.java:4353)
>         at org.apache.hadoop.hbase.regionserver.HRegion.
> execService(HRegion.java:5602)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execServiceOnRegion(HRegionServer.java:3416)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execService(HRegionServer.java:3398)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
>         ... 4 more
>
>         at sun.reflect.GeneratedConstructorAccessor86.newInstance(Unknown
> Source)
>         at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(
> DelegatingConstructorAccessorImpl.java:45)
>         at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
>         at org.apache.hadoop.ipc.RemoteException.instantiateException(
> RemoteException.java:106)
>         at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(
> RemoteException.java:95)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> getRemoteException(ProtobufUtil.java:304)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> execService(ProtobufUtil.java:1627)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(
> RegionCoprocessorRpcChannel.java:93)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(
> RegionCoprocessorRpcChannel.java:90)
>         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> callWithRetries(RpcRetryingCaller.java:114)
>         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> callWithRetries(RpcRetryingCaller.java:90)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.
> callExecService(RegionCoprocessorRpcChannel.java:96)
>         at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(
> CoprocessorRpcChannel.java:57)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.
> visitCube(CubeVisitProtos.java:4400)
>         at org.apache.kylin.storage.hbase.cube.v2.
> CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
>         at org.apache.kylin.storage.hbase.cube.v2.
> CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
>         at org.apache.hadoop.hbase.client.HTable$16.call(HTable.java:1542)
>         at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.
> apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.DoNotRetryIOException:
> java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> RpcExecutor.java:114)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> java:94)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:328)
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:283)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:322)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:295)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> CubeVisitProtos.java:4353)
>         at org.apache.hadoop.hbase.regionserver.HRegion.
> execService(HRegion.java:5602)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execServiceOnRegion(HRegionServer.java:3416)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execService(HRegionServer.java:3398)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
>         ... 4 more
>
>         at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1457)
>         at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(
> RpcClient.java:1661)
>         at org.apache.hadoop.hbase.ipc.RpcClient$
> BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$BlockingStub.execService(
> ClientProtos.java:30014)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> execService(ProtobufUtil.java:1623)
>         ... 14 more
> 2016-08-19 13:24:55,864 WARN  [pool-8-thread-9] client.HTable:1556 : Error
> calling coprocessor service org.apache.kylin.storage.
> hbase.cube.v2.coprocessor.endpoint.generated.CubeVisitProtos$CubeVisitService
> for row \x00\x00
> java.util.concurrent.ExecutionException: org.apache.hadoop.hbase.DoNotRetryIOException:
> org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError:
> Could not initialize class org.apache.kylin.metadata.datatype.DataType
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> RpcExecutor.java:114)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> java:94)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:328)
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:283)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:322)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:295)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> CubeVisitProtos.java:4353)
>         at org.apache.hadoop.hbase.regionserver.HRegion.
> execService(HRegion.java:5602)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execServiceOnRegion(HRegionServer.java:3416)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execService(HRegionServer.java:3398)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
>         ... 4 more
>
>         at java.util.concurrent.FutureTask.report(FutureTask.java:122)
>         at java.util.concurrent.FutureTask.get(FutureTask.java:188)
>         at org.apache.hadoop.hbase.client.HTable.
> coprocessorService(HTable.java:1554)
>         at org.apache.kylin.storage.hbase.cube.v2.
> CubeHBaseEndpointRPC$1.run(CubeHBaseEndpointRPC.java:366)
>         at java.util.concurrent.Executors$RunnableAdapter.
> call(Executors.java:471)
>         at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: org.apache.hadoop.hbase.DoNotRetryIOException:
> org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoClassDefFoundError:
> Could not initialize class org.apache.kylin.metadata.datatype.DataType
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> RpcExecutor.java:114)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> java:94)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:328)
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:283)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:322)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:295)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> CubeVisitProtos.java:4353)
>         at org.apache.hadoop.hbase.regionserver.HRegion.
> execService(HRegion.java:5602)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execServiceOnRegion(HRegionServer.java:3416)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execService(HRegionServer.java:3398)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
>         ... 4 more
>
>         at sun.reflect.GeneratedConstructorAccessor86.newInstance(Unknown
> Source)
>         at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(
> DelegatingConstructorAccessorImpl.java:45)
>         at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
>         at org.apache.hadoop.ipc.RemoteException.instantiateException(
> RemoteException.java:106)
>         at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(
> RemoteException.java:95)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> getRemoteException(ProtobufUtil.java:304)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> execService(ProtobufUtil.java:1627)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(
> RegionCoprocessorRpcChannel.java:93)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel$1.call(
> RegionCoprocessorRpcChannel.java:90)
>         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> callWithRetries(RpcRetryingCaller.java:114)
>         at org.apache.hadoop.hbase.client.RpcRetryingCaller.
> callWithRetries(RpcRetryingCaller.java:90)
>         at org.apache.hadoop.hbase.ipc.RegionCoprocessorRpcChannel.
> callExecService(RegionCoprocessorRpcChannel.java:96)
>         at org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel.callMethod(
> CoprocessorRpcChannel.java:57)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService$Stub.
> visitCube(CubeVisitProtos.java:4400)
>         at org.apache.kylin.storage.hbase.cube.v2.
> CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:371)
>         at org.apache.kylin.storage.hbase.cube.v2.
> CubeHBaseEndpointRPC$1$1.call(CubeHBaseEndpointRPC.java:367)
>         at org.apache.hadoop.hbase.client.HTable$16.call(HTable.java:1542)
>         ... 4 more
> Caused by: org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.
> apache.hadoop.hbase.DoNotRetryIOException): org.apache.hadoop.hbase.DoNotRetryIOException:
> java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2067)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:108)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(
> RpcExecutor.java:114)
>         at org.apache.hadoop.hbase.ipc.RpcExecutor$1.run(RpcExecutor.
> java:94)
>         at java.lang.Thread.run(Thread.java:744)
> Caused by: java.lang.NoClassDefFoundError: Could not initialize class
> org.apache.kylin.metadata.datatype.DataType
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:328)
>         at org.apache.kylin.gridtable.GTInfo$1.deserialize(GTInfo.
> java:283)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:322)
>         at org.apache.kylin.gridtable.GTScanRequest$2.deserialize(
> GTScanRequest.java:295)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.CubeVisitService.visitCube(CubeVisitService.java:189)
>         at org.apache.kylin.storage.hbase.cube.v2.coprocessor.
> endpoint.generated.CubeVisitProtos$CubeVisitService.callMethod(
> CubeVisitProtos.java:4353)
>         at org.apache.hadoop.hbase.regionserver.HRegion.
> execService(HRegion.java:5602)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execServiceOnRegion(HRegionServer.java:3416)
>         at org.apache.hadoop.hbase.regionserver.HRegionServer.
> execService(HRegionServer.java:3398)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29591)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2031)
>         ... 4 more
>
>         at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1457)
>         at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(
> RpcClient.java:1661)
>         at org.apache.hadoop.hbase.ipc.RpcClient$
> BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
>         at org.apache.hadoop.hbase.protobuf.generated.
> ClientProtos$ClientService$BlockingStub.execService(
> ClientProtos.java:30014)
>         at org.apache.hadoop.hbase.protobuf.ProtobufUtil.
> execService(ProtobufUtil.java:1623)
>         ... 14 more
> 2016-08-19 13:24:55,866 ERROR [pool-8-thread-9]
> v2.CubeHBaseEndpointRPC:404 : <sub-thread for GTScanRequest 80b872a> Error
> when visiting cubes by endpoint

Re: Precisely Count Distinct on 100 million string values column

Posted by Yiming Liu <li...@gmail.com>.
Thanks Yerui for the response.

2016-08-25 11:55 GMT+08:00 Yerui Sun <su...@gmail.com>:

> lxw,
> If the values exceed Integer.MAX_VALUE, an exception will be thrown when
> building the dictionary.
>
> You can firstly disable cube and then edit the json on web ui. The action
> button is in the ‘Admins’ of cube list table.
>
> BTW, the 255 limitation could be removed in theory, however, that made the
> logic more complicated. You can have a try and contribute the patch if
> you’re interested.
>
> Yiming,
> I will post a patch for more clearly exception message and some minor
> improve of GlobalDictionary.
> But maybe later, it’s quite a busy week...
>
> > 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
> >
> > Sorry,
> >
> > About question 1,
> > I mean: if the count of distinct values of the column data across all segments
> exceeds Integer.MAX_VALUE, what will happen?
> >
> >
> >
> > ------------------ 原始邮件 ------------------
> > 发件人: "lxw";<lx...@qq.com>;
> > 发送时间: 2016年8月25日(星期四) 上午10:01
> > 收件人: "dev"<de...@kylin.apache.org>;
> >
> > 主题: 回复: Precisely Count Distinct on 100 million string values column
> >
> >
> >
> > I have 2 more questions:
> >
> > 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count
> distinct values of column data cross all segments, what will be happened?
> duplication or error ?
> >
> > 2. Where I can manually edit a cube desc json? Now I use JAVA API to
> create or update cube.
> >
> > Thanks!
> >
> >
> >
> > ------------------ 原始邮件 ------------------
> > 发件人: "Yiming Liu";<li...@gmail.com>;
> > 发送时间: 2016年8月25日(星期四) 上午9:41
> > 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>;
> >
> > 主题: Re: Precisely Count Distinct on 100 million string values column
> >
> >
> >
> > Good found.
> >
> > The code AppendTrieDictionary line 604:
> >
> > // nValueBytes
> > if (n.part.length > 255)
> >    throw new RuntimeException();
> >
> > Hi Yerui,
> >
> > Could you add more comments for the 255 limit, with more meaningful
> exception?
> >
> >
> > 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
> >
> >> It caused by length(USER_ID) > 255.
> >> After exclude these dirty data, it works .
> >>
> >>
> >> Total 150 million records, execute this query:
> >>
> >> select city_code,
> >> sum(bid_request) as bid_request,
> >> count(distinct user_id) as uv
> >> from liuxiaowen.TEST_T_PBS_UV_FACT
> >> group by city_code
> >> order by uv desc limit 100
> >>
> >> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
> >>
> >>
> >>
> >> ------------------ Original ------------------
> >> From:  "lxw";<lx...@qq.com>;
> >> Date:  Wed, Aug 24, 2016 05:27 PM
> >> To:  "dev"<de...@kylin.apache.org>;
> >>
> >> Subject:  Precisely Count Distinct on 100 million string values column
> >>
> >>
> >>
> >> Hi,
> >>
> >>    I am trying to use "Precisely Count Distinct" on 100 million string
> >> values column "USER_ID", I updated the cube json :
> >> "dictionaries": [     {       "column": "USER_ID",       "builder":
> >> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
> >>
> >> "override_kylin_properties": {     "kylin.job.mr.config.override.
> mapred.map.child.java.opts":
> >> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
> >> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
> >> Build Dimension Dictionary",
> >>  the error log in "kylin.log" :
> >>
> >> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239
> :
> >> write value into /kylin_test1/kylin_metadata_
> test1/resources/GlobalDict/
> >> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
> >> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception:
> java.lang.RuntimeException
> >> java.lang.RuntimeException
> >>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> >> build_writeNode(AppendTrieDictionary.java:605)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> >> buildTrieBytes(AppendTrieDictionary.java:576)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> >> write(AppendTrieDictionary.java:523)
> >>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
> >> CachedTreeMap.java:234)
> >>        at org.apache.kylin.dict.CachedTreeMap.write(
> >> CachedTreeMap.java:374)
> >>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> >> AppendTrieDictionary.java:1043)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> >> build(AppendTrieDictionary.java:954)
> >>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> >> GlobalDictionaryBuilder.java:82)
> >>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> >> DictionaryGenerator.java:81)
> >>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> >> DictionaryManager.java:323)
> >>        at org.apache.kylin.cube.CubeManager.buildDictionary(
> >> CubeManager.java:185)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:51)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:42)
> >>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> >> CreateDictionaryJob.java:56)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
> >>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> >> doWork(HadoopShellExecutable.java:63)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.execution.DefaultChainedExecutable.
> doWork(
> >> DefaultChainedExecutable.java:57)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> >> JobRunner.run(DefaultScheduler.java:127)
> >>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
> >> ThreadPoolExecutor.java:1145)
> >>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> >> ThreadPoolExecutor.java:615)
> >>        at java.lang.Thread.run(Thread.java:744)
> >> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
> >> common.HadoopShellExecutable:65 : error execute
> HadoopShellExecutable{id=
> >> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension
> Dictionary,
> >> state=RUNNING}
> >> java.lang.RuntimeException
> >>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
> >> CachedTreeMap.java:240)
> >>        at org.apache.kylin.dict.CachedTreeMap.write(
> >> CachedTreeMap.java:374)
> >>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> >> AppendTrieDictionary.java:1043)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> >> build(AppendTrieDictionary.java:954)
> >>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> >> GlobalDictionaryBuilder.java:82)
> >>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> >> DictionaryGenerator.java:81)
> >>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> >> DictionaryManager.java:323)
> >>        at org.apache.kylin.cube.CubeManager.buildDictionary(
> >> CubeManager.java:185)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:51)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:42)
> >>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> >> CreateDictionaryJob.java:56)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
> >>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> >> doWork(HadoopShellExecutable.java:63)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.execution.DefaultChainedExecutable.
> doWork(
> >> DefaultChainedExecutable.java:57)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> >> JobRunner.run(DefaultScheduler.java:127)
> >>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
> >> ThreadPoolExecutor.java:1145)
> >>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> >> ThreadPoolExecutor.java:615)
> >>        at java.lang.Thread.run(Thread.java:744)
> >>
> >>    and the error log in "kylin.out" :
> >>
> >> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
> >> processPendingNotifications
> >> WARNING: Exception thrown by removal listener
> >> java.lang.RuntimeException
> >>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
> >> CachedTreeMap.java:240)
> >>        at org.apache.kylin.dict.CachedTreeMap.access$300(
> >> CachedTreeMap.java:52)
> >>        at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
> >> CachedTreeMap.java:149)
> >>        at com.google.common.cache.LocalCache.
> processPendingNotifications(
> >> LocalCache.java:2011)
> >>        at com.google.common.cache.LocalCache$Segment.
> >> runUnlockedCleanup(LocalCache.java:3501)
> >>        at com.google.common.cache.LocalCache$Segment.
> >> postWriteCleanup(LocalCache.java:3477)
> >>        at com.google.common.cache.LocalCache$Segment.put(
> >> LocalCache.java:2940)
> >>        at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
> >>        at com.google.common.cache.LocalCache$LocalManualCache.
> >> put(LocalCache.java:4798)
> >>        at org.apache.kylin.dict.CachedTreeMap.put(
> CachedTreeMap.java:284)
> >>        at org.apache.kylin.dict.CachedTreeMap.put(
> CachedTreeMap.java:52)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> >> addValue(AppendTrieDictionary.java:829)
> >>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> >> addValue(AppendTrieDictionary.java:804)
> >>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> >> GlobalDictionaryBuilder.java:78)
> >>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> >> DictionaryGenerator.java:81)
> >>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> >> DictionaryManager.java:323)
> >>        at org.apache.kylin.cube.CubeManager.buildDictionary(
> >> CubeManager.java:185)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:51)
> >>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> >> processSegment(DictionaryGeneratorCLI.java:42)
> >>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> >> CreateDictionaryJob.java:56)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
> >>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
> >>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> >> doWork(HadoopShellExecutable.java:63)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.execution.DefaultChainedExecutable.
> doWork(
> >> DefaultChainedExecutable.java:57)
> >>        at org.apache.kylin.job.execution.AbstractExecutable.
> >> execute(AbstractExecutable.java:112)
> >>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> >> JobRunner.run(DefaultScheduler.java:127)
> >>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
> >> ThreadPoolExecutor.java:1145)
> >>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> >> ThreadPoolExecutor.java:615)
> >>        at java.lang.Thread.run(Thread.java:744)
> >>
> >> usage: CreateDictionaryJob
> >> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
> >> -input <input>               Input path
> >> -segmentname <segmentname>   Cube segment name
> >>
> >
> >
> >
> > --
> > With Warm regards
> >
> > Yiming Liu (刘一鸣)
>
>


-- 
With Warm regards

Yiming Liu (刘一鸣)

Re: Precisely Count Distinct on 100 million string values column

Posted by Yerui Sun <su...@gmail.com>.
Hmm… You’re right, a hybrid cube couldn’t resolve your problem.

It’s really a challenge to do a count distinct on such a huge dataset.
A possible solution is to expand the dict id from int to bigint, and make RoaringBitmap support bigint. However, that would require quite substantial changes to the current code.

> 在 2016年8月25日,16:40,lxw <lx...@qq.com> 写道:
> 
> 1. Yes, USER_ID has duplicated values between segments; 100 million are new, maybe another 150 million are old, per segment.
> 2. I think the "Hybrid Model" also has a problem in my scenario: just like the "default dictionary across segments" issue, this is "global dictionary across cubes", am I right?
> 
> 
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "Yerui Sun";<su...@gmail.com>;
> 发送时间: 2016年8月25日(星期四) 下午4:22
> 收件人: "dev"<de...@kylin.apache.org>; 
> 
> 主题: Re: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> That depends on your USER_ID cardinality. I think your USER_ID should have duplicated values between segments; that’s why you use count **distinct**. If the USER_ID is always different and shows up only once, a plain count should be fine — no need for count **distinct**.
> 
> If the USER_ID cardinality is indeed over 2 billion, maybe you need to create one cube every 21 days and combine them into one hybrid cube? I’m not sure whether it works; you can check http://kylin.apache.org/blog/2015/09/25/hybrid-model/  and have a try. 
> 
>> 在 2016年8月25日,12:56,lxw <lx...@qq.com> 写道:
>> 
>> Thanks, I got it.
>> 
>> We have 100 million new USER_IDs per day (segment); does that mean that after 21 days, the build task will fail?
>> And we can't use "Precisely Count Distinct" in our scenario?
>> 
>> 
>> 
>> 
>> 
>> ------------------ 原始邮件 ------------------
>> 发件人: "Yerui Sun";<su...@gmail.com>;
>> 发送时间: 2016年8月25日(星期四) 中午11:55
>> 收件人: "dev"<de...@kylin.apache.org>; 
>> 
>> 主题: Re: Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> lxw,
>> If the values exceed Integer.MAX_VALUE, an exception will be thrown when building the dictionary.
>> 
>> You can firstly disable cube and then edit the json on web ui. The action button is in the ‘Admins’ of cube list table.
>> 
>> BTW, the 255 limitation could be removed in theory, however, that made the logic more complicated. You can have a try and contribute the patch if you’re interested.
>> 
>> Yiming,
>> I will post a patch for more clearly exception message and some minor improve of GlobalDictionary. 
>> But maybe later, it’s quite a busy week... 
>> 
>>> 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
>>> 
>>> Sorry, 
>>> 
>>> About question 1, 
>>> I means if count distinct values of column data cross all segments exceed Integer.MAX_VALUE, what will be happened?
>>> 
>>> 
>>> 
>>> ------------------ 原始邮件 ------------------
>>> 发件人: "lxw";<lx...@qq.com>;
>>> 发送时间: 2016年8月25日(星期四) 上午10:01
>>> 收件人: "dev"<de...@kylin.apache.org>; 
>>> 
>>> 主题: 回复: Precisely Count Distinct on 100 million string values column
>>> 
>>> 
>>> 
>>> I have 2 more questions:
>>> 
>>> 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?
>>> 
>>> 2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.
>>> 
>>> Thanks!
>>> 
>>> 
>>> 
>>> ------------------ 原始邮件 ------------------
>>> 发件人: "Yiming Liu";<li...@gmail.com>;
>>> 发送时间: 2016年8月25日(星期四) 上午9:41
>>> 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 
>>> 
>>> 主题: Re: Precisely Count Distinct on 100 million string values column
>>> 
>>> 
>>> 
>>> Good found.
>>> 
>>> The code AppendTrieDictionary line 604:
>>> 
>>> // nValueBytes
>>> if (n.part.length > 255)
>>>  throw new RuntimeException();
>>> 
>>> Hi Yerui,
>>> 
>>> Could you add more comments for the 255 limit, with more meaningful exception?
>>> 
>>> 
>>> 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
>>> 
>>>> It caused by length(USER_ID) > 255.
>>>> After exclude these dirty data, it works .
>>>> 
>>>> 
>>>> Total 150 million records, execute this query:
>>>> 
>>>> select city_code,
>>>> sum(bid_request) as bid_request,
>>>> count(distinct user_id) as uv
>>>> from liuxiaowen.TEST_T_PBS_UV_FACT
>>>> group by city_code
>>>> order by uv desc limit 100
>>>> 
>>>> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>>>> 
>>>> 
>>>> 
>>>> ------------------ Original ------------------
>>>> From:  "lxw";<lx...@qq.com>;
>>>> Date:  Wed, Aug 24, 2016 05:27 PM
>>>> To:  "dev"<de...@kylin.apache.org>;
>>>> 
>>>> Subject:  Precisely Count Distinct on 100 million string values column
>>>> 
>>>> 
>>>> 
>>>> Hi,
>>>> 
>>>>  I am trying to use "Precisely Count Distinct" on 100 million string
>>>> values column "USER_ID", I updated the cube json :
>>>> "dictionaries": [     {       "column": "USER_ID",       "builder":
>>>> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>>>> 
>>>> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
>>>> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
>>>> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
>>>> Build Dimension Dictionary",
>>>> the error log in "kylin.log" :
>>>> 
>>>> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
>>>> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
>>>> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
>>>> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
>>>> java.lang.RuntimeException
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>>> build_writeNode(AppendTrieDictionary.java:605)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>>> buildTrieBytes(AppendTrieDictionary.java:576)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>>> write(AppendTrieDictionary.java:523)
>>>>      at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>>> CachedTreeMap.java:234)
>>>>      at org.apache.kylin.dict.CachedTreeMap.write(
>>>> CachedTreeMap.java:374)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>>> AppendTrieDictionary.java:1043)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>>> build(AppendTrieDictionary.java:954)
>>>>      at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>>> GlobalDictionaryBuilder.java:82)
>>>>      at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>>> DictionaryGenerator.java:81)
>>>>      at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>>> DictionaryManager.java:323)
>>>>      at org.apache.kylin.cube.CubeManager.buildDictionary(
>>>> CubeManager.java:185)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>>      at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>>> CreateDictionaryJob.java:56)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>>      at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>>> doWork(HadoopShellExecutable.java:63)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>>> DefaultChainedExecutable.java:57)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>>> JobRunner.run(DefaultScheduler.java:127)
>>>>      at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>>> ThreadPoolExecutor.java:1145)
>>>>      at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>>> ThreadPoolExecutor.java:615)
>>>>      at java.lang.Thread.run(Thread.java:744)
>>>> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
>>>> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
>>>> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
>>>> state=RUNNING}
>>>> java.lang.RuntimeException
>>>>      at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>>> CachedTreeMap.java:240)
>>>>      at org.apache.kylin.dict.CachedTreeMap.write(
>>>> CachedTreeMap.java:374)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>>> AppendTrieDictionary.java:1043)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>>> build(AppendTrieDictionary.java:954)
>>>>      at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>>> GlobalDictionaryBuilder.java:82)
>>>>      at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>>> DictionaryGenerator.java:81)
>>>>      at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>>> DictionaryManager.java:323)
>>>>      at org.apache.kylin.cube.CubeManager.buildDictionary(
>>>> CubeManager.java:185)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>>      at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>>> CreateDictionaryJob.java:56)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>>      at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>>> doWork(HadoopShellExecutable.java:63)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>>> DefaultChainedExecutable.java:57)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>>> JobRunner.run(DefaultScheduler.java:127)
>>>>      at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>>> ThreadPoolExecutor.java:1145)
>>>>      at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>>> ThreadPoolExecutor.java:615)
>>>>      at java.lang.Thread.run(Thread.java:744)
>>>> 
>>>>  and the error log in "kylin.out" :
>>>> 
>>>> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
>>>> processPendingNotifications
>>>> WARNING: Exception thrown by removal listener
>>>> java.lang.RuntimeException
>>>>      at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>>> CachedTreeMap.java:240)
>>>>      at org.apache.kylin.dict.CachedTreeMap.access$300(
>>>> CachedTreeMap.java:52)
>>>>      at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
>>>> CachedTreeMap.java:149)
>>>>      at com.google.common.cache.LocalCache.processPendingNotifications(
>>>> LocalCache.java:2011)
>>>>      at com.google.common.cache.LocalCache$Segment.
>>>> runUnlockedCleanup(LocalCache.java:3501)
>>>>      at com.google.common.cache.LocalCache$Segment.
>>>> postWriteCleanup(LocalCache.java:3477)
>>>>      at com.google.common.cache.LocalCache$Segment.put(
>>>> LocalCache.java:2940)
>>>>      at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>>>>      at com.google.common.cache.LocalCache$LocalManualCache.
>>>> put(LocalCache.java:4798)
>>>>      at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>>>>      at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>>> addValue(AppendTrieDictionary.java:829)
>>>>      at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>>> addValue(AppendTrieDictionary.java:804)
>>>>      at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>>> GlobalDictionaryBuilder.java:78)
>>>>      at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>>> DictionaryGenerator.java:81)
>>>>      at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>>> DictionaryManager.java:323)
>>>>      at org.apache.kylin.cube.CubeManager.buildDictionary(
>>>> CubeManager.java:185)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>>      at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>>      at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>>> CreateDictionaryJob.java:56)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>>      at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>>      at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>>> doWork(HadoopShellExecutable.java:63)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>>> DefaultChainedExecutable.java:57)
>>>>      at org.apache.kylin.job.execution.AbstractExecutable.
>>>> execute(AbstractExecutable.java:112)
>>>>      at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>>> JobRunner.run(DefaultScheduler.java:127)
>>>>      at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>>> ThreadPoolExecutor.java:1145)
>>>>      at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>>> ThreadPoolExecutor.java:615)
>>>>      at java.lang.Thread.run(Thread.java:744)
>>>> 
>>>> usage: CreateDictionaryJob
>>>> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
>>>> -input <input>               Input path
>>>> -segmentname <segmentname>   Cube segment name
>>>> 
>>> 
>>> 
>>> 
>>> -- 
>>> With Warm regards
>>> 
>>> Yiming Liu (刘一鸣)


回复: Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
1. Yes, USER_ID has duplicated values between segments; 100 million are new, maybe another 150 million are old, per segment.
2. I think the "Hybrid Model" also has a problem in my scenario — just like the "default dictionary across segments" issue, this would be a "global dictionary across cubes" issue, am I right?





------------------ 原始邮件 ------------------
发件人: "Yerui Sun";<su...@gmail.com>;
发送时间: 2016年8月25日(星期四) 下午4:22
收件人: "dev"<de...@kylin.apache.org>; 

主题: Re: Precisely Count Distinct on 100 million string values column



That depends on your USER_ID cardinality. I think your USER_ID should have duplicated values between segments, that’s why you use count **distinct**. If the USER_ID is always different and shows up only once, a plain count should be fine, no need to count **distinct**.

If the USER_ID cardinality is indeed over 2 billion, maybe you need to create one cube every 21 days, and combine them into one hybrid cube? I’m not sure whether it would work; you can check http://kylin.apache.org/blog/2015/09/25/hybrid-model/  and have a try. 

> 在 2016年8月25日,12:56,lxw <lx...@qq.com> 写道:
> 
> Thanks, I got it.
> 
> We have 100 million new USER_IDs per day (segment), that means after 21 days, the building task will be failed?
> And we can't use "Precisely Count Distinct" in out scene?
> 
> 
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "Yerui Sun";<su...@gmail.com>;
> 发送时间: 2016年8月25日(星期四) 中午11:55
> 收件人: "dev"<de...@kylin.apache.org>; 
> 
> 主题: Re: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> lxw,
> If the values exceed Integer.MAX_VALUE, exception will be threw when dictionary building.
> 
> You can firstly disable cube and then edit the json on web ui. The action button is in the ‘Admins’ of cube list table.
> 
> BTW, the 255 limitation could be removed in theory, however, that made the logic more complicated. You can have a try and contribute the patch if you’re interested.
> 
> Yiming,
> I will post a patch for more clearly exception message and some minor improve of GlobalDictionary. 
> But maybe later, it’s quite a busy week... 
> 
>> 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
>> 
>> Sorry, 
>> 
>> About question 1, 
>> I means if count distinct values of column data cross all segments exceed Integer.MAX_VALUE, what will be happened?
>> 
>> 
>> 
>> ------------------ 原始邮件 ------------------
>> 发件人: "lxw";<lx...@qq.com>;
>> 发送时间: 2016年8月25日(星期四) 上午10:01
>> 收件人: "dev"<de...@kylin.apache.org>; 
>> 
>> 主题: 回复: Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> I have 2 more questions:
>> 
>> 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?
>> 
>> 2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.
>> 
>> Thanks!
>> 
>> 
>> 
>> ------------------ 原始邮件 ------------------
>> 发件人: "Yiming Liu";<li...@gmail.com>;
>> 发送时间: 2016年8月25日(星期四) 上午9:41
>> 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 
>> 
>> 主题: Re: Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> Good found.
>> 
>> The code AppendTrieDictionary line 604:
>> 
>> // nValueBytes
>> if (n.part.length > 255)
>>   throw new RuntimeException();
>> 
>> Hi Yerui,
>> 
>> Could you add more comments for the 255 limit, with more meaningful exception?
>> 
>> 
>> 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
>> 
>>> It caused by length(USER_ID) > 255.
>>> After exclude these dirty data, it works .
>>> 
>>> 
>>> Total 150 million records, execute this query:
>>> 
>>> select city_code,
>>> sum(bid_request) as bid_request,
>>> count(distinct user_id) as uv
>>> from liuxiaowen.TEST_T_PBS_UV_FACT
>>> group by city_code
>>> order by uv desc limit 100
>>> 
>>> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>>> 
>>> 
>>> 
>>> ------------------ Original ------------------
>>> From:  "lxw";<lx...@qq.com>;
>>> Date:  Wed, Aug 24, 2016 05:27 PM
>>> To:  "dev"<de...@kylin.apache.org>;
>>> 
>>> Subject:  Precisely Count Distinct on 100 million string values column
>>> 
>>> 
>>> 
>>> Hi,
>>> 
>>>   I am trying to use "Precisely Count Distinct" on 100 million string
>>> values column "USER_ID", I updated the cube json :
>>> "dictionaries": [     {       "column": "USER_ID",       "builder":
>>> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>>> 
>>> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
>>> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
>>> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
>>> Build Dimension Dictionary",
>>> the error log in "kylin.log" :
>>> 
>>> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
>>> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
>>> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
>>> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> build_writeNode(AppendTrieDictionary.java:605)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> buildTrieBytes(AppendTrieDictionary.java:576)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> write(AppendTrieDictionary.java:523)
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:234)
>>>       at org.apache.kylin.dict.CachedTreeMap.write(
>>> CachedTreeMap.java:374)
>>>       at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>> AppendTrieDictionary.java:1043)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> build(AppendTrieDictionary.java:954)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:82)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
>>> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
>>> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
>>> state=RUNNING}
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:240)
>>>       at org.apache.kylin.dict.CachedTreeMap.write(
>>> CachedTreeMap.java:374)
>>>       at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>> AppendTrieDictionary.java:1043)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> build(AppendTrieDictionary.java:954)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:82)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 
>>>   and the error log in "kylin.out" :
>>> 
>>> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
>>> processPendingNotifications
>>> WARNING: Exception thrown by removal listener
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:240)
>>>       at org.apache.kylin.dict.CachedTreeMap.access$300(
>>> CachedTreeMap.java:52)
>>>       at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
>>> CachedTreeMap.java:149)
>>>       at com.google.common.cache.LocalCache.processPendingNotifications(
>>> LocalCache.java:2011)
>>>       at com.google.common.cache.LocalCache$Segment.
>>> runUnlockedCleanup(LocalCache.java:3501)
>>>       at com.google.common.cache.LocalCache$Segment.
>>> postWriteCleanup(LocalCache.java:3477)
>>>       at com.google.common.cache.LocalCache$Segment.put(
>>> LocalCache.java:2940)
>>>       at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>>>       at com.google.common.cache.LocalCache$LocalManualCache.
>>> put(LocalCache.java:4798)
>>>       at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>>>       at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> addValue(AppendTrieDictionary.java:829)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> addValue(AppendTrieDictionary.java:804)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:78)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 
>>> usage: CreateDictionaryJob
>>> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
>>> -input <input>               Input path
>>> -segmentname <segmentname>   Cube segment name
>>> 
>> 
>> 
>> 
>> -- 
>> With Warm regards
>> 
>> Yiming Liu (刘一鸣)

Re: Precisely Count Distinct on 100 million string values column

Posted by Yerui Sun <su...@gmail.com>.
That depends on your USER_ID cardinality. I think your USER_ID should have duplicated values between segments, that’s why you use count **distinct**. If the USER_ID is always different and shows up only once, a plain count should be fine, no need to count **distinct**.

If the USER_ID cardinality is indeed over 2 billion, maybe you need to create one cube every 21 days, and combine them into one hybrid cube? I’m not sure whether it would work; you can check http://kylin.apache.org/blog/2015/09/25/hybrid-model/  and have a try. 

> 在 2016年8月25日,12:56,lxw <lx...@qq.com> 写道:
> 
> Thanks, I got it.
> 
> We have 100 million new USER_IDs per day (segment), that means after 21 days, the building task will be failed?
> And we can't use "Precisely Count Distinct" in out scene?
> 
> 
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "Yerui Sun";<su...@gmail.com>;
> 发送时间: 2016年8月25日(星期四) 中午11:55
> 收件人: "dev"<de...@kylin.apache.org>; 
> 
> 主题: Re: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> lxw,
> If the values exceed Integer.MAX_VALUE, exception will be threw when dictionary building.
> 
> You can firstly disable cube and then edit the json on web ui. The action button is in the ‘Admins’ of cube list table.
> 
> BTW, the 255 limitation could be removed in theory, however, that made the logic more complicated. You can have a try and contribute the patch if you’re interested.
> 
> Yiming,
> I will post a patch for more clearly exception message and some minor improve of GlobalDictionary. 
> But maybe later, it’s quite a busy week... 
> 
>> 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
>> 
>> Sorry, 
>> 
>> About question 1, 
>> I means if count distinct values of column data cross all segments exceed Integer.MAX_VALUE, what will be happened?
>> 
>> 
>> 
>> ------------------ 原始邮件 ------------------
>> 发件人: "lxw";<lx...@qq.com>;
>> 发送时间: 2016年8月25日(星期四) 上午10:01
>> 收件人: "dev"<de...@kylin.apache.org>; 
>> 
>> 主题: 回复: Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> I have 2 more questions:
>> 
>> 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?
>> 
>> 2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.
>> 
>> Thanks!
>> 
>> 
>> 
>> ------------------ 原始邮件 ------------------
>> 发件人: "Yiming Liu";<li...@gmail.com>;
>> 发送时间: 2016年8月25日(星期四) 上午9:41
>> 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 
>> 
>> 主题: Re: Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> Good found.
>> 
>> The code AppendTrieDictionary line 604:
>> 
>> // nValueBytes
>> if (n.part.length > 255)
>>   throw new RuntimeException();
>> 
>> Hi Yerui,
>> 
>> Could you add more comments for the 255 limit, with more meaningful exception?
>> 
>> 
>> 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
>> 
>>> It caused by length(USER_ID) > 255.
>>> After exclude these dirty data, it works .
>>> 
>>> 
>>> Total 150 million records, execute this query:
>>> 
>>> select city_code,
>>> sum(bid_request) as bid_request,
>>> count(distinct user_id) as uv
>>> from liuxiaowen.TEST_T_PBS_UV_FACT
>>> group by city_code
>>> order by uv desc limit 100
>>> 
>>> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>>> 
>>> 
>>> 
>>> ------------------ Original ------------------
>>> From:  "lxw";<lx...@qq.com>;
>>> Date:  Wed, Aug 24, 2016 05:27 PM
>>> To:  "dev"<de...@kylin.apache.org>;
>>> 
>>> Subject:  Precisely Count Distinct on 100 million string values column
>>> 
>>> 
>>> 
>>> Hi,
>>> 
>>>   I am trying to use "Precisely Count Distinct" on 100 million string
>>> values column "USER_ID", I updated the cube json :
>>> "dictionaries": [     {       "column": "USER_ID",       "builder":
>>> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>>> 
>>> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
>>> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
>>> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
>>> Build Dimension Dictionary",
>>> the error log in "kylin.log" :
>>> 
>>> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
>>> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
>>> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
>>> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> build_writeNode(AppendTrieDictionary.java:605)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> buildTrieBytes(AppendTrieDictionary.java:576)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>>> write(AppendTrieDictionary.java:523)
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:234)
>>>       at org.apache.kylin.dict.CachedTreeMap.write(
>>> CachedTreeMap.java:374)
>>>       at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>> AppendTrieDictionary.java:1043)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> build(AppendTrieDictionary.java:954)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:82)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
>>> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
>>> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
>>> state=RUNNING}
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:240)
>>>       at org.apache.kylin.dict.CachedTreeMap.write(
>>> CachedTreeMap.java:374)
>>>       at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>>> AppendTrieDictionary.java:1043)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> build(AppendTrieDictionary.java:954)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:82)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 
>>>   and the error log in "kylin.out" :
>>> 
>>> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
>>> processPendingNotifications
>>> WARNING: Exception thrown by removal listener
>>> java.lang.RuntimeException
>>>       at org.apache.kylin.dict.CachedTreeMap.writeValue(
>>> CachedTreeMap.java:240)
>>>       at org.apache.kylin.dict.CachedTreeMap.access$300(
>>> CachedTreeMap.java:52)
>>>       at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
>>> CachedTreeMap.java:149)
>>>       at com.google.common.cache.LocalCache.processPendingNotifications(
>>> LocalCache.java:2011)
>>>       at com.google.common.cache.LocalCache$Segment.
>>> runUnlockedCleanup(LocalCache.java:3501)
>>>       at com.google.common.cache.LocalCache$Segment.
>>> postWriteCleanup(LocalCache.java:3477)
>>>       at com.google.common.cache.LocalCache$Segment.put(
>>> LocalCache.java:2940)
>>>       at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>>>       at com.google.common.cache.LocalCache$LocalManualCache.
>>> put(LocalCache.java:4798)
>>>       at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>>>       at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> addValue(AppendTrieDictionary.java:829)
>>>       at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>>> addValue(AppendTrieDictionary.java:804)
>>>       at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>>> GlobalDictionaryBuilder.java:78)
>>>       at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>>> DictionaryGenerator.java:81)
>>>       at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>>> DictionaryManager.java:323)
>>>       at org.apache.kylin.cube.CubeManager.buildDictionary(
>>> CubeManager.java:185)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:51)
>>>       at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>>> processSegment(DictionaryGeneratorCLI.java:42)
>>>       at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>>> CreateDictionaryJob.java:56)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>>       at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>>       at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>>> doWork(HadoopShellExecutable.java:63)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>>> DefaultChainedExecutable.java:57)
>>>       at org.apache.kylin.job.execution.AbstractExecutable.
>>> execute(AbstractExecutable.java:112)
>>>       at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>>> JobRunner.run(DefaultScheduler.java:127)
>>>       at java.util.concurrent.ThreadPoolExecutor.runWorker(
>>> ThreadPoolExecutor.java:1145)
>>>       at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>>> ThreadPoolExecutor.java:615)
>>>       at java.lang.Thread.run(Thread.java:744)
>>> 
>>> usage: CreateDictionaryJob
>>> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
>>> -input <input>               Input path
>>> -segmentname <segmentname>   Cube segment name
>>> 
>> 
>> 
>> 
>> -- 
>> With Warm regards
>> 
>> Yiming Liu (刘一鸣)


回复: Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
Thanks, I got it.

We have 100 million new USER_IDs per day (per segment). Does that mean that after about 21 days the build task will fail?
And that we can't use "Precisely Count Distinct" in our scenario?





------------------ 原始邮件 ------------------
发件人: "Yerui Sun";<su...@gmail.com>;
发送时间: 2016年8月25日(星期四) 中午11:55
收件人: "dev"<de...@kylin.apache.org>; 

主题: Re: Precisely Count Distinct on 100 million string values column



lxw,
If the values exceed Integer.MAX_VALUE, exception will be threw when dictionary building.

You can firstly disable cube and then edit the json on web ui. The action button is in the ‘Admins’ of cube list table.

BTW, the 255 limitation could be removed in theory, however, that made the logic more complicated. You can have a try and contribute the patch if you’re interested.

Yiming,
I will post a patch for more clearly exception message and some minor improve of GlobalDictionary. 
But maybe later, it’s quite a busy week... 

> 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
> 
> Sorry, 
> 
> About question 1, 
> I means if count distinct values of column data cross all segments exceed Integer.MAX_VALUE, what will be happened?
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "lxw";<lx...@qq.com>;
> 发送时间: 2016年8月25日(星期四) 上午10:01
> 收件人: "dev"<de...@kylin.apache.org>; 
> 
> 主题: 回复: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> I have 2 more questions:
> 
> 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?
> 
> 2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.
> 
> Thanks!
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "Yiming Liu";<li...@gmail.com>;
> 发送时间: 2016年8月25日(星期四) 上午9:41
> 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 
> 
> 主题: Re: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> Good found.
> 
> The code AppendTrieDictionary line 604:
> 
> // nValueBytes
> if (n.part.length > 255)
>    throw new RuntimeException();
> 
> Hi Yerui,
> 
> Could you add more comments for the 255 limit, with more meaningful exception?
> 
> 
> 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
> 
>> It caused by length(USER_ID) > 255.
>> After exclude these dirty data, it works .
>> 
>> 
>> Total 150 million records, execute this query:
>> 
>> select city_code,
>> sum(bid_request) as bid_request,
>> count(distinct user_id) as uv
>> from liuxiaowen.TEST_T_PBS_UV_FACT
>> group by city_code
>> order by uv desc limit 100
>> 
>> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>> 
>> 
>> 
>> ------------------ Original ------------------
>> From:  "lxw";<lx...@qq.com>;
>> Date:  Wed, Aug 24, 2016 05:27 PM
>> To:  "dev"<de...@kylin.apache.org>;
>> 
>> Subject:  Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> Hi,
>> 
>>    I am trying to use "Precisely Count Distinct" on 100 million string
>> values column "USER_ID", I updated the cube json :
>> "dictionaries": [     {       "column": "USER_ID",       "builder":
>> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>> 
>> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
>> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
>> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
>> Build Dimension Dictionary",
>>  the error log in "kylin.log" :
>> 
>> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
>> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
>> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
>> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> build_writeNode(AppendTrieDictionary.java:605)
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> buildTrieBytes(AppendTrieDictionary.java:576)
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> write(AppendTrieDictionary.java:523)
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:234)
>>        at org.apache.kylin.dict.CachedTreeMap.write(
>> CachedTreeMap.java:374)
>>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>> AppendTrieDictionary.java:1043)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> build(AppendTrieDictionary.java:954)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:82)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
>> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
>> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
>> state=RUNNING}
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:240)
>>        at org.apache.kylin.dict.CachedTreeMap.write(
>> CachedTreeMap.java:374)
>>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>> AppendTrieDictionary.java:1043)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> build(AppendTrieDictionary.java:954)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:82)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 
>>    and the error log in "kylin.out" :
>> 
>> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
>> processPendingNotifications
>> WARNING: Exception thrown by removal listener
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:240)
>>        at org.apache.kylin.dict.CachedTreeMap.access$300(
>> CachedTreeMap.java:52)
>>        at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
>> CachedTreeMap.java:149)
>>        at com.google.common.cache.LocalCache.processPendingNotifications(
>> LocalCache.java:2011)
>>        at com.google.common.cache.LocalCache$Segment.
>> runUnlockedCleanup(LocalCache.java:3501)
>>        at com.google.common.cache.LocalCache$Segment.
>> postWriteCleanup(LocalCache.java:3477)
>>        at com.google.common.cache.LocalCache$Segment.put(
>> LocalCache.java:2940)
>>        at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>>        at com.google.common.cache.LocalCache$LocalManualCache.
>> put(LocalCache.java:4798)
>>        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>>        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> addValue(AppendTrieDictionary.java:829)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> addValue(AppendTrieDictionary.java:804)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:78)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 
>> usage: CreateDictionaryJob
>> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
>> -input <input>               Input path
>> -segmentname <segmentname>   Cube segment name
>> 
> 
> 
> 
> -- 
> With Warm regards
> 
> Yiming Liu (刘一鸣)

Re: Precisely Count Distinct on 100 million string values column

Posted by Yerui Sun <su...@gmail.com>.
lxw,
If the values exceed Integer.MAX_VALUE, an exception will be thrown during dictionary building.

You can firstly disable cube and then edit the json on web ui. The action button is in the ‘Admins’ of cube list table.

BTW, the 255 limitation could be removed in theory, however, that made the logic more complicated. You can have a try and contribute the patch if you’re interested.

Yiming,
I will post a patch with a clearer exception message and some minor improvements to GlobalDictionary. 
But maybe later, it’s quite a busy week... 

> 在 2016年8月25日,10:05,lxw <lx...@qq.com> 写道:
> 
> Sorry, 
> 
> About question 1, 
> I means if count distinct values of column data cross all segments exceed Integer.MAX_VALUE, what will be happened?
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "lxw";<lx...@qq.com>;
> 发送时间: 2016年8月25日(星期四) 上午10:01
> 收件人: "dev"<de...@kylin.apache.org>; 
> 
> 主题: 回复: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> I have 2 more questions:
> 
> 1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?
> 
> 2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.
> 
> Thanks!
> 
> 
> 
> ------------------ 原始邮件 ------------------
> 发件人: "Yiming Liu";<li...@gmail.com>;
> 发送时间: 2016年8月25日(星期四) 上午9:41
> 收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 
> 
> 主题: Re: Precisely Count Distinct on 100 million string values column
> 
> 
> 
> Good found.
> 
> The code AppendTrieDictionary line 604:
> 
> // nValueBytes
> if (n.part.length > 255)
>    throw new RuntimeException();
> 
> Hi Yerui,
> 
> Could you add more comments for the 255 limit, with more meaningful exception?
> 
> 
> 2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:
> 
>> It caused by length(USER_ID) > 255.
>> After exclude these dirty data, it works .
>> 
>> 
>> Total 150 million records, execute this query:
>> 
>> select city_code,
>> sum(bid_request) as bid_request,
>> count(distinct user_id) as uv
>> from liuxiaowen.TEST_T_PBS_UV_FACT
>> group by city_code
>> order by uv desc limit 100
>> 
>> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>> 
>> 
>> 
>> ------------------ Original ------------------
>> From:  "lxw";<lx...@qq.com>;
>> Date:  Wed, Aug 24, 2016 05:27 PM
>> To:  "dev"<de...@kylin.apache.org>;
>> 
>> Subject:  Precisely Count Distinct on 100 million string values column
>> 
>> 
>> 
>> Hi,
>> 
>>    I am trying to use "Precisely Count Distinct" on 100 million string
>> values column "USER_ID", I updated the cube json :
>> "dictionaries": [     {       "column": "USER_ID",       "builder":
>> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>> 
>> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
>> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
>> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
>> Build Dimension Dictionary",
>>  the error log in "kylin.log" :
>> 
>> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
>> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
>> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
>> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> build_writeNode(AppendTrieDictionary.java:605)
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> buildTrieBytes(AppendTrieDictionary.java:576)
>>        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
>> write(AppendTrieDictionary.java:523)
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:234)
>>        at org.apache.kylin.dict.CachedTreeMap.write(
>> CachedTreeMap.java:374)
>>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>> AppendTrieDictionary.java:1043)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> build(AppendTrieDictionary.java:954)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:82)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
>> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
>> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
>> state=RUNNING}
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:240)
>>        at org.apache.kylin.dict.CachedTreeMap.write(
>> CachedTreeMap.java:374)
>>        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
>> AppendTrieDictionary.java:1043)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> build(AppendTrieDictionary.java:954)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:82)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 
>>    and the error log in "kylin.out" :
>> 
>> Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
>> processPendingNotifications
>> WARNING: Exception thrown by removal listener
>> java.lang.RuntimeException
>>        at org.apache.kylin.dict.CachedTreeMap.writeValue(
>> CachedTreeMap.java:240)
>>        at org.apache.kylin.dict.CachedTreeMap.access$300(
>> CachedTreeMap.java:52)
>>        at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
>> CachedTreeMap.java:149)
>>        at com.google.common.cache.LocalCache.processPendingNotifications(
>> LocalCache.java:2011)
>>        at com.google.common.cache.LocalCache$Segment.
>> runUnlockedCleanup(LocalCache.java:3501)
>>        at com.google.common.cache.LocalCache$Segment.
>> postWriteCleanup(LocalCache.java:3477)
>>        at com.google.common.cache.LocalCache$Segment.put(
>> LocalCache.java:2940)
>>        at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>>        at com.google.common.cache.LocalCache$LocalManualCache.
>> put(LocalCache.java:4798)
>>        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>>        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> addValue(AppendTrieDictionary.java:829)
>>        at org.apache.kylin.dict.AppendTrieDictionary$Builder.
>> addValue(AppendTrieDictionary.java:804)
>>        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
>> GlobalDictionaryBuilder.java:78)
>>        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
>> DictionaryGenerator.java:81)
>>        at org.apache.kylin.dict.DictionaryManager.buildDictionary(
>> DictionaryManager.java:323)
>>        at org.apache.kylin.cube.CubeManager.buildDictionary(
>> CubeManager.java:185)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:51)
>>        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
>> processSegment(DictionaryGeneratorCLI.java:42)
>>        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
>> CreateDictionaryJob.java:56)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>>        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>>        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
>> doWork(HadoopShellExecutable.java:63)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
>> DefaultChainedExecutable.java:57)
>>        at org.apache.kylin.job.execution.AbstractExecutable.
>> execute(AbstractExecutable.java:112)
>>        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
>> JobRunner.run(DefaultScheduler.java:127)
>>        at java.util.concurrent.ThreadPoolExecutor.runWorker(
>> ThreadPoolExecutor.java:1145)
>>        at java.util.concurrent.ThreadPoolExecutor$Worker.run(
>> ThreadPoolExecutor.java:615)
>>        at java.lang.Thread.run(Thread.java:744)
>> 
>> usage: CreateDictionaryJob
>> -cubename <cubename>         Cube name. For exmaple, flat_item_cube
>> -input <input>               Input path
>> -segmentname <segmentname>   Cube segment name
>> 
> 
> 
> 
> -- 
> With Warm regards
> 
> Yiming Liu (刘一鸣)


回复: Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
Sorry, 

About question 1, 
I meant: if the count of distinct values in the column across all segments exceeds Integer.MAX_VALUE, what will happen?



------------------ 原始邮件 ------------------
发件人: "lxw";<lx...@qq.com>;
发送时间: 2016年8月25日(星期四) 上午10:01
收件人: "dev"<de...@kylin.apache.org>; 

主题: 回复: Precisely Count Distinct on 100 million string values column



I have 2 more questions:

1. The capacity of the global dictionary is Integer.MAX_VALUE? If count distinct values of column data cross all segments, what will be happened? duplication or error ?

2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.

Thanks!



------------------ 原始邮件 ------------------
发件人: "Yiming Liu";<li...@gmail.com>;
发送时间: 2016年8月25日(星期四) 上午9:41
收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 

主题: Re: Precisely Count Distinct on 100 million string values column



Good found.

The code AppendTrieDictionary line 604:

// nValueBytes
if (n.part.length > 255)
    throw new RuntimeException();

Hi Yerui,

Could you add more comments for the 255 limit, with more meaningful exception?


2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:

> It caused by length(USER_ID) > 255.
> After exclude these dirty data, it works .
>
>
> Total 150 million records, execute this query:
>
> select city_code,
> sum(bid_request) as bid_request,
> count(distinct user_id) as uv
> from liuxiaowen.TEST_T_PBS_UV_FACT
> group by city_code
> order by uv desc limit 100
>
> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>
>
>
> ------------------ Original ------------------
> From:  "lxw";<lx...@qq.com>;
> Date:  Wed, Aug 24, 2016 05:27 PM
> To:  "dev"<de...@kylin.apache.org>;
>
> Subject:  Precisely Count Distinct on 100 million string values column
>
>
>
> Hi,
>
>     I am trying to use "Precisely Count Distinct" on 100 million string
> values column "USER_ID", I updated the cube json :
> "dictionaries": [     {       "column": "USER_ID",       "builder":
> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>
> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
> Build Dimension Dictionary",
>   the error log in "kylin.log" :
>
> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
> java.lang.RuntimeException
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> build_writeNode(AppendTrieDictionary.java:605)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> buildTrieBytes(AppendTrieDictionary.java:576)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> write(AppendTrieDictionary.java:523)
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:234)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
> state=RUNNING}
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
>     and the error log in "kylin.out" :
>
>  Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
> processPendingNotifications
> WARNING: Exception thrown by removal listener
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.access$300(
> CachedTreeMap.java:52)
>         at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
> CachedTreeMap.java:149)
>         at com.google.common.cache.LocalCache.processPendingNotifications(
> LocalCache.java:2011)
>         at com.google.common.cache.LocalCache$Segment.
> runUnlockedCleanup(LocalCache.java:3501)
>         at com.google.common.cache.LocalCache$Segment.
> postWriteCleanup(LocalCache.java:3477)
>         at com.google.common.cache.LocalCache$Segment.put(
> LocalCache.java:2940)
>         at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>         at com.google.common.cache.LocalCache$LocalManualCache.
> put(LocalCache.java:4798)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:829)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:804)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:78)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
> usage: CreateDictionaryJob
>  -cubename <cubename>         Cube name. For example, flat_item_cube
>  -input <input>               Input path
>  -segmentname <segmentname>   Cube segment name
>



-- 
With Warm regards

Yiming Liu (刘一鸣)

回复: Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
I have 2 more questions:

1. The capacity of the global dictionary is Integer.MAX_VALUE? If the count of distinct values in the column, across all segments, exceeds that, what will happen? Duplication or an error?

2. Where I can manually edit a cube desc json? Now I use JAVA API to create or update cube.

Thanks!



------------------ 原始邮件 ------------------
发件人: "Yiming Liu";<li...@gmail.com>;
发送时间: 2016年8月25日(星期四) 上午9:41
收件人: "dev"<de...@kylin.apache.org>; "sunyerui"<su...@gmail.com>; 

主题: Re: Precisely Count Distinct on 100 million string values column



Good find.

The code AppendTrieDictionary line 604:

// nValueBytes
if (n.part.length > 255)
    throw new RuntimeException();

Hi Yerui,

Could you add more comments for the 255 limit, with more meaningful exception?


2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:

> It caused by length(USER_ID) > 255.
> After exclude these dirty data, it works .
>
>
> Total 150 million records, execute this query:
>
> select city_code,
> sum(bid_request) as bid_request,
> count(distinct user_id) as uv
> from liuxiaowen.TEST_T_PBS_UV_FACT
> group by city_code
> order by uv desc limit 100
>
> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>
>
>
> ------------------ Original ------------------
> From:  "lxw";<lx...@qq.com>;
> Date:  Wed, Aug 24, 2016 05:27 PM
> To:  "dev"<de...@kylin.apache.org>;
>
> Subject:  Precisely Count Distinct on 100 million string values column
>
>
>
> Hi,
>
>     I am trying to use "Precisely Count Distinct" on 100 million string
> values column "USER_ID", I updated the cube json :
> "dictionaries": [     {       "column": "USER_ID",       "builder":
> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>
> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
> Build Dimension Dictionary",
>   the error log in "kylin.log" :
>
> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
> java.lang.RuntimeException
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> build_writeNode(AppendTrieDictionary.java:605)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> buildTrieBytes(AppendTrieDictionary.java:576)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> write(AppendTrieDictionary.java:523)
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:234)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
> state=RUNNING}
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
>     and the error log in "kylin.out" :
>
>  Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
> processPendingNotifications
> WARNING: Exception thrown by removal listener
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.access$300(
> CachedTreeMap.java:52)
>         at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
> CachedTreeMap.java:149)
>         at com.google.common.cache.LocalCache.processPendingNotifications(
> LocalCache.java:2011)
>         at com.google.common.cache.LocalCache$Segment.
> runUnlockedCleanup(LocalCache.java:3501)
>         at com.google.common.cache.LocalCache$Segment.
> postWriteCleanup(LocalCache.java:3477)
>         at com.google.common.cache.LocalCache$Segment.put(
> LocalCache.java:2940)
>         at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>         at com.google.common.cache.LocalCache$LocalManualCache.
> put(LocalCache.java:4798)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:829)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:804)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:78)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
> usage: CreateDictionaryJob
>  -cubename <cubename>         Cube name. For example, flat_item_cube
>  -input <input>               Input path
>  -segmentname <segmentname>   Cube segment name
>



-- 
With Warm regards

Yiming Liu (刘一鸣)

Re: Precisely Count Distinct on 100 million string values column

Posted by Yiming Liu <li...@gmail.com>.
Good find.

The code AppendTrieDictionary line 604:

// nValueBytes
if (n.part.length > 255)
    throw new RuntimeException();

Hi Yerui,

Could you add more comments for the 255 limit, with more meaningful exception?


2016-08-24 20:44 GMT+08:00 lxw <lx...@qq.com>:

> It caused by length(USER_ID) > 255.
> After exclude these dirty data, it works .
>
>
> Total 150 million records, execute this query:
>
> select city_code,
> sum(bid_request) as bid_request,
> count(distinct user_id) as uv
> from liuxiaowen.TEST_T_PBS_UV_FACT
> group by city_code
> order by uv desc limit 100
>
> Kylin cost  7 seconds, and Hive cost 180 seconds, the result is same.
>
>
>
> ------------------ Original ------------------
> From:  "lxw";<lx...@qq.com>;
> Date:  Wed, Aug 24, 2016 05:27 PM
> To:  "dev"<de...@kylin.apache.org>;
>
> Subject:  Precisely Count Distinct on 100 million string values column
>
>
>
> Hi,
>
>     I am trying to use "Precisely Count Distinct" on 100 million string
> values column "USER_ID", I updated the cube json :
> "dictionaries": [     {       "column": "USER_ID",       "builder":
> "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
>
> "override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts":
> "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb":
> "7168"   }  when I build the cube, an error occurred on "#4 Step Name:
> Build Dimension Dictionary",
>   the error log in "kylin.log" :
>
> 2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 :
> write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/
> dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_
> AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
> java.lang.RuntimeException
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> build_writeNode(AppendTrieDictionary.java:605)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> buildTrieBytes(AppendTrieDictionary.java:576)
>         at org.apache.kylin.dict.AppendTrieDictionary$DictNode.
> write(AppendTrieDictionary.java:523)
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:234)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
> 2016-08-24 17:27:53,340 ERROR [pool-7-thread-10]
> common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=
> 3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary,
> state=RUNNING}
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.write(
> CachedTreeMap.java:374)
>         at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(
> AppendTrieDictionary.java:1043)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> build(AppendTrieDictionary.java:954)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:82)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
>     and the error log in "kylin.out" :
>
>  Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache
> processPendingNotifications
> WARNING: Exception thrown by removal listener
> java.lang.RuntimeException
>         at org.apache.kylin.dict.CachedTreeMap.writeValue(
> CachedTreeMap.java:240)
>         at org.apache.kylin.dict.CachedTreeMap.access$300(
> CachedTreeMap.java:52)
>         at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(
> CachedTreeMap.java:149)
>         at com.google.common.cache.LocalCache.processPendingNotifications(
> LocalCache.java:2011)
>         at com.google.common.cache.LocalCache$Segment.
> runUnlockedCleanup(LocalCache.java:3501)
>         at com.google.common.cache.LocalCache$Segment.
> postWriteCleanup(LocalCache.java:3477)
>         at com.google.common.cache.LocalCache$Segment.put(
> LocalCache.java:2940)
>         at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
>         at com.google.common.cache.LocalCache$LocalManualCache.
> put(LocalCache.java:4798)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
>         at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:829)
>         at org.apache.kylin.dict.AppendTrieDictionary$Builder.
> addValue(AppendTrieDictionary.java:804)
>         at org.apache.kylin.dict.GlobalDictionaryBuilder.build(
> GlobalDictionaryBuilder.java:78)
>         at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(
> DictionaryGenerator.java:81)
>         at org.apache.kylin.dict.DictionaryManager.buildDictionary(
> DictionaryManager.java:323)
>         at org.apache.kylin.cube.CubeManager.buildDictionary(
> CubeManager.java:185)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:51)
>         at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.
> processSegment(DictionaryGeneratorCLI.java:42)
>         at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(
> CreateDictionaryJob.java:56)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
>         at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
>         at org.apache.kylin.engine.mr.common.HadoopShellExecutable.
> doWork(HadoopShellExecutable.java:63)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(
> DefaultChainedExecutable.java:57)
>         at org.apache.kylin.job.execution.AbstractExecutable.
> execute(AbstractExecutable.java:112)
>         at org.apache.kylin.job.impl.threadpool.DefaultScheduler$
> JobRunner.run(DefaultScheduler.java:127)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:744)
>
> usage: CreateDictionaryJob
>  -cubename <cubename>         Cube name. For example, flat_item_cube
>  -input <input>               Input path
>  -segmentname <segmentname>   Cube segment name
>



-- 
With Warm regards

Yiming Liu (刘一鸣)

Re: Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
It caused by length(USER_ID) > 255.
After exclude these dirty data, it works .


Total 150 million records, execute this query:

select city_code,
sum(bid_request) as bid_request,
count(distinct user_id) as uv 
from liuxiaowen.TEST_T_PBS_UV_FACT 
group by city_code 
order by uv desc limit 100

Kylin took 7 seconds, and Hive took 180 seconds; the results are the same.
 


------------------ Original ------------------
From:  "lxw";<lx...@qq.com>;
Date:  Wed, Aug 24, 2016 05:27 PM
To:  "dev"<de...@kylin.apache.org>; 

Subject:  Precisely Count Distinct on 100 million string values column



Hi,

    I am trying to use "Precisely Count Distinct" on the column "USER_ID", which has 100 million string values. I updated the cube JSON as follows: 
"dictionaries": [     {       "column": "USER_ID",       "builder": "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
  
"override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts": "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb": "7168"   }  When I built the cube, an error occurred at "#4 Step Name: Build Dimension Dictionary".
  the error log in "kylin.log" :
  
2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 : write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
java.lang.RuntimeException
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.build_writeNode(AppendTrieDictionary.java:605)
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.buildTrieBytes(AppendTrieDictionary.java:576)
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.write(AppendTrieDictionary.java:523)
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:234)
        at org.apache.kylin.dict.CachedTreeMap.write(CachedTreeMap.java:374)
        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(AppendTrieDictionary.java:1043)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.build(AppendTrieDictionary.java:954)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:82)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)
2016-08-24 17:27:53,340 ERROR [pool-7-thread-10] common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary, state=RUNNING}
java.lang.RuntimeException
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:240)
        at org.apache.kylin.dict.CachedTreeMap.write(CachedTreeMap.java:374)
        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(AppendTrieDictionary.java:1043)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.build(AppendTrieDictionary.java:954)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:82)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)

    and the error log in "kylin.out" :

 Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache processPendingNotifications
WARNING: Exception thrown by removal listener
java.lang.RuntimeException
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:240)
        at org.apache.kylin.dict.CachedTreeMap.access$300(CachedTreeMap.java:52)
        at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(CachedTreeMap.java:149)
        at com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2011)
        at com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3501)
        at com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3477)
        at com.google.common.cache.LocalCache$Segment.put(LocalCache.java:2940)
        at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
        at com.google.common.cache.LocalCache$LocalManualCache.put(LocalCache.java:4798)
        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.addValue(AppendTrieDictionary.java:829)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.addValue(AppendTrieDictionary.java:804)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:78)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)

usage: CreateDictionaryJob
 -cubename <cubename>         Cube name. For exmaple, flat_item_cube
 -input <input>               Input path
 -segmentname <segmentname>   Cube segment name

Precisely Count Distinct on 100 million string values column

Posted by lxw <lx...@qq.com>.
Hi,

    I am trying to use "Precisely Count Distinct" on the column "USER_ID", which has 100 million string values. I updated the cube JSON as follows: 
"dictionaries": [     {       "column": "USER_ID",       "builder": "org.apache.kylin.dict.GlobalDictionaryBuilder"     }   ],
  
"override_kylin_properties": {     "kylin.job.mr.config.override.mapred.map.child.java.opts": "-Xmx7g",     "kylin.job.mr.config.override.mapreduce.map.memory.mb": "7168"   }  When I built the cube, an error occurred at "#4 Step Name: Build Dimension Dictionary".
  the error log in "kylin.log" :
  
2016-08-24 17:27:53,282 ERROR [pool-7-thread-10] dict.CachedTreeMap:239 : write value into /kylin_test1/kylin_metadata_test1/resources/GlobalDict/dict/LIUXIAOWEN.TEST_T_PBS_UV_FACT/USER_ID.tmp/cached_AQEByQXVzFd8r0YviP4x84YqUv-NcRiuCI2d exception: java.lang.RuntimeException
java.lang.RuntimeException
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.build_writeNode(AppendTrieDictionary.java:605)
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.buildTrieBytes(AppendTrieDictionary.java:576)
        at org.apache.kylin.dict.AppendTrieDictionary$DictNode.write(AppendTrieDictionary.java:523)
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:234)
        at org.apache.kylin.dict.CachedTreeMap.write(CachedTreeMap.java:374)
        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(AppendTrieDictionary.java:1043)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.build(AppendTrieDictionary.java:954)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:82)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)
2016-08-24 17:27:53,340 ERROR [pool-7-thread-10] common.HadoopShellExecutable:65 : error execute HadoopShellExecutable{id=3a0f2751-dd2a-4a3b-a27a-58bfc0edbbfd-03, name=Build Dimension Dictionary, state=RUNNING}
java.lang.RuntimeException
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:240)
        at org.apache.kylin.dict.CachedTreeMap.write(CachedTreeMap.java:374)
        at org.apache.kylin.dict.AppendTrieDictionary.flushIndex(AppendTrieDictionary.java:1043)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.build(AppendTrieDictionary.java:954)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:82)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)

    and the error log in "kylin.out" :

 Aug 24, 2016 5:25:32 PM com.google.common.cache.LocalCache processPendingNotifications
WARNING: Exception thrown by removal listener
java.lang.RuntimeException
        at org.apache.kylin.dict.CachedTreeMap.writeValue(CachedTreeMap.java:240)
        at org.apache.kylin.dict.CachedTreeMap.access$300(CachedTreeMap.java:52)
        at org.apache.kylin.dict.CachedTreeMap$1.onRemoval(CachedTreeMap.java:149)
        at com.google.common.cache.LocalCache.processPendingNotifications(LocalCache.java:2011)
        at com.google.common.cache.LocalCache$Segment.runUnlockedCleanup(LocalCache.java:3501)
        at com.google.common.cache.LocalCache$Segment.postWriteCleanup(LocalCache.java:3477)
        at com.google.common.cache.LocalCache$Segment.put(LocalCache.java:2940)
        at com.google.common.cache.LocalCache.put(LocalCache.java:4202)
        at com.google.common.cache.LocalCache$LocalManualCache.put(LocalCache.java:4798)
        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:284)
        at org.apache.kylin.dict.CachedTreeMap.put(CachedTreeMap.java:52)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.addValue(AppendTrieDictionary.java:829)
        at org.apache.kylin.dict.AppendTrieDictionary$Builder.addValue(AppendTrieDictionary.java:804)
        at org.apache.kylin.dict.GlobalDictionaryBuilder.build(GlobalDictionaryBuilder.java:78)
        at org.apache.kylin.dict.DictionaryGenerator.buildDictionary(DictionaryGenerator.java:81)
        at org.apache.kylin.dict.DictionaryManager.buildDictionary(DictionaryManager.java:323)
        at org.apache.kylin.cube.CubeManager.buildDictionary(CubeManager.java:185)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:51)
        at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:42)
        at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:56)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:63)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:57)
        at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:112)
        at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:127)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:744)

usage: CreateDictionaryJob
 -cubename <cubename>         Cube name. For exmaple, flat_item_cube
 -input <input>               Input path
 -segmentname <segmentname>   Cube segment name