[MXNet Forum] [Gluon] MxNet Actor-Critic Model: the weights of the actor & critic are not updated

I have the following simple actor-critic NN classes:

    class actorNN(gluon.HybridBlock):
        def __init__(self, **kwargs):
            super(actorNN, self).__init__(**kwargs)
            with self.name_scope():
                self.fc1 = gluon.nn.Dense(8)
                self.fc2 = gluon.nn.Dense(2)
        def hybrid_forward(self, f, x):
            #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
            x = f.relu(self.fc1(x))
            return f.softmax(self.fc2(x))
    class criticNN(gluon.HybridBlock):
        def __init__(self, **kwargs):
            super(criticNN, self).__init__(**kwargs)
            with self.name_scope():
                self.fc1 = gluon.nn.Dense(10)
                self.fc2 = gluon.nn.Dense(1)
        def hybrid_forward(self, f, x):
            #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
            x = f.relu(self.fc1(x))
            return f.relu(self.fc2(x))

and I am initialising them as

    self.__actor = actorNN()
    self.__actor.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)
    self.__actorTrainer = gluon.Trainer(self.__actor.collect_params(), 'sgd', {'learning_rate': lr})

    self.__critic = criticNN()
    self.__critic.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)        
    self.__criticTrainer = gluon.Trainer(self.__critic.collect_params(), 'sgd', {'learning_rate': lr})

    # temp = self.__actor
    self.critic = self.__critic
    self.actorTrainer = self.__criticTrainer
    self.criticTrainer = self.__criticTrainer
    self.criticLossFn = gluon.loss.L1Loss()

Then training them with (all the variables are on a ndarray)

    def learn(self):
            rewards = self.rewardBuffer[np.nonzero(self.rewardBuffer)[0]]
            rewards = rewards.reshape((rewards.shape[0], 1))
            #values = self.valueBuffer[mx_np.nonzero(self.valueBuffer)].as_np_ndarray()
            batch_size = len(rewards)
            nonZeroInds = np.nonzero(self.observationBuffer)
            obs = self.observationBuffer[np.unique(nonZeroInds[0])]
            actionTaken = self.actionBuffer[np.nonzero(self.actionBuffer)[0]]
            actionTaken = actionTaken.reshape((actionTaken.shape[0], 1))
            actionTaken[np.where(np.array(actionTaken) == -1)[0]] = 0
            criticInput = mx.nd.concatenate([obs, actionTaken], axis = 1)
            with autograd.record():
                probs =
                values = self.critic(criticInput)
                discountedRewards = self.discountTheRewards(rewards, values, batch_size)
                actorLoss = (-1)* = probs, index = actionTaken, axis = 1).log(),discountedRewards)
            with autograd.record():
                values = self.critic(criticInput)
                criticLoss = self.criticLossFn(values, discountedRewards)

I print the weights & biases of both actor and critic after the end of each episode, but only a handful of weights of critic's NN are updated slightly. If I hybridize both actor and critic, and try to train them like that, then I also lost that slight update, and both actor & critic don't learn anything.


What am I doing wrong?


In case needed, this is my `discountTheRewards` method:

    def discountTheRewards(self, rewards, values, batch_size):
            if rewards.shape != values.shape:
                raise Exception("incompatible rewards-values data!")
            discountedRewards = mx.nd.zeros((batch_size,1), dtype = 'float32')
            for t in range(1,batch_size):
                G = 0
                for k in range(t, batch_size):
                    G = G + mx.nd.power(discountFactor,k-t) * rewards[k]
                discountedRewards[t] = G
            return (values - discountedRewards)

