Posted to discuss-archive@mxnet.apache.org by onurcanbektas via MXNet Forum <mx...@discoursemail.com.INVALID> on 2020/07/21 17:18:23 UTC

[MXNet Forum] [Gluon] MxNet Actor-Critic Model: the weights of the actor & critic are not updated


I have the following simple actor-critic NN classes:

    class actorNN(gluon.HybridBlock):
        def __init__(self, **kwargs):
            super(actorNN, self).__init__(**kwargs)
            with self.name_scope():
                self.fc1 = gluon.nn.Dense(8)
                self.fc2 = gluon.nn.Dense(2)
    
        def hybrid_forward(self, f, x):
            #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
            x = f.relu(self.fc1(x))
            return f.softmax(self.fc2(x))
    
    class criticNN(gluon.HybridBlock):
        def __init__(self, **kwargs):
            super(criticNN, self).__init__(**kwargs)
            with self.name_scope():
                self.fc1 = gluon.nn.Dense(10)
                self.fc2 = gluon.nn.Dense(1)
    
        def hybrid_forward(self, f, x):
            #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
            x = f.relu(self.fc1(x))
            return f.relu(self.fc2(x))
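
A quick shape check of the two blocks on dummy data (the observation width 4 is just a placeholder; the critic input is the observation plus one action column, so width 5 here):

    import mxnet as mx
    from mxnet import gluon

    actor, critic = actorNN(), criticNN()
    actor.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())
    critic.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())

    dummyObs = mx.nd.random.uniform(shape=(5, 4))        # batch of 5 fake observations
    dummyCriticIn = mx.nd.random.uniform(shape=(5, 5))   # observation + action column
    print(actor(dummyObs).shape)        # (5, 2) action probabilities
    print(critic(dummyCriticIn).shape)  # (5, 1) state-action values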

I initialise them like this (inside a wrapper class, hence the `self.` attributes):

    self.__actor = actorNN()
    self.__actor.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)
    self.__actorTrainer = gluon.Trainer(self.__actor.collect_params(), 'sgd', {'learning_rate': lr})
    #self.__actor.hybridize()

    self.__critic = criticNN()
    self.__critic.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)        
    self.__criticTrainer = gluon.Trainer(self.__critic.collect_params(), 'sgd', {'learning_rate': lr})
    #self.__critic.hybridize()

    # temp
    self.actor = self.__actor
    self.critic = self.__critic
    self.actorTrainer = self.__criticTrainer
    self.criticTrainer = self.__criticTrainer
    self.criticLossFn = gluon.loss.L1Loss()
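
For reference, the registered parameters can be listed right after initialisation to confirm that both networks picked up their Dense layers (the exact name prefixes come from Gluon's auto-generated block names):

    for name in self.__actor.collect_params().keys():
        print('actor param:', name)
    for name in self.__critic.collect_params().keys():
        print('critic param:', name)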

I then train them with the method below (all the buffers are NDArrays):

    def learn(self):
            rewards = self.rewardBuffer[np.nonzero(self.rewardBuffer)[0]]
            rewards = rewards.reshape((rewards.shape[0], 1))
            #values = self.valueBuffer[mx_np.nonzero(self.valueBuffer)].as_np_ndarray()
            batch_size = len(rewards)
            nonZeroInds = np.nonzero(self.observationBuffer)
            obs = self.observationBuffer[np.unique(nonZeroInds[0])]
            actionTaken = self.actionBuffer[np.nonzero(self.actionBuffer)[0]]
            actionTaken = actionTaken.reshape((actionTaken.shape[0], 1))
            actionTaken[np.where(np.array(actionTaken) == -1)[0]] = 0
            criticInput = mx.nd.concatenate([obs, actionTaken], axis = 1)
            with autograd.record():
                probs = self.actor(obs)
                values = self.critic(criticInput)
                discountedRewards = self.discountTheRewards(rewards, values, batch_size)
                actorLoss = (-1)*mx.nd.dot(mx.nd.pick(data = probs, index = actionTaken, axis = 1).log(),discountedRewards)
            #self.actor.collect_params().zero_grad()
            actorLoss.backward()
            #autograd.backward(actorLoss)
            self.actorTrainer.set_learning_rate(lr)
            self.actorTrainer.step(batch_size)
    
            with autograd.record():
                values = self.critic(criticInput)
                criticLoss = self.criticLossFn(values, discountedRewards)
            #self.critic.collect_params().zero_grad()
            criticLoss.backward()
            self.criticTrainer.set_learning_rate(lr)
            self.criticTrainer.step(batch_size)


I print the weights & biases of both the actor and the critic at the end of each episode, but only a handful of the critic's weights change, and only slightly; the actor's weights do not change at all. If I hybridize both the actor and the critic and train them that way, even that slight change disappears, and neither network learns anything.
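
The per-episode check is essentially this (a simplified sketch; I only look at the first couple of values of each array):

    for name, param in self.actor.collect_params().items():
        print('actor', name, param.data()[:2])   # current weights/biases
        # right after backward(), param.grad()[:2] shows whether any gradient arrived
    for name, param in self.critic.collect_params().items():
        print('critic', name, param.data()[:2])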

**Question:**

What am I doing wrong?


----------

In case it's needed, this is my `discountTheRewards` method:

    def discountTheRewards(self, rewards, values, batch_size):
            if rewards.shape != values.shape:
                raise Exception("incompatible rewards-values data!")
            
            discountedRewards = mx.nd.zeros((batch_size,1), dtype = 'float32')
            for t in range(1,batch_size):
                G = 0
                for k in range(t, batch_size):
                    G = G + mx.nd.power(discountFactor,k-t) * rewards[k]
                discountedRewards[t] = G
            return (values - discountedRewards)
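
For example, with `discountFactor = 0.9` (just an illustrative value) and `rewards = [1, 2, 3]`, the loops give `discountedRewards = [0, 2 + 0.9*3, 3] = [0, 4.7, 3]` (index 0 is never written because `t` starts at 1), so the method returns `values - [0, 4.7, 3]`.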




