Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/15 07:28:44 UTC

[GitHub] leezu commented on issue #8337: mx.autograd.grad works or fails depending on use of slices

URL: https://github.com/apache/incubator-mxnet/issues/8337#issuecomment-357602028
 
 
    @piiswrong have you looked into this yet?
   
   Here is some more example code that fails due to slicing:
   
   ```
    import mxnet as mx
    import numpy as np
   
   
   class LiftedStructLossLoop(mx.gluon.Block):
       """Computes the lifted structured loss.
   
       The loss encourages the positive distances (between a pair of embeddings
       with the same labels) to be smaller than any negative distances (between a
       pair of embeddings with different labels) in the mini-batch in a way that
       is differentiable with respect to the embedding vectors. See:
       https://arxiv.org/abs/1511.06452.
       """
   
       margin: float = 1.0
   
       def forward(self, score, target):
           loss = 0
           counter = 0
   
           bsz = score.shape[0]
           mag = (score**2).sum(1).broadcast_to((bsz, bsz))
           sim = mx.nd.dot(score, score.T)
   
           dist = (mag + mag.T - 2 * sim)
           dist = mx.nd.Activation(dist, act_type="relu").sqrt()
   
           target = target.asnumpy().flatten()
   
           for i in range(bsz):
               t_i = target[i]
   
               for j in range(i + 1, bsz):
                   t_j = target[j]
   
                   if t_i == t_j:
                       l_ni_sel = np.where(target != t_i)[0]
                       if len(l_ni_sel):
                           l_ni = (self.margin - dist[i][l_ni_sel]).exp().sum()
                       else:
                           l_ni = 0
   
                       l_nj_sel = np.where(target != t_j)[0]
                       if len(l_nj_sel):
                           l_nj = (self.margin - dist[j][l_nj_sel]).exp().sum()
                       else:
                           l_nj = 0
   
                       # May be -inf but will be clipped by relu
                       l_n = (l_ni + l_nj).log()
   
                       # Positive component
                       l_p = dist[i, j]
   
                       # print(
                       #     f"t_{i} == t_{j}.\tl_n{i}={l_ni.asscalar()}\tl_n{j}={l_nj.asscalar()}\tl_p={l_p.asscalar()}"
                       # )
   
                       loss_sqrt = mx.nd.Activation(l_n + l_p, act_type="relu")
                       loss = loss + loss_sqrt * loss_sqrt
                       counter += 1
   
           return loss / (2 * counter)
   
   
   if __name__ == "__main__":
       import numpy as np
       np.random.seed(123)
   
       score = np.random.uniform(0, 1, (20, 3))
       target = np.random.choice(range(3), 20)
   
       loss = LiftedStructLossLoop()
   
       score = mx.nd.array(score)
       target = mx.nd.array(target)
       score.attach_grad()
       with mx.autograd.record():
           l = loss(score, target)
       l.backward()
       print(score.grad)
   ```
   
    Note that the use of np.where is necessary because mxnet doesn't support boolean array indexing; it should not have any effect on the gradient.
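
    For reference, a minimal NumPy-only sketch of that equivalence (the names here are illustrative, not taken from the code above): np.where(mask)[0] returns the integer positions of the True entries, so integer indexing selects exactly the same elements a boolean mask would.

    ```
    import numpy as np

    target = np.array([0, 1, 0, 2, 1])
    row = np.array([0.5, 1.2, 0.1, 2.0, 0.7])

    mask = target != 0            # boolean mask of the "negatives"
    idx = np.where(mask)[0]       # integer indices of the True entries -> [1, 3, 4]

    # Integer indexing and boolean indexing pick out exactly the same elements.
    assert np.array_equal(row[idx], row[mask])
    ```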
   
   The output is:
   ```
   [[        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [ 0.17029689 -0.03528368  0.06762911]
    [        nan         nan         nan]
    [ 0.02389014  0.17050645  0.10098324]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [        nan         nan         nan]
    [-0.24169818 -0.05233514  0.00246271]
    [        nan         nan         nan]
    [        nan         na
   ```
   
   
    Unrolling the loops and using matrix multiplication instead of slicing gives the following gradient with mxnet (a sketch of that mask-based reformulation follows the output):
   
   ```
   
   [[-0.08850744  0.08619909  0.09619249]
    [ 0.13997763  0.08896485 -0.17054076]
    [-0.18517525 -0.04558571  0.00597473]
    [ 0.01351711 -0.10314143  0.00795864]
    [ 0.07882202 -0.1590932  -0.18347597]
    [-0.0316371  -0.02517551  0.04251539]
    [ 0.12770234 -0.01681857 -0.08277885]
    [ 0.25015241  0.04632783 -0.12174347]
    [-0.07428657  0.11525114  0.19583233]
    [-0.09889268 -0.11168075 -0.0907627 ]
    [ 0.17029692 -0.03528369  0.06762912]
    [ 0.12002046 -0.11145654 -0.23262346]
    [ 0.02389011  0.17050643  0.10098325]
    [-0.01220375  0.05192697  0.12084311]
    [-0.05118883  0.06531101  0.07887366]
    [-0.06932686 -0.04811895  0.12089249]
    [ 0.04876971  0.02139962 -0.27124658]
    [-0.24169821 -0.05233523  0.00246269]
    [-0.09003269  0.06214742  0.17757085]
    [-0.0301991   0.00065519  0.135443  ]]
   
   ```
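
    Roughly, the slicing-free selection can be sketched as follows (a sketch of the idea only; the toy inputs are illustrative and this is not the exact unrolled code that produced the gradient above): the per-anchor sums over negatives are computed with a 0/1 mask and an elementwise product, so no index-array selection enters the autograd graph.

    ```
    import mxnet as mx
    import numpy as np

    margin = 1.0
    target_np = np.array([0, 1, 0, 2, 1])
    # Stand-in for the pairwise distance matrix `dist` computed above.
    dist = mx.nd.array(np.random.uniform(size=(5, 5)).astype("float32"))

    # neg_mask[i, k] == 1 iff target[i] != target[k]
    neg_mask = mx.nd.array((target_np[:, None] != target_np[None, :]).astype("float32"))

    # For every anchor i at once: sum over {k : target[k] != target[i]} of
    # exp(margin - dist[i, k]). The mask-and-sum replaces dist[i][l_ni_sel].
    row_neg_sum = ((margin - dist).exp() * neg_mask).sum(axis=1)

    # l_n for a positive pair (i, j) is then log(row_neg_sum[i] + row_neg_sum[j]).
    ```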
   
   
   
    The gradient above is correct, as can be confirmed with PyTorch (via https://gist.github.com/bkj/565c5e145786cfd362cffdbd8c089cf4):
   
   ```
   #!/usr/bin/env python
   """
       pytorch_lifted_loss.py
   """
   
   import torch
   import torch.nn as nn
   from torch.autograd import Variable
   
   
   def lifted_loss(score, target, margin=1):
       """
         Lifted loss, per "Deep Metric Learning via Lifted Structured Feature Embedding" by Song et al
         Implemented in `pytorch`
       """
   
       loss = 0
       counter = 0
   
       bsz = score.size(0)
       mag = (score**2).sum(1).expand(bsz, bsz)
       sim = score.mm(score.transpose(0, 1))
   
       dist = (mag + mag.transpose(0, 1) - 2 * sim)
       dist = torch.nn.functional.relu(dist).sqrt()
   
       for i in range(bsz):
           t_i = target[i].data[0]
   
           for j in range(i + 1, bsz):
               t_j = target[j].data[0]
   
               if t_i == t_j:
                   # Negative component
                   # !! Could do other things (like softmax that weights closer negatives)
                   l_ni = (margin - dist[i][target != t_i]).exp().sum()
                   l_nj = (margin - dist[j][target != t_j]).exp().sum()
                   l_n = (l_ni + l_nj).log()
   
                   # Positive component
                   l_p = dist[i, j]
   
                   loss += torch.nn.functional.relu(l_n + l_p)**2
                   counter += 1
   
       return loss / (2 * counter)
   
   
   # --
   
   if __name__ == "__main__":
       import numpy as np
       np.random.seed(123)
   
       score = np.random.uniform(0, 1, (20, 3))
       target = np.random.choice(range(3), 20)
   
       score = Variable(torch.FloatTensor(score), requires_grad=True)
       target = Variable(torch.LongTensor(target))
       l = lifted_loss(score, target)
       torch.autograd.backward(l)
       print(score.grad)
   ```
   
   ```
   Variable containing:
   -0.0885  0.0862  0.0962
    0.1400  0.0890 -0.1705
   -0.1852 -0.0456  0.0060
    0.0135 -0.1031  0.0080
    0.0788 -0.1591 -0.1835
   -0.0316 -0.0252  0.0425
    0.1277 -0.0168 -0.0828
    0.2502  0.0463 -0.1217
   -0.0743  0.1153  0.1958
   -0.0989 -0.1117 -0.0908
    0.1703 -0.0353  0.0676
    0.1200 -0.1115 -0.2326
    0.0239  0.1705  0.1010
   -0.0122  0.0519  0.1208
   -0.0512  0.0653  0.0789
   -0.0693 -0.0481  0.1209
    0.0488  0.0214 -0.2712
   -0.2417 -0.0523  0.0025
   -0.0900  0.0621  0.1776
   -0.0302  0.0007  0.1354
   [torch.FloatTensor of size 20x3]
   ```
   
   @sxjscience 
