You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/15 07:28:44 UTC
[GitHub] leezu commented on issue #8337: mx.autograd.grad works or fails depending on use of slices
leezu commented on issue #8337: mx.autograd.grad works or fails depending on use of slices
URL: https://github.com/apache/incubator-mxnet/issues/8337#issuecomment-357602028
@piiswrong did you look into this yet?
Here is some more example code that fails due to slicing:
```
import mxnet as mx
class LiftedStructLossLoop(mx.gluon.Block):
    """Computes the lifted structured loss.

    The loss encourages the positive distances (between a pair of embeddings
    with the same labels) to be smaller than any negative distances (between a
    pair of embeddings with different labels) in the mini-batch in a way that
    is differentiable with respect to the embedding vectors. See:
    https://arxiv.org/abs/1511.06452.

    This is the explicit double-loop formulation: every pair ``(i, j)`` with
    ``i < j`` and equal labels contributes one squared, ReLU-clipped term.
    """

    # Margin from which negative distances are subtracted before exp().
    margin: float = 1.0

    def forward(self, score, target):
        """Return the lifted structured loss averaged over positive pairs.

        Args:
            score: NDArray of embeddings, one row per sample (it is multiplied
                with its own transpose, so it must be 2-D).
            target: NDArray of labels; it is converted with ``asnumpy()`` and
                flattened, so it carries no gradient.

        Returns:
            The summed squared terms divided by ``2 * number_of_positive_pairs``.

        Raises:
            ZeroDivisionError: if no two samples share a label.
            AttributeError: if a positive pair has no negatives at all (every
                label is equal), because both negative sums are then the
                plain integer ``0``, which has no ``log`` method.
        """
        # Local import: the script only imports numpy under the ``__main__``
        # guard, so ``np`` would be undefined when this class is imported
        # from another module.
        import numpy as np

        loss = 0
        counter = 0
        bsz = score.shape[0]

        # Pairwise Euclidean distances via |a|^2 + |b|^2 - 2 a.b; the relu
        # clips the squared distances at zero before the square root.
        mag = (score**2).sum(1).broadcast_to((bsz, bsz))
        sim = mx.nd.dot(score, score.T)
        dist = (mag + mag.T - 2 * sim)
        dist = mx.nd.Activation(dist, act_type="relu").sqrt()

        target = target.asnumpy().flatten()
        for i in range(bsz):
            t_i = target[i]
            for j in range(i + 1, bsz):
                t_j = target[j]
                if t_i == t_j:
                    # Negative component: integer index arrays stand in for
                    # a boolean mask when selecting the negatives of i and j.
                    l_ni_sel = np.where(target != t_i)[0]
                    if len(l_ni_sel):
                        l_ni = (self.margin - dist[i][l_ni_sel]).exp().sum()
                    else:
                        l_ni = 0
                    l_nj_sel = np.where(target != t_j)[0]
                    if len(l_nj_sel):
                        l_nj = (self.margin - dist[j][l_nj_sel]).exp().sum()
                    else:
                        l_nj = 0
                    # May be -inf but will be clipped by relu
                    l_n = (l_ni + l_nj).log()
                    # Positive component
                    l_p = dist[i, j]
                    loss_sqrt = mx.nd.Activation(l_n + l_p, act_type="relu")
                    loss = loss + loss_sqrt * loss_sqrt
                    counter += 1
        return loss / (2 * counter)
if __name__ == "__main__":
    import numpy as np

    # Fixed seed so the printed gradient is reproducible.
    np.random.seed(123)
    embeddings = mx.nd.array(np.random.uniform(0, 1, (20, 3)))
    labels = mx.nd.array(np.random.choice(range(3), 20))

    # Ask autograd to allocate a gradient buffer for the embeddings.
    embeddings.attach_grad()

    criterion = LiftedStructLossLoop()
    with mx.autograd.record():
        loss_value = criterion(embeddings, labels)
    loss_value.backward()

    print(embeddings.grad)
```
Note that np.where is needed because MXNet does not support boolean array indexing; using integer indices instead should not have any effect on the gradient.
The output is:
```
[[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ 0.17029689 -0.03528368 0.06762911]
[ nan nan nan]
[ 0.02389014 0.17050645 0.10098324]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[ nan nan nan]
[-0.24169818 -0.05233514 0.00246271]
[ nan nan nan]
[ nan na
```
Unrolling the loops and using matrix multiplication will give us the following gradient with mxnet:
```
[[-0.08850744 0.08619909 0.09619249]
[ 0.13997763 0.08896485 -0.17054076]
[-0.18517525 -0.04558571 0.00597473]
[ 0.01351711 -0.10314143 0.00795864]
[ 0.07882202 -0.1590932 -0.18347597]
[-0.0316371 -0.02517551 0.04251539]
[ 0.12770234 -0.01681857 -0.08277885]
[ 0.25015241 0.04632783 -0.12174347]
[-0.07428657 0.11525114 0.19583233]
[-0.09889268 -0.11168075 -0.0907627 ]
[ 0.17029692 -0.03528369 0.06762912]
[ 0.12002046 -0.11145654 -0.23262346]
[ 0.02389011 0.17050643 0.10098325]
[-0.01220375 0.05192697 0.12084311]
[-0.05118883 0.06531101 0.07887366]
[-0.06932686 -0.04811895 0.12089249]
[ 0.04876971 0.02139962 -0.27124658]
[-0.24169821 -0.05233523 0.00246269]
[-0.09003269 0.06214742 0.17757085]
[-0.0301991 0.00065519 0.135443 ]]
```
This is the correct gradient, as can be confirmed with the following PyTorch implementation (via https://gist.github.com/bkj/565c5e145786cfd362cffdbd8c089cf4):
```
#!/usr/bin/env python
"""
pytorch_lifted_loss.py
"""
import torch
import torch.nn as nn
from torch.autograd import Variable
def lifted_loss(score, target, margin=1):
    """
    Lifted loss, per "Deep Metric Learning via Lifted Structured Feature Embedding" by Song et al
    Implemented in `pytorch`

    Args:
        score: 2-D tensor of embeddings, one row per sample (it is multiplied
            with its own transpose).
        target: tensor of labels, indexed by sample -- assumed 1-D with one
            entry per row of `score`.
        margin: value from which negative distances are subtracted before
            exponentiation.

    Returns:
        The summed squared, ReLU-clipped terms over all positive pairs,
        divided by twice the number of positive pairs.

    Raises:
        ZeroDivisionError: if no two samples share a label.
    """
    # Read the labels out as plain Python numbers once. Indexing a 1-D tensor
    # yields a 0-dim tensor on current PyTorch, so `target[i].data[0]` raises
    # IndexError there.
    labels = target.data.tolist()

    loss = 0
    counter = 0

    bsz = score.size(0)
    # Pairwise Euclidean distances via |a|^2 + |b|^2 - 2 a.b; the relu clips
    # the squared distances at zero before the square root.
    mag = (score**2).sum(1).expand(bsz, bsz)
    sim = score.mm(score.transpose(0, 1))

    dist = (mag + mag.transpose(0, 1) - 2 * sim)
    dist = torch.nn.functional.relu(dist).sqrt()

    for i in range(bsz):
        t_i = labels[i]
        # A positive pair shares its label, so i and j have the same
        # negatives; the mask only depends on i.
        negatives = target != t_i

        for j in range(i + 1, bsz):
            if labels[j] != t_i:
                continue

            # Negative component
            # !! Could do other things (like softmax that weights closer negatives)
            l_ni = (margin - dist[i][negatives]).exp().sum()
            l_nj = (margin - dist[j][negatives]).exp().sum()
            # May be -inf when there are no negatives; the relu clips it.
            l_n = (l_ni + l_nj).log()

            # Positive component
            l_p = dist[i, j]

            loss = loss + torch.nn.functional.relu(l_n + l_p)**2
            counter += 1

    return loss / (2 * counter)
# --
if __name__ == "__main__":
    import numpy as np

    # Fixed seed so the printed gradient is reproducible.
    np.random.seed(123)
    raw_score = np.random.uniform(0, 1, (20, 3))
    raw_target = np.random.choice(range(3), 20)

    embeddings = Variable(torch.FloatTensor(raw_score), requires_grad=True)
    labels = Variable(torch.LongTensor(raw_target))

    loss_value = lifted_loss(embeddings, labels)
    torch.autograd.backward(loss_value)

    print(embeddings.grad)
```
```
Variable containing:
-0.0885 0.0862 0.0962
0.1400 0.0890 -0.1705
-0.1852 -0.0456 0.0060
0.0135 -0.1031 0.0080
0.0788 -0.1591 -0.1835
-0.0316 -0.0252 0.0425
0.1277 -0.0168 -0.0828
0.2502 0.0463 -0.1217
-0.0743 0.1153 0.1958
-0.0989 -0.1117 -0.0908
0.1703 -0.0353 0.0676
0.1200 -0.1115 -0.2326
0.0239 0.1705 0.1010
-0.0122 0.0519 0.1208
-0.0512 0.0653 0.0789
-0.0693 -0.0481 0.1209
0.0488 0.0214 -0.2712
-0.2417 -0.0523 0.0025
-0.0900 0.0621 0.1776
-0.0302 0.0007 0.1354
[torch.FloatTensor of size 20x3]
```
@sxjscience
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services