Backpropagation (backward propagation of errors) is a widely used algorithm for training feedforward networks. It computes the gradient of the loss function with respect to the weights of the network. The main idea is to break a big function into small parts and use the partial derivatives of those parts to get the derivative of the whole function via the Chain Rule.
So training the model is basically solving this equation:
\begin{equation} loss(model(X), Target) = 0 \end{equation}
And because solving this directly can be very hard, backpropagation and gradient descent come to the rescue: gradient descent updates the weights by a small amount based on the gradient, moving in the direction that minimizes the loss.
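To make the gradient-descent part concrete, here is a minimal sketch of a single update step. The `gradient_descent_step` helper and the learning rate value are purely illustrative and are not part of the code we build below.

# a minimal sketch of one gradient-descent update, assuming a single
# scalar weight `w`; `grad` would come from backpropagation
learning_rate = 0.01

def gradient_descent_step(w, grad, lr=learning_rate):
    # move the weight a small amount against the gradient direction
    return w - lr * grad

# example: if the gradient of the loss w.r.t. w is 2.0,
# the weight decreases slightly
w = 0.5
w = gradient_descent_step(w, 2.0)  # -> 0.48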
All of this is based on the simple differentiation chain rule:
\begin{equation} \frac{df}{dx} = \frac{df}{dy}*\frac{dy}{dx} \end{equation}
For example, take a simple function with two nodes (operations): q = x + y and f = q * z.
Let's say we have 3 variables x=-2, y=5, z=-4. The result is f=-12, and our training target is -13, so the simple loss is loss = f - target = 1.
Now we need to propagate our error (1) back to x, y, z. To do this we need the partial derivatives of every function (operation):
\begin{align*} Multiplication:&\\ \frac{\partial(q * z)}{\partial q} &= z \\ \frac{\partial(q * z)}{\partial z} &= q \\ Summation:&\\ \frac{\partial(x + y)}{\partial x} &= 1 \\ \frac{\partial(x + y)}{\partial y} &= 1 \end{align*}
And our chain is:
\begin{align*} \frac{df}{dx} &= 1*(z*loss(1)) = 1 * -4 * 1 = -4 \\ \frac{df}{dy} &= 1*(z*loss(1)) = 1 * -4 * 1 = -4 \\ \frac{df}{dz} &= q*loss(1) = 3 * 1 = 3 \\ \end{align*}
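Here is a quick numeric check of these hand-derived values, reproducing the toy example above in plain Python:

# verify the hand-derived gradients for f = (x + y) * z
x, y, z = -2, 5, -4
q = x + y            # 3
f = q * z            # -12
loss_grad = 1        # loss = f - target = -12 - (-13) = 1

df_dq = z * loss_grad        # -4
df_dz = q * loss_grad        # 3
df_dx = 1 * df_dq            # -4
df_dy = 1 * df_dq            # -4
print(df_dx, df_dy, df_dz)   # -4 -4 3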
Here we will build a simple automatic differentiation example from the formulas above.
import numpy as np

class Tensor:
    def __init__(self, data, requires_grad=False):
        self.data = data
        if not isinstance(data, np.ndarray):
            self.data = np.array(data)
        # whether to run backpropagation or not
        self.requires_grad = requires_grad
        # tensor gradient
        self._grad = None
        # operation this tensor was produced by, if any
        self._grad_fn = None

    @property
    def shape(self):
        return self.data.shape

    @property
    def grad_fn(self):
        if not self.requires_grad:
            raise Exception('This tensor is not backpropagated')
        return self._grad_fn

    @property
    def grad(self):
        return self._grad

    def backward(self, grad=None):
        if not self.grad_fn:
            return False

        if grad is None and self._grad is None:
            # in case this is the final loss tensor
            grad = self.__class__(1., requires_grad=False)
        elif self.grad is not None:
            grad = self._grad

        if not self.requires_grad:
            raise Exception('This tensor is not backpropagated')

        self.grad_fn.backward(grad)
        return True

    def __str__(self):
        return f'Tensor({str(self.data)})'

    def add_grad(self, grad):
        # accumulate gradients when the tensor is used in several operations
        if self._grad is None:
            self._grad = grad
        else:
            self._grad += grad

    def __add__(self, o):
        if self.data is not None:
            self.data += o.data
            return self
        self.data = o.data
        return self
class Op:
    def forward(self):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError

    def __call__(self, *args):
        self.out = self.forward(*args)
        # remember the operation that produced the output tensor
        self.out._grad_fn = self
        return self.out
class AddOp(Op):
    '''Summation operation with 2 tensors'''
    def forward(self, x: Tensor, y: Tensor):
        self.x = x
        self.y = y
        # the resulting tensor should be backpropagated if at least one
        # of the inputs is backpropagated
        requires_grad = x.requires_grad or y.requires_grad
        return Tensor(x.data + y.data, requires_grad=requires_grad)

    def backward(self, grad):
        if self.x.requires_grad:
            # as this is a matrix operation, one of the parameters can
            # have a partial (broadcasted) shape; in such a scenario we
            # need to sum gradient values over the missing axis
            if self.x.shape != grad.shape:
                axis = np.argmax(np.abs(np.array(self.x.shape) -
                                        np.array(grad.shape)))
                self.x.add_grad(Tensor(grad.data.sum(axis=axis,
                                                     keepdims=True)))
            else:
                self.x.add_grad(grad)
            if self.x.grad_fn:
                self.x.backward()
        if self.y.requires_grad:
            if self.y.shape != grad.shape:
                axis = np.argmax(np.abs(np.array(self.y.shape) -
                                        np.array(grad.shape)))
                self.y.add_grad(Tensor(grad.data.sum(axis=axis,
                                                     keepdims=True)))
            else:
                self.y.add_grad(grad)
            if self.y.grad_fn:
                self.y.backward()
class MulOp(Op):
    '''Multiplication operation with 2 tensors'''
    def forward(self, x: Tensor, y: Tensor):
        self.x = x
        self.y = y
        requires_grad = x.requires_grad or y.requires_grad
        return Tensor(x.data * y.data, requires_grad=requires_grad)

    def backward(self, grad):
        if self.x.requires_grad:
            # d(x*y)/dx = y, so the incoming gradient is scaled by y
            self.x.add_grad(Tensor(grad.data * self.y.data, False))
            if self.x.grad_fn:
                self.x.backward()
        if self.y.requires_grad:
            # d(x*y)/dy = x
            self.y.add_grad(Tensor(grad.data * self.x.data, False))
            if self.y.grad_fn:
                self.y.backward()
Now let's see what happens:
x = np.random.random((3,3))
a = np.random.random((3,3))
b = np.random.random((3,))
loss = np.random.random((3,3))
_x = Tensor(x,requires_grad=True)
_a = Tensor(a,requires_grad=True)
_b = Tensor(b,requires_grad=True)
# here we use the parameter _a twice, in different operations,
# which means the gradient will be computed twice for _a,
# so the two gradients have to be summed to get the resulting
# gradient for _a
y = MulOp()(
    AddOp()(
        MulOp()(
            _x, _a
        ), _b
    ), _a
)
print('y = ', y)
print()
print('y.grad_fn = ', y.grad_fn)
print()
y.backward(Tensor(loss))
print('_x.grad = ',_x.grad)
print()
print('_a.grad = ',_a.grad)
print()
print('_b.grad = ',_b.grad)
print()
print('y.grad = ',y.grad)
y = Tensor([[0.45119641 0.25401614 0.36177802]
[0.2445061 0.22747291 0.41594346]
[0.35465603 0.26046455 1.26796941]])
y.grad_fn = <__main__.MulOp object at 0x7f169e332358>
_x.grad = Tensor([[0.08710775 0.00380129 0.14389783]
[0.09573756 0.05524222 0.07022905]
[0.03903106 0.09778539 0.79086884]])
_a.grad = Tensor([[0.29088065 0.03449309 0.90931509]
[0.44720764 0.85727129 0.14349873]
[0.05794243 0.58307861 1.63786025]])
_b.grad = Tensor([[0.45979957 0.48953863 1.26903921]])
y.grad = None
Checking with PyTorch Autograd
import torch

_x = torch.tensor(x,requires_grad=True)
_a = torch.tensor(a,requires_grad=True)
_b = torch.tensor(b,requires_grad=True)
y = (_a*_x + _b) * _a
print('y = ', y)
print()
print('y.grad_fn = ', y.grad_fn)
print()
y.backward(torch.tensor(loss))
print('_x.grad = ',_x.grad)
print()
print('_a.grad = ',_a.grad)
print()
print('_b.grad = ',_b.grad)
print()
print('y.grad = ',y.grad)
y = tensor([[0.4512, 0.2540, 0.3618],
[0.2445, 0.2275, 0.4159],
[0.3547, 0.2605, 1.2680]], dtype=torch.float64, grad_fn=<MulBackward0>)
y.grad_fn = <MulBackward0 object at 0x...>
_x.grad = tensor([[0.0871, 0.0038, 0.1439],
[0.0957, 0.0552, 0.0702],
[0.0390, 0.0978, 0.7909]], dtype=torch.float64)
_a.grad = tensor([[0.2909, 0.0345, 0.9093],
[0.4472, 0.8573, 0.1435],
[0.0579, 0.5831, 1.6379]], dtype=torch.float64)
_b.grad = tensor([0.4598, 0.4895, 1.2690], dtype=torch.float64)
y.grad = None
As you can see, we got the same gradients as in our simple autograd solution.
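If you want to compare the two runs programmatically, np.allclose does the job. This is just a sketch: the names my_x_grad, my_a_grad and my_b_grad are assumed to be copies of _x.grad.data, _a.grad.data and _b.grad.data saved before re-creating _x, _a, _b as torch tensors.

# sketch of a programmatic comparison of our gradients vs. PyTorch's
assert np.allclose(my_x_grad, _x.grad.numpy())
assert np.allclose(my_a_grad, _a.grad.numpy())
# our _b gradient has shape (1, 3) because of keepdims=True, so flatten it
assert np.allclose(my_b_grad.reshape(-1), _b.grad.numpy())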
For simplicity we will not use autograd here; instead we will derive the gradient of each layer by hand.
class Layer:
    def forward(self):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError

    def __call__(self, *args):
        return self.forward(*args)

class Sigmoid(Layer):
    def forward(self, x):
        # keep the output: the sigmoid derivative is expressed through it
        self.out = 1/(1+np.exp(-x))
        return self.out

    def backward(self, grad):
        # d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x))
        grad_input = self.out*(1-self.out) * grad
        return grad_input

class Relu(Layer):
    def forward(self, x):
        self.x = x
        return np.maximum(np.zeros_like(x), x)

    def backward(self, grad):
        # the gradient passes only where the input was positive
        grad_input = (self.x > 0) * grad
        return grad_input
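A quick way to sanity-check a backward implementation is a numerical gradient check. Below is a minimal sketch for the Relu layer (the eps value and the random input are arbitrary choices for illustration; the check for Sigmoid is analogous):

# numerical gradient check for Relu
eps = 1e-6
layer = Relu()
x = np.random.randn(4, 3)      # random floats are essentially never exactly 0
grad_out = np.ones_like(x)

# analytic gradient from our backward()
layer.forward(x)
analytic = layer.backward(grad_out)

# numerical gradient: perturb each input element by +/- eps
numeric = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    x_plus, x_minus = x.copy(), x.copy()
    x_plus[idx] += eps
    x_minus[idx] -= eps
    numeric[idx] = (layer.forward(x_plus).sum() -
                    layer.forward(x_minus).sum()) / (2*eps)

print(np.allclose(analytic, numeric, atol=1e-4))  # expect True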
class SoftmaxCrossentropyWithLogits(Layer):
    def forward(self, x, y):
        self.x = x
        self.y = y
        exps = np.exp(x)
        self.softmax = exps / np.sum(exps, axis=-1, keepdims=True)
        # probability assigned to the correct class of each sample
        probs = self.softmax[np.arange(x.shape[0]), y]
        log_likelihood = -np.log(probs)
        loss = np.sum(log_likelihood) / x.shape[0]
        return loss

    def backward(self, grad=None):
        batch = self.x.shape[0]
        # d(loss)/d(logits) = softmax - one_hot(y), averaged over the batch
        grad = self.softmax
        grad[np.arange(batch), self.y] -= 1
        grad = grad/batch
        return grad
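A tiny usage example of this loss; the logits and labels below are made up just to illustrate the shapes:

# toy batch of 2 samples with 3 classes each (made-up numbers)
logits = np.array([[2.0, 1.0, 0.1],
                   [0.5, 2.5, 0.3]])
labels = np.array([0, 1])

ce = SoftmaxCrossentropyWithLogits()
print(ce(logits, labels))   # scalar loss averaged over the batch
print(ce.backward())        # gradient w.r.t. the logits, shape (2, 3)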
class MSE(Layer):
    def forward(self, x, y):
        self.x = x
        self.y = y
        # 1/(2n) * (x - y)**2, element-wise
        return ((x - y)**2) / (self.x.shape[0]*2)

    def backward(self, grad=None):
        # loss = 1/(2n) * (x - y)**2
        # d(loss)/dx = 2*(x - y) / (2n) = (x - y) / n
        return (self.x - self.y) / self.x.shape[0]
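And a quick check that the MSE gradient matches the formula in the comment, using a made-up prediction/target pair:

# made-up prediction/target pair with n = 2 samples
pred = np.array([[1.0], [3.0]])
target = np.array([[0.5], [5.0]])

mse = MSE()
print(mse(pred, target))   # element-wise (pred - target)**2 / (2*n)
print(mse.backward())      # (pred - target) / n = [[0.25], [-1.0]]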
class Linear(Layer):
    def __init__(self, input, output, lr=0.0001):
        # weights initialized uniformly in [-1, 1)
        self.A = 2*np.random.random((input, output)) - 1
        self.b = 2*np.random.random((output)) - 1
        self.lr = lr

    def forward(self, x):
        self.x = x
        return np.dot(x, self.A) + self.b

    def backward(self, grad):
        # d_layer / db = 1, summed over the batch
        b_grad = grad.mean(axis=0)*self.x.shape[0]
        # d_layer / dA = x
        A_grad = np.dot(self.x.T, grad)
        # compute df / dx = df / d_layer * d_layer / dx
        # df / d_layer == grad
        grad_input = np.dot(grad, self.A.T)
        # as this layer has some weights, we need to update them using
        # gradient descent
        self.A -= A_grad * self.lr
        self.b -= b_grad * self.lr
        return grad_input
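The same numerical check can be applied to grad_input of the Linear layer. This is a sketch with arbitrary sizes; lr=0.0 is used only so that backward() does not also update the weights during the check:

# numerical check of Linear's input gradient
eps = 1e-6
lin = Linear(3, 2, lr=0.0)
x = np.random.randn(4, 3)
grad_out = np.ones((4, 2))

lin.forward(x)
analytic = lin.backward(grad_out)   # np.dot(grad_out, A.T)

numeric = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    x_plus, x_minus = x.copy(), x.copy()
    x_plus[idx] += eps
    x_minus[idx] -= eps
    numeric[idx] = (lin.forward(x_plus).sum() -
                    lin.forward(x_minus).sum()) / (2*eps)

print(np.allclose(analytic, numeric, atol=1e-4))  # expect True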
class Model(Layer):
    def __init__(self, lr=0.0001):
        self.lr = lr
        self.layers = [
            Linear(3, 15, lr=self.lr),
            Relu(),
            Linear(15, 1, lr=self.lr)
        ]

    def forward(self, x):
        for l in self.layers:
            x = l(x)
        return x

    def backward(self, grad):
        for l in self.layers[::-1]:
            grad = l.backward(grad)
        return grad
mm = Model()

def yf(x1, x2, x3):
    return np.array([2*x1 + 3*x2 + 4*x3 + 5], dtype=np.float32)

loss = MSE()

for i in range(20000):
    x1 = np.random.random()*30
    x2 = np.random.random()*20
    x3 = np.random.random()*11
    y = mm(np.array([[x1, x2, x3]]))
    err = loss(y, yf(x1, x2, x3))
    yb = loss.backward()
    yb = mm.backward(yb)
    if i % 100 == 0:
        print(err)
        print('VAL Target:', yf(1, 2, 3), 'Res:',
              mm(np.array([[1, 2, 3]])), 'Loss:',
              loss(yf(1, 2, 3), mm(np.array([[1, 2, 3]]))))
        print('---------')
...
---------
[[0.02075696]]
VAL Target: [25.] Res: [[23.61075629]] Loss: [[0.96499904]]
---------
[[0.24674584]]
VAL Target: [25.] Res: [[23.67659836]] Loss: [[0.87569595]]
---------
[[0.10659916]]
VAL Target: [25.] Res: [[23.60292783]] Loss: [[0.97590533]]
---------
[[0.68391024]]
VAL Target: [25.] Res: [[23.60518007]] Loss: [[0.97276132]]
---------
[[0.07534666]]
VAL Target: [25.] Res: [[23.61488175]] Loss: [[0.95927629]]
---------
Now let's solve a classification task on the MNIST dataset. We will use some PyTorch utilities to make data loading simpler.
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader
class Model(Layer):
    def __init__(self, lr=0.00001):
        self.lr = lr
        self.layers = [
            Linear(784, 100, lr=self.lr),
            Relu(),
            Linear(100, 200, lr=self.lr),
            Relu(),
            Linear(200, 10, lr=self.lr)
        ]

    def forward(self, x):
        for l in self.layers:
            x = l(x)
        return x

    def backward(self, grad):
        for l in self.layers[::-1]:
            grad = l.backward(grad)
        return grad
simple = transforms.Compose([
    transforms.ToTensor(),  # converts to [0,1] interval
])

ds = MNIST('./mnist', download=True, transform=simple)
ld = DataLoader(ds, batch_size=2, pin_memory=True, drop_last=True)
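To see why the model's first layer takes 784 inputs, it helps to peek at a single batch (a quick sketch; these are the shapes MNIST with ToTensor yields):

# peek at one batch to understand the input shape
img, label = next(iter(ld))
print(img.shape, label.shape)   # torch.Size([2, 1, 28, 28]) torch.Size([2])
# each image is flattened to 1*28*28 = 784 features before the first Linear layer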
mm = Model()
loss = SoftmaxCrossentropyWithLogits()
_loss_avg = 0

for e in range(5):
    for i, (img, label) in enumerate(ld):
        x = img.view(2, -1).numpy()
        res = mm(x)
        _loss = loss(res, label.numpy())
        _loss_avg += _loss.mean()  # running loss mean
        grad = loss.backward(1)
        mm.backward(grad)
        if i % 100 == 0:
            print(_loss_avg/100)
            _loss_avg = 0
            print('---------')
import matplotlib.pyplot as plt

for i in range(10):
    img, target = ds[i]
    plt.imshow(img[0])
    plt.show()
    x = img.view(1, -1).numpy()
    res = mm(x)[0]
    pred = np.argmax(res)
    print(f'target: {target} predicted: {pred}')
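To get a rough accuracy number instead of eyeballing individual digits, you can run something like this sketch over the first part of the dataset (the 1000-sample cut-off is arbitrary):

# rough accuracy estimate over the first 1000 images
correct = 0
n = 1000
for i in range(n):
    img, target = ds[i]
    x = img.view(1, -1).numpy()
    pred = np.argmax(mm(x)[0])
    correct += int(pred == target)
print('accuracy:', correct / n)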
And that's all. Don't forget that you can run this code in Google Colab by clicking the "Open in Colab" button.