Go to Practical Course

# Making Backpropagation, Autograd, MNIST Classifier from scratch in Python

Simple practical examples to give you a good understanding of how all these NN/AI things really work.

Backpropagation (backward propagation of errors) is a widely used algorithm for training feedforward networks. It computes the gradient of the loss function with respect to the weights of the network. The main idea is to break a big function into small parts and to combine their partial derivatives with the chain rule to obtain the derivative of the whole function.

So training the model is basically solving this equation:

$$loss(model(X), Target) = 0$$

Because solving this directly can be a very hard task, we use backpropagation together with gradient descent (updating the weights by a small amount, based on the gradient, in the direction that minimizes the loss).

All of this is based on the simple chain rule of differentiation:

$$\frac{df}{dx} = \frac{df}{dy}*\frac{dy}{dx}$$

For example, take a simple function with 2 nodes (operations), a summation followed by a multiplication: $f = q * z$, where $q = x + y$.

Let's say we have 3 variables x=-2, y=5, z=-4; the result will be f=-12, and our target for training is -13. A simple loss is then loss = f - target = 1.

Now we need to propagate our error (1) back to x, y and z. To do this we need to calculate the partial derivatives of every function (operation):

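\begin{align*} \text{Multiplication:}&\\ \frac{\partial (q * z)}{\partial q} &= 1 * z = z \\ \frac{\partial (q * z)}{\partial z} &= q * 1 = q \\ \text{Summation:}&\\ \frac{\partial (x + y)}{\partial x} &= 1 \\ \frac{\partial (x + y)}{\partial y} &= 1 \end{align*}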

And our chain is:

\begin{align*} \frac{df}{dx} &= 1 * (z * loss) = 1 * (-4) * 1 = -4 \\ \frac{df}{dy} &= 1 * (z * loss) = 1 * (-4) * 1 = -4 \\ \frac{df}{dz} &= q * loss = 3 * 1 = 3 \\ \end{align*}

### Let's build a simple Autograd example

Here we will build a simple automatic differentiation example from the above formulas.


# Array-backed value used by the autograd example; the payload lives in `data`.
# NOTE(review): this copy of the class is incomplete -- several `def` headers
# (including the constructor's), some statement bodies and all indentation
# are missing. The remaining fragments are kept verbatim below; restore the
# lost lines from the original notebook before running this code.
class Tensor:

# Store the payload, converting anything that is not already an ndarray.
self.data = data
if not isinstance(data, np.ndarray):
self.data = np.array(data)
# whether to run backpropagation or not
# operation if this tensor was used in it
# NOTE(review): the assignments the two comments above describe are missing.

# Shape of the wrapped NumPy array.
@property
def shape(self):
return self.data.shape

# NOTE(review): the `def` line and the guarding condition of this property
# are missing; only the error raised for a non-backpropagated tensor is left.
@property
raise Exception('This tensor is not backpropagated')

# NOTE(review): the `def` line and most of the body of this property are
# missing.
@property

return False

# in case if this is last loss tensor
# NOTE(review): the method header and the surrounding conditions are missing
# here; what remains raises for a non-backpropagated tensor and otherwise
# returns True.

raise Exception('This tensor is not backpropagated')

return True

# Human-readable form: the wrapped array rendered inside `Tensor(...)`.
def __str__(self):
return f'Tensor({str(self.data)})'

# NOTE(review): the `if` branch that this `else:` belongs to is missing.
else:

# Add `o.data` into the existing data in place when data is present,
# otherwise adopt `o.data`; `self` is returned either way.
# NOTE(review): the enclosing method header is missing -- presumably an
# in-place add used to accumulate gradients; confirm against the original.
if self.data is not None:
self.data += o.data
return self
self.data = o.data
return self

class Op:
    """Base class for the operations of the autograd example.

    Subclasses override ``forward`` (and the gradient pass). Calling an
    instance runs ``forward`` with the given arguments and keeps the result
    in ``self.out``.
    """

    def forward(self):
        """Compute the result of the operation; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it fails with ``TypeError``. ``NotImplementedError`` is the
        # exception meant for "subclass must override".
        raise NotImplementedError

    # NOTE(review): the ``def`` line of this stub was missing from the source;
    # the name and empty signature mirror ``forward`` -- confirm against the
    # original notebook.
    def backward(self):
        """Propagate gradients to the operation's inputs; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def __call__(self, *args):
        """Run ``forward(*args)``, store the result in ``self.out`` and return it."""
        self.out = self.forward(*args)
        return self.out

# NOTE(review): the `class` header of this operation and parts of its methods
# are missing from this copy: the end of `forward` (creation and return of the
# result), the `def` line of the gradient pass, the `if` conditions belonging
# to the `else:` branches, and the middle of both `axis = ...` expressions.
# The remaining fragments are kept verbatim.
'''Sumation operation with 2 tensors'''

# Remember both operands so the gradient pass can reach them later.
def forward(self, x: Tensor, y: Tensor):
self.x = x
self.y = y
# The created tensor should be backpropagated if at least one
# of the inputs is backpropagated.
# NOTE(review): the code this comment describes is missing here.

# Because this is an array operation, one of the operands can have a
# partial shape; in that scenario the gradient values have to be summed
# over the missing axis.
# NOTE(review): judging by what is left, the axis is picked by comparing an
# operand's shape with another shape via argmax(abs(difference)); the
# second shape and the summation call are not visible -- confirm.
axis = np.argmax(np.abs(np.array(self.x.shape) -
keepdims=True)))
else:
self.x.backward()
# Same handling for the second operand.
axis = np.argmax(np.abs(np.array(self.y.shape) -
keepdims=True)))
else:
self.y.backward()

# NOTE(review): this copy of the class is incomplete -- `forward` lacks the
# lines that build and return its result, and the `def` line of the gradient
# pass (plus whatever preceded the two `backward()` calls) is missing.
class MulOp(Op):

'''Multiplication operation with 2 tensors'''

# Remember both operands so the gradient pass can reach them later.
def forward(self, x: Tensor, y: Tensor):
self.x = x
self.y = y

# Continue the backward pass into both operands.
self.x.backward()
self.y.backward()


Now let's see what happens:


# Random inputs for the demo: two 3x3 arrays and a length-3 vector.
x = np.random.random((3,3))
a = np.random.random((3,3))
b = np.random.random((3,))

# Random 3x3 array that is passed to `y.backward` below as the incoming
# gradient.
loss = np.random.random((3,3))

# NOTE(review): the lines that define `_x`, `_a` and `_b` (presumably Tensor
# wrappers around x, a and b) are missing from this copy.

# Here we use parameter _a 2 times in different operations.
# That means the gradient will be calculated twice for _a,
# and thus we need to sum the two gradients to get the result.
# NOTE(review): the nested call below has unbalanced parentheses; one call
# line (presumably the summation op wrapping the inner MulOp and `_b`) is
# missing -- confirm against the original notebook.
y = MulOp()(
MulOp()(
_x, _a
), _b
), _a
)
print('y = ', y)
# NOTE(review): the bare print() calls in this cell look like prints that
# lost their arguments; as written they only emit blank lines.
print()
print()
# Start backpropagation from y with the random array as the upstream gradient.
y.backward(Tensor(loss))
print()
print()
print()



y =  Tensor([[0.45119641 0.25401614 0.36177802]
[0.2445061  0.22747291 0.41594346]
[0.35465603 0.26046455 1.26796941]])

y.grad_fn =  <__main__.MulOp object at 0x7f169e332358>

[0.09573756 0.05524222 0.07022905]
[0.03903106 0.09778539 0.79086884]])

[0.44720764 0.85727129 0.14349873]
[0.05794243 0.58307861 1.63786025]])




# The same expression evaluated with PyTorch, for comparison with the custom
# implementation.
# NOTE(review): the lines creating `_x`, `_a` and `_b` as torch tensors are
# missing from this copy.
y = (_a*_x + _b) * _a
print('y = ', y)
# NOTE(review): the bare print() calls look like prints that lost their
# arguments; as written they only emit blank lines.
print()
print()
# Backpropagate with the same random upstream gradient as above.
y.backward(torch.tensor(loss))
print()
print()
print()



y =  tensor([[0.4512, 0.2540, 0.3618],
[0.2445, 0.2275, 0.4159],

[0.0957, 0.0552, 0.0702],
[0.0390, 0.0978, 0.7909]], dtype=torch.float64)

[0.4472, 0.8573, 0.1435],
[0.0579, 0.5831, 1.6379]], dtype=torch.float64)

_b.grad =  tensor([0.4598, 0.4895, 1.2690], dtype=torch.float64)



As you can see, we got the same gradients as in our simple autograd solution.

### Let's build Neural Networks and solve 2 simple tasks

For simplicity we will not use autograd here; instead we will work out the derivative for each layer by hand.


class Layer:
    """Base class for the network layers and loss functions below.

    Subclasses override ``forward`` (and the gradient pass). Calling an
    instance is shorthand for calling ``forward`` with the same arguments.
    """

    def forward(self):
        """Compute the layer output; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it fails with ``TypeError``. ``NotImplementedError`` is the
        # exception meant for "subclass must override".
        raise NotImplementedError

    # NOTE(review): the ``def`` line of this stub was missing from the source;
    # the name and empty signature mirror ``forward`` -- confirm against the
    # original notebook.
    def backward(self):
        """Propagate the gradient through the layer; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def __call__(self, *args):
        """Return ``self.forward(*args)``."""
        return self.forward(*args)

class Sigmoid(Layer):
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``.

    Derives from ``Layer`` like the other layers in this file, so an instance
    can be called directly (``Sigmoid()(x)``) instead of only through
    ``forward``.
    """

    def forward(self, x):
        """Remember the input in ``self.x`` and return its sigmoid.

        Args:
            x: value accepted by ``np.exp`` (a NumPy array or a number).

        Returns:
            ``1 / (1 + np.exp(-x))``.
        """
        self.x = x
        return 1/(1+np.exp(-x))

class Relu(Layer):
    """Rectified linear activation: the element-wise maximum of 0 and the input."""

    def forward(self, x):
        """Remember the input in ``self.x`` and return ``max(0, x)`` element-wise.

        Args:
            x: array-like input.

        Returns:
            Array holding the element-wise maximum of zeros and ``x``.
        """
        self.x = x
        floor = np.zeros_like(x)
        # Argument order (zeros first) is kept as in the original call.
        return np.maximum(floor, x)

class SoftmaxCrossentropyWithLogits(Layer):
    """Softmax followed by mean cross-entropy, computed from raw scores."""

    def forward(self, x, y):
        """Return the mean cross-entropy of scores ``x`` for the classes ``y``.

        Args:
            x: array of raw scores; rows are indexed by ``np.arange(x.shape[0])``
                and the softmax is taken over the last axis.
            y: integer class index for each row of ``x``.

        Returns:
            The sum of the per-row negative log-likelihoods divided by
            ``x.shape[0]``.

        Side effects:
            Stores ``x``, ``y`` and the softmax probabilities on ``self``.
        """
        self.x = x
        self.y = y

        # Softmax does not change when a per-row constant is subtracted, so
        # shift by the row maximum: exp() then never sees a positive argument
        # and cannot overflow on large scores.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        norm = np.sum(exps, axis=-1, keepdims=True)
        self.softmax = exps / norm

        # -log(softmax[i, y_i]) written as log(sum exp) - shifted[i, y_i]; this
        # avoids log(0) when the probability of the true class underflows.
        rows = np.arange(x.shape[0])
        log_likelihood = np.log(norm[rows, 0]) - shifted[rows, y]
        loss = np.sum(log_likelihood) / x.shape[0]
        return loss

batch = self.x.shape[0]

class MSE(Layer):
    """Squared-error loss scaled by ``1 / (2n)``, with ``n = x.shape[0]``."""

    def forward(self, x, y):
        """Return the element-wise squared error divided by ``2n``.

        No reduction is applied: the result has the broadcast shape of
        ``x - y``.

        Args:
            x: predictions; ``x.shape[0]`` is used as ``n``.
            y: targets, broadcastable against ``x``.

        Side effects:
            Stores ``x`` and ``y`` on ``self`` for the gradient pass.
        """
        self.x = x
        self.y = y
        return ((x - y)**2) / (self.x.shape[0]*2)

    # NOTE(review): the ``def`` line of this method was missing from the
    # source. The training loop calls ``loss.backward()`` without arguments;
    # the optional ``grad`` parameter is an assumption and is not used --
    # confirm against the original notebook.
    def backward(self, grad=None):
        """Return the gradient of the loss with respect to ``x``: ``(x - y) / n``."""
        # loss       = 1/(2n) * (x - y)**2
        #            = 1/(2n) * (x**2 - 2*x*y + y**2)
        # d loss/dx  = (2x - 2y) / (2n) = (x - y) / n
        return (self.x - self.y) / self.x.shape[0]

class Linear(Layer):
    """Affine layer: ``x . A + b`` with randomly initialised ``A`` and ``b``."""

    def __init__(self, input, output, lr=0.0001):
        """Create the weights, drawn uniformly from ``[-1, 1)``.

        Args:
            input: number of input features (rows of ``A``).
            output: number of output features (columns of ``A``, length of ``b``).
            lr: learning rate kept on the layer as ``self.lr``.
        """
        weight_shape = (input, output)
        # np.random.random yields values in [0, 1); 2*r - 1 maps them to [-1, 1).
        self.A = 2*np.random.random(weight_shape) - 1
        self.b = 2*np.random.random((output)) - 1
        self.lr = lr

    def forward(self, x):
        """Remember the input in ``self.x`` and return ``np.dot(x, A) + b``."""
        self.x = x
        projected = np.dot(x,self.A)
        return projected + self.b

# d_layer / db = 1
# d_layer / dA = x
# As this layer has some weights we need to update them using
# compute df / dx = df / d_layer * d_layer / dx
# df / d_layer == grad



#### Running simple point interpolation


class Model(Layer):
    """Small regression network: Linear(3, 15) -> Relu -> Linear(15, 1)."""

    def __init__(self, lr=0.0001):
        """Build the layer stack; every Linear layer receives ``lr``."""
        self.lr = lr
        # Layers are created in network order, first to last.
        hidden = Linear(3,15, lr=self.lr)
        activation = Relu()
        head = Linear(15,1, lr=self.lr)
        self.layers = [hidden, activation, head]

    def forward(self,x):
        """Feed ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

for l in self.layers[::-1]:

# Instantiate the regression model with its default learning rate.
mm = Model()

def yf(x1,x2,x3):
    """Target function for the regression demo: ``2*x1 + 3*x2 + 4*x3 + 5``.

    Returns:
        A float32 NumPy array wrapping the single computed value.
    """
    value = 2 * x1 + 3*x2 + 4*x3 + 5
    return np.array([value], dtype=np.float32)

# Train the model `mm` on random samples of the target function `yf`.
loss = MSE()

for i in range(20000):
    # One random training point per step; each coordinate has its own range.
    x1 = np.random.random()*30
    x2 = np.random.random()*20
    x3 = np.random.random()*11
    y = mm(np.array([[x1,x2,x3]]))
    err = loss(y, yf(x1,x2,x3))

    # Backward pass: gradient of the loss first, then through the model.
    yb = loss.backward()
    yb = mm.backward(yb)
    if i % 100 == 0:
        print(err)
        # Evaluate the fixed validation point once and reuse the prediction
        # for both the printout and the loss, passing (prediction, target) in
        # the same order as the training call above. The printed values are
        # the same as with the previous double evaluation.
        val_target = yf(1,2,3)
        val_res = mm(np.array([[1,2,3]]))
        print('VAL Target:', val_target, 'Res:',
              val_res, 'Loss:',
              loss(val_res, val_target))
        print('---------')



...
---------
[[0.02075696]]
VAL Target: [25.] Res: [[23.61075629]] Loss: [[0.96499904]]
---------
[[0.24674584]]
VAL Target: [25.] Res: [[23.67659836]] Loss: [[0.87569595]]
---------
[[0.10659916]]
VAL Target: [25.] Res: [[23.60292783]] Loss: [[0.97590533]]
---------
[[0.68391024]]
VAL Target: [25.] Res: [[23.60518007]] Loss: [[0.97276132]]
---------
[[0.07534666]]
VAL Target: [25.] Res: [[23.61488175]] Loss: [[0.95927629]]
---------


#### Classifying MNIST with our custom network

Now let's solve a classification task on the MNIST dataset. We will use some PyTorch utilities to make it simpler.


from torchvision.datasets import MNIST
from torchvision import transforms

class Model(Layer):
    """MNIST classifier: Linear(784, 100) -> Relu -> Linear(100, 200) -> Relu -> Linear(200, 10)."""

    def __init__(self, lr=0.00001):
        """Build the layer stack; every Linear layer receives ``lr``."""
        self.lr = lr
        # Layers are created in network order, first to last.
        first = Linear(784,100, lr=self.lr)
        first_act = Relu()
        second = Linear(100,200, lr=self.lr)
        second_act = Relu()
        head = Linear(200,10, lr=self.lr)
        self.layers = [first, first_act, second, second_act, head]

    def forward(self,x):
        """Feed ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

for l in self.layers[::-1]:

# MNIST training script for the hand-written network.
# NOTE(review): this copy is incomplete -- the line that creates the dataset
# `ds` is missing, no backward calls are visible inside the loop, and
# `DataLoader` is not among the imports shown above this cell.
simple = transforms.Compose([
transforms.ToTensor(), # converts to [0,1] interval
])
# Batches of 2 samples; drop_last=True discards a final incomplete batch, so
# every batch matches the hard-coded 2 in `img.view(2,-1)` below.
ld = DataLoader(ds, batch_size=2, pin_memory=True, drop_last=True)

mm = Model()
loss = SoftmaxCrossentropyWithLogits()
_loss_avg = 0
# 5 passes over the data loader.
for e in range(5):
for i, (img, label) in enumerate(ld):
# Flatten each image of the batch into one row and convert to NumPy.
x = img.view(2,-1).numpy()

res = mm(x)
_loss = loss(res, label.numpy())
_loss_avg += _loss.mean() # accumulated loss; divided by 100 when printed

# Every 100 batches print the accumulated loss / 100 and reset it.
# NOTE(review): at i == 0 only one batch has been accumulated, so the first
# printed value of each pass is not a mean over 100 batches.
if i % 100 == 0:
print(_loss_avg/100)
_loss_avg = 0
print('---------')



# Display the first ten dataset samples next to the model's prediction.
for i in range(10):
    img, target = ds[i]

    # Plot the first channel of the image.
    plt.imshow(img[0])
    plt.show()

    # Flatten to a single row, run the model and take the highest score.
    x = img.view(1,-1).numpy()
    res = mm(x)[0]
    pred = np.argmax(res)
    print(f'target: {target} predicted: {pred}' )


And that's all. Don't forget that you can run this code in Google Colab by clicking the "Open in Colab" button.

### References

1. Backpropagation  [HTML]
2. Chain Rule  [HTML]