def backward_pass(parameters, cache, X, Y):
    """Compute gradients for a 2-layer network via backpropagation.

    Assumes a tanh activation in the hidden layer and an output layer
    whose loss/activation pair gives dZ2 = A2 - Y (e.g. sigmoid with
    cross-entropy loss) -- TODO confirm against forward pass.

    Parameters
    ----------
    parameters : dict
        Must contain 'W2', the (n_y, n_h) output-layer weights.
        ('W1' may be present but is not needed for these gradients.)
    cache : dict
        Forward-pass values: 'A1' (n_h, m) hidden activations and
        'A2' (n_y, m) output activations.
    X : ndarray of shape (n_x, m)
        Input data; one column per example.
    Y : ndarray of shape (n_y, m)
        Ground-truth labels.

    Returns
    -------
    dict
        Gradients 'dW1', 'db1', 'dW2', 'db2' with the same shapes as
        the corresponding parameters.
    """
    # Unpack only the values backpropagation actually needs.
    W2 = parameters['W2']
    A1 = cache['A1']
    A2 = cache['A2']

    m = X.shape[1]  # number of examples in the training set

    # Output layer: dZ2 = A2 - Y follows from the loss/activation pairing.
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    # keepdims=True prevents NumPy from collapsing to a rank-1 array (n,).
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer: (1 - A1^2) is the derivative of tanh at A1.
    dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    return grads