Adagrad
Momentum and Nesterov momentum update all components of $x$ with the same learning rate. The adaptive subgradient method, or Adagrad, instead adapts a learning rate for each component of $x$: it dulls the influence of parameters with consistently high gradients, thereby increasing the influence of parameters with infrequent updates.
The Adagrad update step is: \begin{align} x_i^{(k+1)} = x_i^{(k)} - \frac{\alpha}{\epsilon + \sqrt{s_i^{(k)}}}\, g_i^{(k)} \end{align}
where $s^{(k)}$ is a vector whose $i^{th}$ entry is the sum of the squares of the partials, with respect to $x_i$, up to time step $k$, \begin{align} s_i^{(k)} = \sum_{j=1}^{k}\left(g_i^{(j)}\right)^2 \end{align} and $\epsilon$ is a small value, on the order of $1 \times 10^{-8}$, that prevents division by zero. Adagrad is far less sensitive to the learning rate parameter $\alpha$, which is typically left at a default value of 0.01.
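To make the per-component update concrete, here is a minimal sketch (separate from the implementation below) of a single Adagrad step on the test function $x_1^2 + x_2^2$; the names alpha, eps, x, s and g are illustrative stand-ins for the symbols in the equations above.
import numpy as np
# Illustrative single Adagrad step on f(x) = x1^2 + x2^2.
alpha, eps = 0.01, 1e-8
x = np.array([2.0, 2.0])                 # current iterate x^(k)
s = np.zeros_like(x)                     # running sum of squared partials
g = 2 * x                                # gradient of x1^2 + x2^2 at x
s += g**2                                # s_i grows by (g_i^(k))^2
x = x - alpha / (eps + np.sqrt(s)) * g   # per-component Adagrad update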
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Visualize the function (x**2+y**2)
x1 = np.linspace(-2,2,30)
x2 = np.linspace(-2,2,30)
# Creating 2-D grid of features
[X, Y] = np.meshgrid(x1,x2)
Z = X**2 + Y**2
figure = plt.figure(figsize=(8,6))
axis = figure.add_subplot(projection='3d')
plt.title("3D Visualization of the function x^2+y^2")
axis.plot_surface(X, Y, Z, cmap='viridis')
plt.show()
plt.figure(figsize=(6,6))
plt.title("Contour plot of the function x^2+y^2")
plt.contour(X,Y,Z,55,cmap='viridis')
plt.show()
def fun(x, a):
    pow = a
    if pow == 0:
        return x[0]**2 + x[1]**2
    else:
        h = 0.00001
        x1 = np.array([x[0]+h, x[1]])
        x2 = np.array([x[0], x[1]+h])
        # The gradient vector, via forward differences
        g = np.array([(fun(x1,0)-fun(x,0))/h, (fun(x2,0)-fun(x,0))/h])
        return g
def adagrad(x_init, alpha, eps, eps1, inter):
    iter = 0
    # Initializing lists to record x, the function value, the gradients, the accumulated
    # gradient term, and the decreasing learning rate.
    arr_x = [] ; arr_f = [] ; arr_g = [] ; arr_s = []; arr_lr = []
    # assign values to x, fun(x), and gradient(x)
    x, f, g = x_init, fun(x_init,0), fun(x_init,1)
    # Normalized gradient direction : d
    d = np.array(g/np.linalg.norm(g))
    # Appending the values of our first iteration to the created lists.
    arr_x.append(x); arr_f.append(f); arr_g.append(g)
    # Check if our starting point is the minimum of the function
    if (np.linalg.norm(g)==0):
        print(" The min value of the function is:",f, "for value of x:",x)
        return x
    else:
        # Start of the for loop, which runs for a fixed number of iterations
        for i in range(inter):
            iter += 1
            # Adagrad accumulation: sum of the squares of the gradients up to g^(k)
            e = np.square(arr_g)
            e = e.sum(axis=1)
            s = np.sum(e)
            # Recording the decreasing learning rate, for demonstration in the graphs
            arr_s.append(np.sqrt(s)); arr_lr.append(alpha/(eps1 + np.sqrt(s)))
            # Update x in the negative gradient direction with a shrinking effective learning rate
            x = x - alpha/(eps1 + np.sqrt(s))*arr_g[-1]
            # recalculate fun(x) and grad(x), based on the new value of x
            f, g = fun(x,0), fun(x,1)
            # normalized gradient direction
            d = np.array(g/np.linalg.norm(g))
            # append the lists with new values of x, fun(x) and grad(x)
            arr_x.append(x); arr_f.append(f); arr_g.append(g)
        print("iter",iter,"f:",f)
        return (arr_x, arr_f, arr_s, arr_lr, arr_g)
x = [2,2]
inter = 10000
lr = 0.01
eps = 1e-4
eps1 = 1e-8
arr_x,arr_f,arr_s,arr_lr,arr_g = adagrad(x,lr,eps,eps1,inter)
print("The minimum point of the function:",arr_x[-1])
iter 10000 f: 1.297147588407563
The minimum point of the function: [0.80534079 0.80534079]
One drawback of Adagrad is that the components of $s$, the accumulated sum of squared gradients, are strictly non-decreasing. The accumulated sum causes the effective learning rate to decrease during training, often becoming infinitesimally small before convergence.
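As a quick illustrative sketch of this effect (separate from the run above), the effective step size $\alpha/(\epsilon + \sqrt{s})$ decays roughly like $1/\sqrt{s}$ as the accumulated sum grows:
import numpy as np
# Illustrative only: the effective Adagrad step size keeps shrinking as the
# accumulated sum s grows, regardless of how far the iterate is from the minimizer.
alpha, eps = 0.01, 1e-8
for s in [1, 10, 100, 1000, 10000]:
    print(s, alpha / (eps + np.sqrt(s)))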
# Subplots to visualize the decreasing learning rate, decreasing function value and non-decreasing gradient sum.
fig, (ax1, ax2,ax3) = plt.subplots(1,3,figsize=(15,4))
x, y, lr, f = range(len(arr_s)) ,arr_s, arr_lr, arr_f[1:]
fig.suptitle('Capturing the nuances of Adagrad')
ax1.plot(x, lr,'tab:blue')
ax1.set_title('Decreasing learning rate: "lr"')
ax2.plot(x, f,'tab:green')
ax2.set_title('Decreasing function value: "f"')
ax3.plot(x, y, 'tab:red')
ax3.set_title('Increasing summation of gradients: "s"')
x1, x2 = np.linspace(0,2.5,500), np.linspace(0,2.5,500)
arr = pd.DataFrame(arr_x)
# Creating 2-D grid of features
[X, Y] = np.meshgrid(x1,x2)
Z = X**2 + Y**2
a,b = arr[0],arr[1]
plt.figure(figsize=(8,7))
plt.scatter(a,b)
plt.contour(X,Y,Z,90)
plt.plot(a,b,color='#bcbd22')
plt.show()
RMSProp
RMSProp extends Adagrad to avoid the effect of a monotonically decreasing learning rate. RMSProp maintains a decaying average of squared gradients, which is updated as follows:
\begin{align} \hat{s}^{(k+1)} = \gamma \hat{s}^{(k)} + (1-\gamma)\left(g^{(k)} \odot g^{(k)}\right) \end{align}
where $\odot$ denotes the element-wise product and the decay $\gamma \in [0,1]$ is typically close to 0.9. The decaying average of past squared gradients can be substituted into the RMSProp update equation:
\begin{align} x_i^{(k+1)} = x_i^{(k)} - \frac{\alpha}{\epsilon + \sqrt{\hat{s}_i^{(k)}}}\, g_i^{(k)} \end{align} \begin{align} = x_i^{(k)} - \frac{\alpha}{\epsilon + \text{RMS}(g_i)}\, g_i^{(k)} \end{align}
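As with Adagrad, a minimal sketch of a single RMSProp step may help before the full implementation; it keeps the per-component form of the equation above, and the names alpha, gamma, eps, x, s_hat and g are illustrative.
import numpy as np
# Illustrative single RMSProp step with a per-component decaying average.
alpha, gamma, eps = 0.01, 0.9, 1e-8
x = np.array([2.0, 2.0])
s_hat = np.zeros_like(x)                       # decaying average of squared partials
g = 2 * x                                      # gradient of x1^2 + x2^2 at x
s_hat = gamma * s_hat + (1 - gamma) * g**2     # element-wise squared gradient
x = x - alpha / (eps + np.sqrt(s_hat)) * g
Note that the implementation below accumulates the scalar np.dot(g, g) instead of a per-component vector; on this symmetric test function the two behave similarly, only rescaling the effective step.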
def RMS_prop(x_init, alpha, gamma, eps, eps1, inter):
    iter = 0
    # Initializing lists to record x, the function value, the gradients, the decaying
    # average of squared gradients, and the learning rate.
    arr_x = [] ; arr_f = [] ; arr_g = [] ; arr_s = []; arr_lr = []
    # assign values to x, fun(x), and gradient(x)
    x, f, g = x_init, fun(x_init,0), fun(x_init,1)
    # Normalized gradient direction : d
    d = np.array(g/np.linalg.norm(g))
    # Appending the values of our first iteration to the created lists.
    arr_x.append(x); arr_f.append(f); arr_g.append(g)
    s = np.zeros(len(x))
    s = gamma*s + (1-gamma)*np.dot(arr_g[-1], arr_g[-1])
    arr_s.append(s)
    arr_lr.append(alpha/(eps1 + np.sqrt(s)))
    # Check if our starting point is the minimum of the function
    if (np.linalg.norm(g)==0):
        print(" The min value of the function is:",f, "for value of x:",x)
        return x
    else:
        # Start of the while loop, which runs until the function value falls below the tolerance eps
        print(f)
        while abs(f) > eps:
            iter += 1
            # decaying average of squared gradients (replaces Adagrad's growing sum)
            s = gamma*s + (1-gamma)*np.dot(arr_g[-1], arr_g[-1])
            # Update x in the negative gradient direction, scaled by the adaptive learning rate
            x = x - alpha/(eps1 + np.sqrt(s))*g
            # recalculate fun(x) and grad(x), based on the new value of x
            f, g = fun(x,0), fun(x,1)
            # append the lists with new values of x, fun(x), grad(x), s and the learning rate
            arr_x.append(x); arr_f.append(f); arr_g.append(g); arr_s.append(s); arr_lr.append(alpha/(eps1 + np.sqrt(s)))
        print("iter",iter,"f:",f)
        return (arr_x, arr_f, arr_s, arr_lr, arr_g)
x = [2,2]
#inter = 10000
gamma = 0.9
lr = 0.01
eps = 1e-4
eps1 = 1e-8
arr_x,arr_f,arr_s,arr_lr,arr_g = RMS_prop(x,lr,gamma,eps,eps1,inter)
print("The minimum point of the function:",arr_x[-1])
8
iter 316 f: 9.917668805573655e-05
The minimum point of the function: [0.0070419 0.0070419]
# Subplots to visualize the learning rate, the decreasing function value and the decaying average of squared gradients.
fig, (ax1, ax2,ax3) = plt.subplots(1,3,figsize=(15,4))
x, y, lr, f = range(len(arr_s)) ,arr_s, arr_lr, arr_f
fig.suptitle('Capturing the nuances of RMS_prop')
ax1.plot(x, lr,'tab:blue')
ax1.set_title('learning rate: "lr"')
ax2.plot(x, f,'tab:green')
ax2.set_title('Decreasing function value: "f"')
ax3.plot(x, y, 'tab:red')
ax3.set_title('Decaying average of sum of squares of partials: "s"')
x1,x2,arr = np.linspace(0,2.5,500), np.linspace(0,2.5,500),pd.DataFrame(arr_x)
# Creating 2-D grid of features
[X, Y] = np.meshgrid(x1,x2)
Z = X**2 + Y**2
a,b = arr[0],arr[1]
plt.figure(figsize=(8,7))
plt.scatter(a,b)
plt.contour(X,Y,Z,90)
plt.plot(a,b,color='#bcbd22')
plt.show()
Adadelta
Adadelta is another method for overcoming Adagrad's monotonically decreasing learning rate. In addition to the decaying average of squared gradients, it maintains an exponentially decaying average of squared updates.
\begin{align} x_i^{(k+1)} = x_i^{(k)} - \frac{\text{RMS}(\Delta x_i)}{\epsilon + \text{RMS}(g_i)}\, g_i^{(k)} \end{align} which eliminates the learning rate parameter entirely.
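Before the implementation, here is a minimal sketch of a single Adadelta step under the same conventions; the names gamma, eps, s, u and delta_x are illustrative, and eps guards against division by zero in both numerator and denominator, matching the code below.
import numpy as np
# Illustrative single Adadelta step: no learning rate parameter is required.
gamma, eps = 0.9, 1e-8
x = np.array([2.0, 2.0])
s = np.zeros_like(x)               # decaying average of squared gradients
u = np.zeros_like(x)               # decaying average of squared updates
g = 2 * x                          # gradient of x1^2 + x2^2 at x
s = gamma * s + (1 - gamma) * g**2
delta_x = -(np.sqrt(u) + eps) / (np.sqrt(s) + eps) * g
u = gamma * u + (1 - gamma) * delta_x**2
x = x + delta_x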
def adadelta(x_init, gamma, eps, eps1):
    iter = 0
    # Initializing lists to record x, the function value, the gradients, the decaying average
    # of squared gradients, and the decaying average of squared updates.
    arr_x = [] ; arr_f = [] ; arr_g = [] ; arr_s = []; arr_u = []
    # assign values to x, fun(x), and gradient(x)
    x, f, g = x_init, fun(x_init,0), fun(x_init,1)
    # Appending the values of our first iteration to the created lists.
    arr_x.append(x); arr_f.append(f); arr_g.append(g)
    s, u = np.zeros(len(x)), np.zeros(len(x))
    # Exponentially decaying average of squared gradients
    s = gamma*s + (1-gamma)*np.dot(arr_g[-1], arr_g[-1])
    # The first update del_x is initialized to zero
    del_x = np.zeros(len(x))
    print("del_x",del_x)
    # Exponentially decaying average of squared updates
    u = gamma*u + (1-gamma)*np.dot(del_x, del_x)
    arr_s.append(s); arr_u.append(u)
    # Check if our starting point is the minimum of the function
    if (np.linalg.norm(g)==0):
        print(" The min value of the function is:",f, "for value of x:",x)
        return x
    else:
        # Start of the while loop, which runs until the function value falls below the tolerance eps
        print(f)
        while abs(f) > eps:
            iter += 1
            x = x + del_x
            print(x)
            # update value of fun(x), gradient(x)
            f, g = fun(x,0), fun(x,1)
            # Appending the values calculated in the previous 2 steps
            arr_x.append(x); arr_f.append(f); arr_g.append(g)
            # decay steps of RMSProp and Adadelta
            s = gamma*s + (1-gamma)*np.dot(arr_g[-1], arr_g[-1])
            del_x = - np.divide((np.sqrt(u) + eps1), (np.sqrt(s) + eps1)) * g
            print("del_x",del_x)
            u = gamma*u + (1-gamma)*np.dot(del_x, del_x)
            arr_s.append(s); arr_u.append(u)
        print("iter",iter,"f:",f)
        return (arr_x, arr_f, arr_s, arr_u)