In pseudocode, the two variants differ only in how much of the training set goes into each update of theta. The stochastic (online) scheme updates theta after every single example,

repeat until convergence
{
    for i in train_set
    {
        theta := theta + speed * gradient of the cost on example i
    }
}

while the batch scheme accumulates the gradient over the whole training set and applies one update per pass,

repeat until convergence
{
    theta := theta + speed * sum of the per-example gradients over train_set
}


def batch_descent(A, Y, speed=0.001):
    theta = np.array(INITIAL_THETA.copy(), dtype=np.float32)
    previous_cost = 10 ** 6
    current_cost = cost_function(A, Y, theta)
    while np.abs(previous_cost - current_cost) > EPS:
        previous_cost = current_cost
        derivatives = [0] * len(theta)
        # ---------------------------------------------
        # Accumulate the gradient over the entire training set
        for j in range(len(theta)):
            summ = 0
            for i in range(len(Y)):
                summ += (Y[i] - A[i] @ theta) * A[i][j]
            derivatives[j] = summ
        # One update of theta per full pass over the data
        theta[0] += speed * derivatives[0]
        theta[1] += speed * derivatives[1]
        # ---------------------------------------------
        current_cost = cost_function(A, Y, theta)
        print("Batch cost:", current_cost)
        plt.plot(theta[0], theta[1], 'ro')
    return theta
def stochastic_descent(A, Y, speed=0.1):
    theta = np.array(INITIAL_THETA.copy(), dtype=np.float32)
    previous_cost = 10 ** 6
    current_cost = cost_function(A, Y, theta)
    while np.abs(previous_cost - current_cost) > EPS:
        previous_cost = current_cost
        # --------------------------------------
        # Instead of sweeping the whole set (for i in range(len(Y))),
        # pick a single random example per update
        i = np.random.randint(0, len(Y))
        derivatives = [0] * len(theta)
        for j in range(len(theta)):
            derivatives[j] = (Y[i] - A[i] @ theta) * A[i][j]
        theta[0] += speed * derivatives[0]
        theta[1] += speed * derivatives[1]
        # --------------------------------------
        current_cost = cost_function(A, Y, theta)
        print("Stochastic cost:", current_cost)
        plt.plot(theta[0], theta[1], 'ro')
    return theta
import time

import numpy as np
import matplotlib.pyplot as plt

TOTAL = 200
STEP = 0.25
EPS = 0.1
INITIAL_THETA = [9, 14]


def func(x):
    return 0.2 * x + 3


def generate_sample(total=TOTAL):
    x = 0
    while x < total * STEP:
        yield func(x) + np.random.uniform(-1, 1) * np.random.uniform(2, 8)
        x += STEP


def cost_function(A, Y, theta):
    return (Y - A @ theta).T @ (Y - A @ theta)


def batch_descent(A, Y, speed=0.001):
    theta = np.array(INITIAL_THETA.copy(), dtype=np.float32)
    previous_cost = 10 ** 6
    current_cost = cost_function(A, Y, theta)
    while np.abs(previous_cost - current_cost) > EPS:
        previous_cost = current_cost
        derivatives = [0] * len(theta)
        # ---------------------------------------------
        # Accumulate the gradient over the entire training set
        for j in range(len(theta)):
            summ = 0
            for i in range(len(Y)):
                summ += (Y[i] - A[i] @ theta) * A[i][j]
            derivatives[j] = summ
        # One update of theta per full pass over the data
        theta[0] += speed * derivatives[0]
        theta[1] += speed * derivatives[1]
        # ---------------------------------------------
        current_cost = cost_function(A, Y, theta)
        print("Batch cost:", current_cost)
        plt.plot(theta[0], theta[1], 'ro')
    return theta


def stochastic_descent(A, Y, speed=0.1):
    theta = np.array(INITIAL_THETA.copy(), dtype=np.float32)
    previous_cost = 10 ** 6
    current_cost = cost_function(A, Y, theta)
    while np.abs(previous_cost - current_cost) > EPS:
        previous_cost = current_cost
        # --------------------------------------
        # Instead of sweeping the whole set (for i in range(len(Y))),
        # pick a single random example per update
        i = np.random.randint(0, len(Y))
        derivatives = [0] * len(theta)
        for j in range(len(theta)):
            derivatives[j] = (Y[i] - A[i] @ theta) * A[i][j]
        theta[0] += speed * derivatives[0]
        theta[1] += speed * derivatives[1]
        # --------------------------------------
        current_cost = cost_function(A, Y, theta)
        print("Stochastic cost:", current_cost)
        plt.plot(theta[0], theta[1], 'ro')
    return theta


X = np.arange(0, TOTAL * STEP, STEP)
Y = np.array([y for y in generate_sample(TOTAL)])

# Min-max normalization of X so that both descent variants converge
# with the chosen learning rates
X = (X - X.min()) / (X.max() - X.min())

A = np.empty((TOTAL, 2))
A[:, 0] = 1
A[:, 1] = X

# Closed-form solution via the pseudo-inverse, used as a reference
theta = np.linalg.pinv(A).dot(Y)
print(theta, cost_function(A, Y, theta))

start = time.perf_counter()
theta_stochastic = stochastic_descent(A, Y, 0.001)
print("St:", time.perf_counter() - start, theta_stochastic)

start = time.perf_counter()
theta_batch = batch_descent(A, Y, 0.001)
print("Btch:", time.perf_counter() - start, theta_batch)

# Display the theta trajectory points accumulated during descent
plt.show()
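As a cross-check (my own addition, not part of the original listing), the pseudo-inverse solution above is the same as solving the normal equations directly; this sketch assumes A, Y and cost_function from the listing:

# Normal-equation solution: for a full-rank A this coincides with
# np.linalg.pinv(A).dot(Y) used above.
theta_ne = np.linalg.solve(A.T @ A, A.T @ Y)
print("Normal equations:", theta_ne, cost_function(A, Y, theta_ne))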


For example: second-order gradient descent (where the learning rate is replaced by the inverse Hessian), BFGS, conjugate gradients, Newton's method, and a huge number of other techniques. Optimization is, in general, a separate and very broad class of problems. Still, there is a presentation (and an accompanying video) by Yann LeCun in which he argues that you often don't need to bother and can simply use gradient methods. And even though that presentation dates back to 2007, many recent experiments with large ANNs have also relied on gradient methods.
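As a rough sketch of what "just hand the problem to a library optimizer" looks like (my own illustration, not from the article), the same least-squares cost can be given to SciPy's BFGS implementation; A, Y, cost_function and INITIAL_THETA are assumed to come from the listing above:

from scipy.optimize import minimize

# Minimize the same cost with a quasi-Newton method instead of
# hand-written gradient descent.
result = minimize(lambda t: cost_function(A, Y, t),
                  np.array(INITIAL_THETA, dtype=np.float64),
                  method='BFGS')
print("BFGS:", result.x, result.fun)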
where k is the number of elements in the subsample (the mini-batch). Vectorization therefore suits the mini-batch method well. As last time, I will write everything out explicitly in vector form. Note that the first matrix is transposed:

θ := θ + α · Aᵀ(Y − Aθ),    A: n×p,  Y: n×1,  θ: p×1,

where n is the number of elements and p is the number of features. This is much better. An expression identical to it is also commonly encountered, with the sign of the residual flipped:

θ := θ − α · Aᵀ(Aθ − Y)
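A minimal sketch of that vectorized update in code (my own illustration; the function name, the batch size k and the learning rate are arbitrary choices, while A, Y, INITIAL_THETA, EPS and cost_function come from the listing above):

def minibatch_descent(A, Y, speed=0.001, k=10):
    theta = np.array(INITIAL_THETA, dtype=np.float64)
    previous_cost = 10 ** 6
    current_cost = cost_function(A, Y, theta)
    while np.abs(previous_cost - current_cost) > EPS:
        previous_cost = current_cost
        # Random mini-batch of k rows (sampled with replacement)
        idx = np.random.randint(0, len(Y), size=k)
        A_k, Y_k = A[idx], Y[idx]
        # Vectorized update: theta := theta + speed * A_k.T @ (Y_k - A_k @ theta)
        theta += speed * A_k.T @ (Y_k - A_k @ theta)
        current_cost = cost_function(A, Y, theta)
    return theta

With k = 1 this reduces to the stochastic update above, and with k = len(Y) (and a sequential rather than random choice of rows) to the batch update.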
Source: https://habr.com/ru/post/308604/