%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn.linear_model as linear_model
np.random.seed(1337)
from timeit import timeit
from optimizers import gradient_descent, gss
from plotters import error_plot, convergence_plot, kwargs, setup_layout
setup_layout()
Below is a naive implementation of Newton's iteration that we analyzed in the last lecture.
def newton_method(init, steps, grad, hessian, num_to_keep=None):
"""Newton iteration with full Hessian inverse.
Parameters
----------
init : array
starting point
steps : list of floats
step size schedule for the algorithm
grad : function
mapping arrays to arrays of same shape
hessian : function
mapping 1d arrays to 2d arrays
num_to_keep : integer, optional
number of points to keep
Returns
-------
List of points computed by Newton iteration. Length of the
list is determined by `num_to_keep`.
"""
xs = [init]
for step in steps:
Hinv = np.linalg.pinv(hessian(xs[-1]))
xs.append(xs[-1] - step * Hinv.dot(grad(xs[-1])))
if num_to_keep:
xs = xs[-num_to_keep:]
return xs
As a sanity check, Newton's method should solve a quadratic objective in a single step; in particular, it should solve a least squares problem in one iteration.
def least_squares(A, b, x):
"""Least squares objective."""
return (0.5/len(A)) * np.linalg.norm(A.dot(x)-b)**2
def least_squares_gradient(A, b, x):
"""Gradient of least squares objective at x."""
return A.T.dot(A.dot(x)-b)/len(A)
def least_squares_hessian(A, x):
"""Hessian of least squares objective."""
return A.T.dot(A)/len(A)
m, n = 1000, 100
A = np.random.normal(0, 1, (m, n))
x_opt = np.random.normal(0, 1, n)
noise = np.random.normal(0, 0.1, m)
b = A.dot(x_opt) + noise
objective = lambda x: least_squares(A, b, x)
gradient = lambda x: least_squares_gradient(A, b, x)
hessian = lambda x: least_squares_hessian(A, x)
x0 = np.random.normal(0, 1, n)
xs = newton_method(x0, [1], gradient, hessian)
errors = [objective(x) for x in xs]
error_plot(errors)
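The plot confirms the one-step claim. To see why, plug the gradient and Hessian of the least squares objective into the Newton update with step size $1$ (assuming $A^\top A$ is invertible):
$$x_1 = x_0 - \left(\tfrac{1}{m}A^\top A\right)^{-1}\tfrac{1}{m}A^\top(Ax_0 - b) = x_0 - x_0 + (A^\top A)^{-1}A^\top b = (A^\top A)^{-1}A^\top b,$$
which is the least squares solution no matter where we start.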
Let's move on to a less trivial example that we'll stick to for the rest of the lecture: logistic regression. A logistic model makes predictions based on a sigmoid function applied to a linear function of the data.
def sigmoid(x):
"""Sigmoid function."""
return 1./(1.+np.exp(-x))
xs = np.linspace(-5, 5, 100)
plt.plot(xs, sigmoid(xs), **kwargs);
def logistic_model(weights, data):
"""Logistic model."""
return sigmoid(data.dot(weights))
def log_likelihood(weights, data, labels):
"""Normalized negative log likelihood."""
scores = np.dot(data, weights)
return -np.mean(labels*scores - np.log(1 + np.exp(scores)))
def log_likelihood_gradient(weights, data, labels):
"""Gradient of negative log likelihood."""
predictions = logistic_model(weights, data)
return -data.T.dot(labels - predictions)/len(data)
def log_likelihood_hessian(weights, data, labels):
"""Hessian of negative log likelihood."""
predictions = logistic_model(weights, data)
diag = np.diag(predictions * (1 - predictions))
return data.T.dot(diag.dot(data))/len(data)
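For reference, writing $X$ for the data matrix, $y$ for the label vector, and $\sigma$ for the elementwise sigmoid, the two functions above implement the standard formulas for the normalized negative log likelihood $\ell$:
$$\nabla \ell(w) = -\tfrac{1}{m}\, X^\top\big(y - \sigma(Xw)\big), \qquad \nabla^2 \ell(w) = \tfrac{1}{m}\, X^\top \mathrm{diag}\big(\sigma(Xw)(1-\sigma(Xw))\big)\, X.$$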
m, n = 1000, 100
data = np.random.normal(0, 1, (m, n))
wopt = np.random.normal(0, 1, n)
labels = (data.dot(wopt) > 0).astype(float)
gradient = lambda w: log_likelihood_gradient(w, data, labels)
hessian = lambda w: log_likelihood_hessian(w, data, labels)
init = np.zeros(n)
Let's get a baseline solution out of sklearn's logistic model.
cls = linear_model.LogisticRegression()
cls.fit(data, labels)
weights = np.reshape(cls.coef_, cls.coef_.shape[1])
baseline = log_likelihood(weights, data, labels)
baseline
ws_gd = gradient_descent(init, np.ones(1000), gradient)
ws_newton = newton_method(init, np.ones(10), gradient, hessian)
error_plot([log_likelihood(w, data, labels) for w in ws_gd])
plt.plot(range(len(ws_gd)), [baseline]*len(ws_gd),
label='sklearn', **kwargs)
_ = plt.legend()
error_plot([log_likelihood(w, data, labels) for w in ws_newton])
plt.plot(range(len(ws_newton)), [baseline]*len(ws_newton),
label='sklearn', **kwargs)
plt.legend();
We'll now see how to speed up Newton's iteration with a simple trick. Instead of computing a full pseudo-inverse, we'll solve a set of linear equations approximately. Recall that the system $Hx=g$ has the solution $x=H^{-1}g$.
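As a quick sanity check (a sketch using the current logistic regression objective, assuming the Hessian at init is invertible), solving the linear system directly gives the same Newton direction as multiplying by the pseudo-inverse:
H = hessian(init)
g = gradient(init)
# np.linalg.solve avoids forming the inverse explicitly
np.allclose(np.linalg.solve(H, g), np.linalg.pinv(H).dot(g))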
def newton_method_lsq(init, steps, grad, hessian, least_squares,
num_to_keep=None):
"""Newton iteration with least squares solver.
Parameters
----------
init : array
starting point
steps : list of floats
step size schedule for the algorithm
grad : function
mapping arrays to arrays of same shape
hessian : function
mapping 1d arrays to 2d arrays
least_squares : function
solver for linear equations
num_to_keep : integer, optional
number of points to keep
Returns
-------
List of points computed by Newton iteration. Length of the
list is determined by `num_to_keep`.
"""
xs = [init]
for step in steps:
# Use least squares solver for Hessian-gradient product
direction = least_squares(hessian(xs[-1]), grad(xs[-1]))
xs.append(xs[-1] - step * direction)
if num_to_keep:
xs = xs[-num_to_keep:]
return xs
If we use a full least squares solve, this is equivalent to the pseudo-inverse we had before. This is why Newton's method for logistic regression is sometimes called iteratively reweighted least squares.
lsq = lambda A, b: np.linalg.lstsq(A, b, rcond=None)[0]
ws_newton_lsq = newton_method_lsq(init, np.ones(10), gradient, hessian, lsq)
error_plot([log_likelihood(w, data, labels) for w in ws_newton_lsq])
plt.plot(range(len(ws_newton_lsq)), [baseline]*len(ws_newton_lsq),
label='sklearn', **kwargs)
_ = plt.legend()
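To make the "reweighted least squares" interpretation concrete, here is a sketch (an illustration added at this point, not part of the pipeline above): one Newton step with step size one coincides with solving a weighted least squares problem with weights $p(1-p)$ and working response $z = Xw + D^{-1}(y-p)$, assuming the data matrix has full column rank.
def irls_step(w, X, y):
    """One step of iteratively reweighted least squares (sketch)."""
    p = logistic_model(w, X)
    d = p * (1 - p)                      # weights
    z = X.dot(w) + (y - p) / d           # working response
    # weighted least squares: minimize ||sqrt(d) * (X w_new - z)||^2
    return np.linalg.lstsq(X * np.sqrt(d)[:, None], np.sqrt(d) * z, rcond=None)[0]
# One IRLS step from `init` should match one Newton step with step size 1.
np.allclose(irls_step(init, data, labels),
            newton_method(init, [1], gradient, hessian)[-1])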
Let's plug in an approximate solver to see what happens. All the tools for linear equations we've seen in previous lectures (gradient descent, conjugate gradient, etc.) could be used here. We'll just stick to gradient descent to keep it simple.
def lsq_gd(A, b):
"""Approximate least squares with gradient descent"""
gradient = lambda x: least_squares_gradient(A, b, x)
return gradient_descent(b, 1000*np.ones(10), gradient)[-1]
ws_newton_lsq_gd = newton_method_lsq(init, np.ones(500), gradient, hessian, lsq_gd)
error_plot([log_likelihood(w, data, labels) for w in ws_newton_lsq_gd])
plt.plot(range(len(ws_newton_lsq_gd)), [baseline]*len(ws_newton_lsq_gd),
label='sklearn', **kwargs)
_ = plt.legend()
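For comparison, here is a sketch of the same idea with conjugate gradient as the inner solver (this assumes scipy is available; it is not part of the lecture's optimizers module). Since the Hessian of the logistic loss is positive semidefinite, conjugate gradient is a natural fit for the inner system.
import scipy.sparse.linalg as splinalg
def lsq_cg(A, b):
    """Approximately solve A x = b with a few conjugate gradient iterations."""
    x, _ = splinalg.cg(A, b, maxiter=10)
    return x
ws_newton_lsq_cg = newton_method_lsq(init, np.ones(10), gradient, hessian, lsq_cg)
log_likelihood(ws_newton_lsq_cg[-1], data, labels)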
It looks like we didn't gain much compared with plain gradient descent. The number of iterations to reach the baseline got smaller, but each iteration is now costlier. Nonetheless, this idea gives rise to a set of interesting heuristics, such as the BFGS algorithm below.
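The implementation below follows the standard BFGS updates. With $s_k = x_{k+1} - x_k$ and $y_k = \nabla f(x_{k+1}) - \nabla f(x_k)$, the Hessian estimate and its inverse are updated as
$$H_{k+1} = H_k + \frac{y_k y_k^\top}{y_k^\top s_k} - \frac{H_k s_k s_k^\top H_k}{s_k^\top H_k s_k}, \qquad H_{k+1}^{-1} = \left(I - \frac{s_k y_k^\top}{y_k^\top s_k}\right) H_k^{-1} \left(I - \frac{y_k s_k^\top}{y_k^\top s_k}\right) + \frac{s_k s_k^\top}{y_k^\top s_k}.$$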
def bfgs(init, grad, hessian, line_search, num_steps, num_to_keep=None):
"""BFGS algorithm (toy implementation).
Parameters
----------
init : array
starting point
grad : function
mapping arrays to arrays of same shape
hessian : function
mapping 1d arrays to 2d arrays
line_search : function
method to determine step size
num_steps : number
number of iterations
num_to_keep : integer, optional
number of points to keep
Returns
-------
List of points computed by the BFGS iteration. Length of the
list is determined by `num_to_keep`.
"""
xs = [init]
I = np.eye(len(init))
Hs = [hessian(init)]
Hinvs = [I]
for _ in range(num_steps):
# update point
direction = Hinvs[-1].dot(grad(xs[-1]))
step = line_search(xs[-1], direction)
xs.append(xs[-1] + step * direction)
s = xs[-1] - xs[-2]
y = grad(xs[-1]) - grad(xs[-2])
ys = np.dot(y, s)
ssT = np.outer(s, s)
        # update H
H = Hs[-1] + np.outer(y, y)/ys
H -= Hs[-1].dot(ssT.dot(Hs[-1]))/np.dot(s, Hs[-1].dot(s))
Hs.append(H)
# update Hinv
left = I - np.outer(s, y)/ys
right = I - np.outer(y, s)/ys
Hinvs.append(left.dot(Hinvs[-1]).dot(right) + ssT/ys)
if num_to_keep:
xs = xs[-num_to_keep:]
return xs
Let's first try this out with a constant step size.
def line_search(x, d):
"""Constant step size."""
return -0.1
ws_bfgs = bfgs(init, gradient, hessian, line_search, 100)
error_plot([log_likelihood(w, data, labels) for w in ws_bfgs])
plt.plot(range(len(ws_bfgs)), [baseline]*len(ws_bfgs),
label='sklearn', **kwargs)
_ = plt.legend()
Now let's pick the step size with the golden section search algorithm we discussed in a previous lecture.
def line_search(x, d):
return gss(lambda step: log_likelihood(x+step*d, data, labels), -1, 1)
ws_bfgs = bfgs(init, gradient, hessian, line_search, 10)
error_plot([log_likelihood(w, data, labels) for w in ws_bfgs])
plt.plot(range(len(ws_bfgs)), [baseline]*len(ws_bfgs),
label='sklearn', **kwargs)
_ = plt.legend()
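For reference, a golden section search along the lines of the imported gss could be sketched as follows (the actual implementation in optimizers may differ in details such as the stopping rule):
def gss_sketch(f, a, b, tol=1e-3):
    """Golden section search for a minimizer of a unimodal f on [a, b] (sketch)."""
    golden = (np.sqrt(5) - 1) / 2
    c, d = b - golden * (b - a), a + golden * (b - a)
    while abs(b - a) > tol:
        # keep the sub-bracket that must contain the minimizer
        if f(c) < f(d):
            b = d
        else:
            a = c
        c, d = b - golden * (b - a), a + golden * (b - a)
    return (a + b) / 2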
We see that BFGS with line search gets pretty close to Newton's method in iteration count, but each iteration is now much cheaper (in terms of the data dimension).
Recall that we can implement the Hessian inverse (approximately or exactly) by just using Hessian vector products. This follows from the Krylov methods for solving linear equations.
This raises the question of how we can, in general, compute Hessian-vector products for possibly complicated functions for which we have no exact formula for the Hessian.
Luckily, this problem is also solved by automatic differentiation. Let's import autograd first and define a function. For simplicity we'll reuse the log-likelihood we saw before.
import autograd.numpy as np
from autograd import grad
np.random.seed(1337)
def log_likelihood(weights, data, labels):
"""Normalized negative log likelihood."""
scores = np.dot(data, weights)
return -np.mean(labels*scores - np.log(1 + np.exp(scores)))
m, n = 1000, 100
data = np.random.normal(0, 1, (m, n))
wopt = np.random.normal(0, 1, n)
labels = (data.dot(wopt) > 0).astype(float)
Here's now how to get Hessian vector products with just two invocations of automatic differentiation.
objective = lambda w: log_likelihood(w, data, labels)
vector = np.random.normal(0, 1, n)
hvprod = grad(lambda x: np.dot(vector, grad(objective)(x)))
Let's verify that hvprod correctly implements the product of the Hessian at a given point with vector.
def log_likelihood_hessian(weights, data, labels):
"""Hessian of negative log likelihood."""
predictions = logistic_model(weights, data)
diag = np.diag(predictions * (1 - predictions))
return data.T.dot(diag.dot(data))/len(data)
H = log_likelihood_hessian(wopt, data, labels)
np.allclose(hvprod(wopt), H.dot(vector))
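Combining the two ideas, we can compute a Newton direction without ever forming the Hessian. The sketch below (assuming scipy is available) wraps the Hessian-vector product in a LinearOperator and hands it to conjugate gradient:
from scipy.sparse.linalg import LinearOperator, cg
def hvp_at(x):
    """Return a function v -> H(x) v built from two passes of autograd."""
    return lambda v: grad(lambda w: np.dot(grad(objective)(w), v))(x)
g = grad(objective)(wopt)
op = LinearOperator((n, n), matvec=hvp_at(wopt), dtype=float)
direction, info = cg(op, g, maxiter=100)
np.linalg.norm(H.dot(direction) - g)  # residual of the Newton system, should be small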
Let's see how this all plays out on a more realistic data set, the UCI adult data set. We'll first preprocess the data in a standard way.
import pandas as pd
import sklearn as skl
import sklearn.preprocessing as preprocessing
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
import sklearn.tree as tree
features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
"Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
"Hours per week", "Country", "Target"]
# Source: https://archive.ics.uci.edu/ml/datasets/adult
original_train = pd.read_csv("adult.data.txt", names=features, sep=r'\s*,\s*', engine='python', na_values="?")
original_test = pd.read_csv("adult.test.txt", names=features, sep=r'\s*,\s*', engine='python', na_values="?", skiprows=1)
num_train = len(original_train)
original = pd.concat([original_train, original_test])
labels = original['Target']
labels = labels.replace('<=50K', 0).replace('>50K', 1)
labels = labels.replace('<=50K.', 0).replace('>50K.', 1)
# Redundant column
del original["Education"]
del original["Target"]
def data_transform(df):
binary_data = pd.get_dummies(df)
feature_cols = binary_data[binary_data.columns[:-2]]
scaler = preprocessing.StandardScaler()
data = pd.DataFrame(scaler.fit_transform(feature_cols), columns=feature_cols.columns)
return data
data = data_transform(original)
train_data = data[:num_train]
train_labels = labels[:num_train]
test_data = data[num_train:]
test_labels = labels[num_train:]
len(train_data), len(test_data)
original
X = train_data.values
y = train_labels.values
objective = lambda w: log_likelihood(w, X, y)
gradient = lambda w: log_likelihood_gradient(w, X, y)
hessian = lambda w: log_likelihood_hessian(w, X, y)
init = np.zeros(X.shape[1])
cls = linear_model.LogisticRegression()
cls.fit(X, y)
weights = np.reshape(cls.coef_, cls.coef_.shape[1])
baseline = log_likelihood(weights, X, y)
ws_newton = newton_method(init, np.ones(10), gradient, hessian)
error_plot([log_likelihood(w, X, y) for w in ws_newton])
plt.plot(range(len(ws_newton)), [baseline]*len(ws_newton),
label='sklearn', **kwargs)
_ = plt.legend()
def line_search(x, d):
return gss(lambda step: log_likelihood(x+step*d, X, y), -1, 1, tol=0.1)
ws_bfgs = bfgs(init, gradient, hessian, line_search, 10)
error_plot([log_likelihood(w, X, y) for w in ws_bfgs])
plt.plot(range(len(ws_bfgs)), [baseline]*len(ws_bfgs),
label='sklearn', **kwargs)
_ = plt.legend()
timeit(lambda: linear_model.LogisticRegression().fit(X, y), number=1)
timeit(lambda: bfgs(init, gradient, hessian, line_search, 10), number=1)
timeit(lambda: gradient_descent(init, np.ones(200), gradient), number=1)
Xtest = test_data.values
ytest = test_labels.values
def accuracy(weights):
predictions = logistic_model(weights, Xtest)
    return np.mean((predictions > 0.5).astype(float) == ytest)
ws_gd = gradient_descent(init, np.ones(500), gradient)
accuracy(ws_gd[-1])
ws_bfgs = bfgs(init, gradient, hessian, line_search, 20)
accuracy(ws_bfgs[-1])
cls = linear_model.LogisticRegression(C=1, fit_intercept=False)
cls.fit(X, y)
cls.score(Xtest, ytest)
weights = np.reshape(cls.coef_, cls.coef_.shape[1])
accuracy(weights)
We see that our toy implementation of BFGS is actually fairly competitive. So is gradient descent, which reaches good test performance with a tuned constant step size.