In the following function, when `alpha = "auto"`, each iteration uses `optim()` (L-BFGS-B, with \(\alpha\) constrained to \([0, 1]\)) to choose the step size that most reduces the cost after the update.
gd.lm <- function(X, y, beta.init, alpha, tol = 1e-05, max.iter = 100) {
  beta.old <- beta.init
  J <- betas <- list()
  ## Remember whether the step size should be re-optimized at every iteration
  auto.alpha <- identical(alpha, "auto")
  ## Choose the alpha in [0, 1] that minimizes the cost after one gradient step;
  ## fall back to 0.1 if the optimizer does not converge
  choose.alpha <- function(beta) {
    opt <- optim(0.1, function(a) {
      lm.cost(X, y, beta - a * lm.cost.grad(X, y, beta))
    }, method = "L-BFGS-B", lower = 0, upper = 1)
    if (opt$convergence == 0) opt$par else 0.1
  }
  if (auto.alpha) {
    alpha <- choose.alpha(beta.old)
  }
  ## First update
  betas[[1]] <- beta.old
  J[[1]] <- lm.cost(X, y, beta.old)
  beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
  betas[[2]] <- beta.new
  J[[2]] <- lm.cost(X, y, beta.new)
  iter <- 0
  ## Iterate until the change in cost falls below tol or max.iter is reached
  while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) &
         (iter < max.iter)) {
    beta.old <- beta.new
    if (auto.alpha) {
      alpha <- choose.alpha(beta.old)
    }
    beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
    iter <- iter + 1
    betas[[iter + 2]] <- beta.new
    J[[iter + 2]] <- lm.cost(X, y, beta.new)
  }
  if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
    cat("Could not converge. \n")
  } else {
    cat("Converged. \n")
    cat("Iterated", iter + 1, "times.", "\n")
    cat("Coef: ", beta.new, "\n")
    return(list(coef = betas, cost = J, niter = iter + 1))
  }
}
## Make the cost function
lm.cost <- function(X, y, beta) {
  n <- length(y)
  loss <- sum((X %*% beta - y)^2)/(2 * n)
  return(loss)
}
## Calculate the gradient
lm.cost.grad <- function(X, y, beta) {
  n <- length(y)
  (1/n) * (t(X) %*% (X %*% beta - y))
}
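As an aside, not part of the original functions: for this quadratic least-squares cost the line search actually has a closed form, so one could compute the optimal step size directly instead of calling `optim()`. A minimal sketch (the helper name `lm.exact.alpha` is ours):
## Exact line-search step size for the least-squares cost: the minimizer of
## J(beta - a * g) in a is (g'g) / (g' (X'X / n) g), where g is the gradient at beta
lm.exact.alpha <- function(X, y, beta) {
  n <- length(y)
  g <- lm.cost.grad(X, y, beta)
  as.numeric(crossprod(g) / (crossprod(X %*% g) / n))
}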
Let us now generate some data and compare gradient descent with and without the optimized \(\alpha\).
## Generate some data
set.seed(20200401)
beta0 <- 1
beta1 <- 3
sigma <- 1
n <- 10000
x <- rnorm(n, 0, 1)
y <- beta0 + x * beta1 + rnorm(n, mean = 0, sd = sigma)
X <- cbind(1, x)
gd.auto <- gd.lm(X, y, beta.init = c(0, 0), alpha = "auto", tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 3 times.
## Coef: 0.995839 2.998762
gd1 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.1, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 55 times.
## Coef: 0.9934615 2.99014
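As a quick sanity check (an addition to the original write-up), the gradient-descent estimates can be compared against the closed-form least-squares fit:
## Closed-form OLS coefficients; these should be very close to the estimates above
coef(lm(y ~ x))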
betas <- as.data.frame(t(do.call(cbind, gd1$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
p1 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) +
    ylim(c(0, 3.5)) + ggtitle("alpha = 0.1") + xlim(c(0, 600))
gd2 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.01, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 455 times.
## Coef: 0.9872457 2.969085
betas <- as.data.frame(t(do.call(cbind, gd2$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
p2 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) +
    ylim(c(0, 3.5)) + ggtitle("alpha = 0.01") + xlim(c(0, 600))
niter <- data.frame(alpha = c("auto", 0.1, 0.01),
    niter = c(gd.auto$niter, gd1$niter, gd2$niter))
p1 + p2
knitr::kable(niter) %>% kable_styling(bootstrap_options = "striped", full_width = F)
| alpha | niter |
|-------|-------|
| auto  | 3     |
| 0.1   | 55    |
| 0.01  | 455   |
We find that the optimized \(\alpha\) reduces the number of iterations dramatically, while smaller fixed learning rates require many more iterations to converge.
Recall the SGD iteration. Repeat the following until convergence:
\[\beta := \beta - \alpha \nabla J_i(\beta),\]
where \(J_i\) is the cost evaluated on a single randomly sampled observation \(i\) (or, more generally, on a small random mini-batch).
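Spelling this out for the linear model (a brief aside): for a single observation \((x_i, y_i)\) the per-observation cost and gradient are
\[J_i(\beta) = \tfrac{1}{2}\,(x_i^\top \beta - y_i)^2, \qquad \nabla J_i(\beta) = x_i\,(x_i^\top \beta - y_i),\]
which is exactly what `sgd.lm.cost.grad()` below computes, averaged over the sampled rows when `n.samples > 1`.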
sgd.lm <- function(X, y, beta.init, alpha = 0.5, n.samples = 1, tol = 1e-05, max.iter = 100) {
  n <- length(y)
  beta.old <- beta.init
  J <- betas <- list()
  ## Draw the first random observation(s) and take one stochastic step
  sto.sample <- sample(1:n, n.samples, replace = TRUE)
  betas[[1]] <- beta.old
  J[[1]] <- lm.cost(X, y, beta.old)
  beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample], beta.old)
  betas[[2]] <- beta.new
  J[[2]] <- lm.cost(X, y, beta.new)
  iter <- 0
  while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) &
         (iter + 2 < max.iter)) {
    beta.old <- beta.new
    ## Resample the observation(s) used for the stochastic gradient
    sto.sample <- sample(1:n, n.samples, replace = TRUE)
    beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample], beta.old)
    iter <- iter + 1
    betas[[iter + 2]] <- beta.new
    ## Track the full-sample cost to monitor convergence
    J[[iter + 2]] <- lm.cost(X, y, beta.new)
  }
  if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
    cat("Could not converge. \n")
  } else {
    cat("Converged. \n")
    cat("Iterated", iter + 1, "times.", "\n")
    cat("Coef: ", beta.new, "\n")
    return(list(coef = betas, cost = J, niter = iter + 1))
  }
}
## Make the cost function
sgd.lm.cost <- function(X, y, beta) {
  n <- length(y)
  ## With a single sampled row, X drops to a vector; coerce it back to a 1-row matrix
  if (!is.matrix(X)) {
    X <- matrix(X, nrow = 1)
  }
  loss <- sum((X %*% beta - y)^2)/(2 * n)
  return(loss)
}
## Calculate the gradient on the sampled observation(s)
sgd.lm.cost.grad <- function(X, y, beta) {
  n <- length(y)
  if (!is.matrix(X)) {
    X <- matrix(X, nrow = 1)
  }
  t(X) %*% (X %*% beta - y)/n
}
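As a small aside (not in the original notes), one can verify that the stochastic gradient is an unbiased estimate of the full gradient: averaging the single-observation gradients over all rows reproduces `lm.cost.grad()`. The variable `beta.test` below is just an arbitrary evaluation point.
## Average of per-observation gradients should equal the full-batch gradient
beta.test <- c(0, 0)
avg.grad <- Reduce(`+`, lapply(seq_along(y), function(i)
    sgd.lm.cost.grad(X[i, ], y[i], beta.test))) / length(y)
all.equal(as.numeric(avg.grad), as.numeric(lm.cost.grad(X, y, beta.test)))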
Let us test SGD on generated data.
# test on the generated data
sgd.est <- sgd.lm(X, y, beta.init = c(-4, -5), alpha = 0.05, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 382 times.
## Coef: 1.006272 2.995898
betas <- as.data.frame(t(do.call(cbind, sgd.est$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0, 3.5))
We find that the trace plots of SGD are much noisier than those of GD, since each update uses the gradient from a single randomly sampled observation.
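One way to smooth these traces (an optional extension, not part of the original analysis) is to average the stochastic gradient over a small mini-batch via the `n.samples` argument of `sgd.lm`, for example (the object name `sgd.mb` is arbitrary):
## Mini-batch SGD: each update averages the gradient over 32 sampled rows,
## which typically reduces the variance of the coefficient traces
sgd.mb <- sgd.lm(X, y, beta.init = c(-4, -5), alpha = 0.05, n.samples = 32,
    tol = 1e-05, max.iter = 10000)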
Now let us test on bodyfat data.
bodyfat <- read.csv("./Bodyfat.csv")
# Note the importance of scaling
bodyfat.X <- bodyfat %>% select(Abdomen, Weight) %>% scale() %>% as.matrix()
bodyfat.X <- cbind(1, bodyfat.X)
bodyfat.y <- bodyfat %>% select(bodyfat) %>% as.matrix()
sgd.bodyfat <- sgd.lm(bodyfat.X, bodyfat.y, beta.init = c(0, 0, 0), alpha = 0.05,
    tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 1362 times.
## Coef: 19.78609 10.81002 -5.165585
betas <- as.data.frame(t(do.call(cbind, sgd.bodyfat$coef)))
colnames(betas) <- c("beta0", "beta1", "beta2")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(-10, 25))
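As a closing sanity check (an addition, not part of the original write-up), the SGD estimates can be compared with the closed-form least-squares fit on the same scaled design matrix:
## OLS on the same design matrix (bodyfat.X already contains the intercept column,
## hence the -1); the coefficients should be close to the SGD estimates above
coef(lm(bodyfat.y ~ bodyfat.X - 1))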