## Optimize $$\alpha$$ in GD

In the following function, at each iteration we can use a one-dimensional optimization (a line search) to choose the step size $$\alpha$$.

gd.lm <- function(X, y, beta.init, alpha, tol = 1e-05, max.iter = 100) {
    # Gradient descent for linear least squares.
    #
    # X         : n x p design matrix (intercept column included by caller).
    # y         : response vector of length n.
    # beta.init : starting coefficient vector.
    # alpha     : fixed step size, or the string "auto" to line-search the
    #             step size along the negative gradient at every iteration.
    # tol       : convergence tolerance on the change in cost.
    # max.iter  : iteration cap.
    #
    # Returns list(coef = <list of iterates>, cost = <list of costs>,
    # niter = <iterations used>) on convergence; prints a message and
    # returns NULL (invisibly) otherwise.
    #
    # Relies on lm.cost() and lm.cost.grad() defined elsewhere in this file.

    # Record the mode ONCE. The original code overwrote `alpha` with a
    # number and then re-tested `alpha == "auto"` inside the loop, which
    # was always FALSE — so the step size was never re-optimized after the
    # first step. Testing a flag fixes that.
    auto.alpha <- identical(alpha, "auto")

    # Line search: pick the alpha in [0, 1] minimizing the cost after one
    # gradient step from `beta`; fall back to 0.1 if optim fails.
    pick.alpha <- function(beta) {
        opt <- optim(0.1, function(a) {
            lm.cost(X, y, beta - a * lm.cost.grad(X, y, beta))
        }, method = "L-BFGS-B", lower = 0, upper = 1)
        if (opt$convergence == 0) opt$par else 0.1
    }

    beta.old <- beta.init
    J <- betas <- list()
    if (auto.alpha) {
        alpha <- pick.alpha(beta.old)
    }
    betas[[1]] <- beta.old
    J[[1]] <- lm.cost(X, y, beta.old)
    beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
    betas[[2]] <- beta.new
    J[[2]] <- lm.cost(X, y, beta.new)
    iter <- 0
    # Scalar loop condition: use && rather than elementwise &.
    while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) &&
           (iter < max.iter)) {
        beta.old <- beta.new
        if (auto.alpha) {
            # Re-optimize the step size at the current iterate (bug fix:
            # this branch was unreachable in the original after the first
            # step, because `alpha` was no longer the string "auto").
            alpha <- pick.alpha(beta.old)
        }
        beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
        iter <- iter + 1
        betas[[iter + 2]] <- beta.new
        J[[iter + 2]] <- lm.cost(X, y, beta.new)
    }
    if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
        cat("Could not converge. \n")
    } else {
        cat("Converged. \n")
        cat("Iterated", iter + 1, "times.", "\n")
        cat("Coef: ", beta.new, "\n")
        return(list(coef = betas, cost = J, niter = iter + 1))
    }
}
## Make the cost function
## Least-squares cost for linear regression: ||X beta - y||^2 / (2 n).
lm.cost <- function(X, y, beta) {
    resid <- X %*% beta - y
    sum(resid^2) / (2 * length(y))
}
## Gradient of lm.cost with respect to beta: t(X) %*% (X beta - y) / n.
## crossprod(X, r) is equivalent to t(X) %*% r.
lm.cost.grad <- function(X, y, beta) {
    crossprod(X, X %*% beta - y) / length(y)
}

Let us now generate some data and compare functions with and without optimized $$\alpha$$.

## Generate some data
# Simulate y = beta0 + beta1 * x + eps with eps ~ N(0, sigma^2),
# then recover (beta0, beta1) by gradient descent.
set.seed(20200401)
beta0 <- 1
beta1 <- 3
sigma <- 1
n <- 10000
x <- rnorm(n, 0, 1)
y <- beta0 + x * beta1 + rnorm(n, mean = 0, sd = sigma)
# Design matrix with an explicit intercept column.
X <- cbind(1, x)
# Gradient descent with a line-searched ("auto") step size.
gd.auto <- gd.lm(X, y, beta.init = c(0, 0), alpha = "auto", tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 3 times.
## Coef:  0.995839 2.998762
# Gradient descent with a fixed step size of 0.1 for comparison.
gd1 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.1, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 55 times.
## Coef:  0.9934615 2.99014
# Reshape the alpha = 0.1 coefficient trace to long format and plot it.
# (This chunk was mangled onto a single line in the source; the statements
# below are the same code, restored to one statement per line.)
betas <- as.data.frame(t(do.call(cbind, gd1$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
p1 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0,
    3.5)) + ggtitle("alpha = 0.1") + xlim(c(0, 600))
# Repeat with a smaller fixed step size, alpha = 0.01.
gd2 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.01, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 455 times.
## Coef: 0.9872457 2.969085
betas <- as.data.frame(t(do.call(cbind, gd2$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
p2 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0,
    3.5)) + ggtitle("alpha = 0.01") + xlim(c(0, 600))
# Compare iteration counts across step-size choices.
niter <- data.frame(alpha = c("auto", 0.1, 0.01), niter = c(gd.auto$niter, gd1$niter,
    gd2$niter))
p1 + p2
knitr::kable(niter) %>% kable_styling(bootstrap_options = "striped", full_width = F)

| alpha | niter |
|:------|------:|
| auto  |     3 |
| 0.1   |    55 |
| 0.01  |   455 |

We find that using an optimized $$\alpha$$ improves iteration efficiency significantly, while using smaller learning rates yields larger numbers of iterations.

## SGD in R

Recall the SGD iteration. Repeat the following until convergence:

$$\beta := \beta - \alpha \, \nabla J_i(\beta),$$

where $$\nabla J_i$$ is the gradient of the cost evaluated on a randomly sampled observation $$i$$.

sgd.lm <- function(X, y, beta.init, alpha = 0.5, n.samples = 1, tol = 1e-05,
    max.iter = 100) {
    # Stochastic gradient descent for linear least squares.
    #
    # X         : n x p design matrix (intercept column included by caller).
    # y         : response vector of length n.
    # beta.init : starting coefficient vector.
    # alpha     : fixed step size.
    # n.samples : minibatch size drawn (with replacement) each step.
    # tol       : convergence tolerance on the change in full-data cost.
    # max.iter  : iteration cap.
    #
    # The gradient step uses only the sampled rows (sgd.lm.cost.grad), while
    # the tracked cost J uses the full data via lm.cost().
    n <- length(y)
    beta.old <- beta.init
    J <- betas <- list()
    sto.sample <- sample(1:n, n.samples, replace = TRUE)
    betas[[1]] <- beta.old
    J[[1]] <- lm.cost(X, y, beta.old)
    beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample],
        beta.old)
    betas[[2]] <- beta.new
    J[[2]] <- lm.cost(X, y, beta.new)
    iter <- 0
    n.best <- 0
    while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) & (iter +
        2 < max.iter)) {
        beta.old <- beta.new
        # Draw a fresh minibatch for each stochastic gradient step.
        sto.sample <- sample(1:n, n.samples, replace = TRUE)
        beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample],
            beta.old)
        iter <- iter + 1
        betas[[iter + 2]] <- beta.new
        J[[iter + 2]] <- lm.cost(X, y, beta.new)
    }
    if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
        cat("Could not converge. \n")
    } else {
        cat("Converged. \n")
        cat("Iterated", iter + 1, "times.", "\n")
        cat("Coef: ", beta.new, "\n")
        return(list(coef = betas, cost = J, niter = iter + 1))
    }
}
## Make the cost function
sgd.lm.cost <- function(X, y, beta) {
    n <- length(y)
    if (!is.matrix(X)) {
        # A single sampled row arrives as a plain vector; restore matrix shape.
        X <- matrix(X, nrow = 1)
    }
    loss <- sum((X %*% beta - y)^2)/(2 * n)
    return(loss)
}
## Calculate the gradient
sgd.lm.cost.grad <- function(X, y, beta) {
    n <- length(y)
    if (!is.matrix(X)) {
        # Same vector-to-matrix guard as sgd.lm.cost.
        X <- matrix(X, nrow = 1)
    }
    t(X) %*% (X %*% beta - y)/n
}

Let us test SGD on generated data.

# test on the generated data
sgd.est <- sgd.lm(X, y, beta.init = c(-4, -5), alpha = 0.05, tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 382 times.
## Coef: 1.006272 2.995898
betas <- as.data.frame(t(do.call(cbind, sgd.est$coef)))
# Reshape the SGD coefficient trace to long format and plot both
# coefficients against the iteration index.
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0, 3.5))

We find that the trace plots of SGD are not as smooth as those of GD.

Now let us test on bodyfat data.

# Fit the bodyfat data with SGD.
bodyfat <- read.csv("./Bodyfat.csv")
# Note the importance of scaling: predictors on very different scales make
# a single step size ill-suited for every coordinate.
bodyfat.X <- bodyfat %>% select(Abdomen, Weight) %>% scale() %>% as.matrix()
bodyfat.X <- cbind(1, bodyfat.X)  # prepend the intercept column
bodyfat.y <- bodyfat %>% select(bodyfat) %>% as.matrix()
sgd.bodyfat <- sgd.lm(bodyfat.X, bodyfat.y, beta.init = c(0, 0, 0), alpha = 0.05,
    tol = 1e-05, max.iter = 10000)
## Converged.
## Iterated 1362 times.
## Coef:  19.78609 10.81002 -5.165585
# Fix: the source had an escaped dollar sign (`sgd.bodyfat\$coef`), which is
# a syntax error in R.
betas <- as.data.frame(t(do.call(cbind, sgd.bodyfat$coef)))
colnames(betas) <- c("beta0", "beta1", "beta2")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(-10, 25))