Optimize \(\alpha\) in GD

In the following function, at each iteration we can use a one-dimensional optimization (a line search over the step size) to choose \(\alpha\).

gd.lm <- function(X, y, beta.init, alpha, tol = 1e-05, max.iter = 100) {
    # Gradient descent for linear regression.
    #
    # X         design matrix (n x p), y response vector, beta.init starting
    #           coefficients.
    # alpha     step size, or the string "auto" to pick the step size at every
    #           iteration by a 1-D line search (L-BFGS-B on [0, 1]).
    # tol       stop when the change in cost falls below this.
    # max.iter  iteration cap.
    #
    # Returns list(coef = <list of betas>, cost = <list of costs>, niter) on
    # convergence; prints a message and returns NULL otherwise.
    auto.alpha <- identical(alpha, "auto")
    # Line search: best step size from the current beta; falls back to 0.1
    # when the optimizer reports non-convergence.
    choose.alpha <- function(beta) {
        opt <- optim(0.1, function(a) {
            lm.cost(X, y, beta - a * lm.cost.grad(X, y, beta))
        }, method = "L-BFGS-B", lower = 0, upper = 1)
        if (opt$convergence == 0) opt$par else 0.1
    }
    beta.old <- beta.init
    J <- betas <- list()
    if (auto.alpha) {
        alpha <- choose.alpha(beta.old)
    }
    betas[[1]] <- beta.old
    J[[1]] <- lm.cost(X, y, beta.old)
    beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
    betas[[2]] <- beta.new
    J[[2]] <- lm.cost(X, y, beta.new)
    iter <- 0
    while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) && (iter < 
        max.iter)) {
        beta.old <- beta.new
        # BUG FIX: re-run the line search on every iteration when alpha =
        # "auto". The original overwrote alpha with a number before the loop,
        # so its inner `alpha == "auto"` test could never be TRUE and the
        # first step size was silently reused for all iterations.
        if (auto.alpha) {
            alpha <- choose.alpha(beta.old)
        }
        beta.new <- beta.old - alpha * lm.cost.grad(X, y, beta.old)
        iter <- iter + 1
        betas[[iter + 2]] <- beta.new
        J[[iter + 2]] <- lm.cost(X, y, beta.new)
    }
    if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
        cat("Could not converge. \n")
    } else {
        cat("Converged. \n")
        cat("Iterated", iter + 1, "times.", "\n")
        cat("Coef: ", beta.new, "\n")
        return(list(coef = betas, cost = J, niter = iter + 1))
    }
}
## Least-squares cost for linear regression: sum of squared residuals
## divided by 2n.
lm.cost <- function(X, y, beta) {
    resid <- X %*% beta - y
    sum(resid^2) / (2 * length(y))
}
## Gradient of lm.cost with respect to beta: (1/n) * X'(X beta - y).
lm.cost.grad <- function(X, y, beta) {
    crossprod(X, X %*% beta - y) / length(y)
}

Let us now generate some data and compare functions with and without optimized \(\alpha\).

## Generate some data
## Simulate a simple linear model y = beta0 + beta1 * x + noise so the
## true coefficients (1, 3) are known.
set.seed(20200401)
beta0 <- 1
beta1 <- 3
sigma <- 1
n <- 10000
x <- rnorm(n, 0, 1)
y <- beta0 + x * beta1 + rnorm(n, mean = 0, sd = sigma)
# Design matrix with an intercept column.
X <- cbind(1, x)
# GD with the line-searched ("auto") step size.
gd.auto <- gd.lm(X, y, beta.init = c(0, 0), alpha = "auto", tol = 1e-05, max.iter = 10000)
## Converged. 
## Iterated 3 times. 
## Coef:  0.995839 2.998762
# GD with a fixed step size of 0.1 for comparison.
gd1 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.1, tol = 1e-05, max.iter = 10000)
## Converged. 
## Iterated 55 times. 
## Coef:  0.9934615 2.99014
# Reshape the per-iteration coefficient list into long format for plotting.
# NOTE(review): relies on dplyr (%>%, mutate) and reshape2 (melt) being
# attached earlier in the document — confirm.
betas <- as.data.frame(t(do.call(cbind, gd1$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
# Trace plot of the coefficients for alpha = 0.1.
p1 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0, 
    3.5)) + ggtitle("alpha = 0.1") + xlim(c(0, 600))
# GD with a smaller fixed step size of 0.01 (slower convergence expected).
gd2 <- gd.lm(X, y, beta.init = c(0, 0), alpha = 0.01, tol = 1e-05, max.iter = 10000)
## Converged. 
## Iterated 455 times. 
## Coef:  0.9872457 2.969085
betas <- as.data.frame(t(do.call(cbind, gd2$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
# Trace plot of the coefficients for alpha = 0.01.
p2 <- ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0, 
    3.5)) + ggtitle("alpha = 0.01") + xlim(c(0, 600))
# Summary table of iteration counts for the three step-size choices.
niter <- data.frame(alpha = c("auto", 0.1, 0.01), niter = c(gd.auto$niter, gd1$niter, 
    gd2$niter))
# Side-by-side plots; `+` on ggplot objects assumes patchwork is loaded —
# TODO confirm.
p1 + p2
alpha niter
auto 3
0.1 55
0.01 455

We find that using optimized \(\alpha\) improves iteration efficiency significantly, while using smaller learning rates yields larger number of iterations.

SGD in R

Recall the SGD iteration. Repeat the following until convergence {

\[\beta := \beta - \alpha \nabla J_i(\beta),\]

where \(J_i\) is the cost evaluated at the \(i\)-th randomly sampled observation.

}

sgd.lm <- function(X, y, beta.init, alpha = 0.5, n.samples = 1, tol = 1e-05, max.iter = 100) {
    # Stochastic gradient descent for linear regression.
    #
    # At each step the gradient is estimated from a random mini-batch of
    # n.samples rows drawn with replacement; the convergence check uses the
    # full-data cost lm.cost.
    #
    # X         design matrix (n x p), y response vector, beta.init starting
    #           coefficients.
    # alpha     fixed step size.
    # n.samples mini-batch size.
    # tol       stop when the change in full-data cost falls below this.
    # max.iter  iteration cap.
    #
    # Returns list(coef = <list of betas>, cost = <list of costs>, niter) on
    # convergence; prints a message and returns NULL otherwise.
    n <- length(y)
    beta.old <- beta.init
    J <- betas <- list()
    sto.sample <- sample.int(n, n.samples, replace = TRUE)
    betas[[1]] <- beta.old
    J[[1]] <- lm.cost(X, y, beta.old)
    beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample], 
        beta.old)
    betas[[2]] <- beta.new
    J[[2]] <- lm.cost(X, y, beta.new)
    iter <- 0
    # NOTE(review): the stopping rule `iter + 2 < max.iter` differs from
    # gd.lm's `iter < max.iter`; kept as-is to preserve recorded output.
    while ((abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) && (iter + 
        2 < max.iter)) {
        beta.old <- beta.new
        # Draw a fresh mini-batch for every update.
        sto.sample <- sample.int(n, n.samples, replace = TRUE)
        beta.new <- beta.old - alpha * sgd.lm.cost.grad(X[sto.sample, ], y[sto.sample], 
            beta.old)
        iter <- iter + 1
        betas[[iter + 2]] <- beta.new
        J[[iter + 2]] <- lm.cost(X, y, beta.new)
    }
    if (abs(lm.cost(X, y, beta.new) - lm.cost(X, y, beta.old)) > tol) {
        cat("Could not converge. \n")
    } else {
        cat("Converged. \n")
        cat("Iterated", iter + 1, "times.", "\n")
        cat("Coef: ", beta.new, "\n")
        return(list(coef = betas, cost = J, niter = iter + 1))
    }
}

## Mini-batch cost for SGD. A single sampled row arrives as a plain vector
## (R drops the matrix dimension), so it is restored to a 1-row matrix
## before the matrix product. NOTE(review): sgd.lm above monitors
## convergence with lm.cost, not this function.
sgd.lm.cost <- function(X, y, beta) {
    if (!is.matrix(X)) {
        X <- matrix(X, nrow = 1)
    }
    sum((X %*% beta - y)^2) / (2 * length(y))
}
## Gradient of the mini-batch cost: X'(X beta - y) / n, where n is the
## batch size. A single sampled row arrives as a plain vector, so it is
## restored to a 1-row matrix first.
sgd.lm.cost.grad <- function(X, y, beta) {
    if (!is.matrix(X)) {
        X <- matrix(X, nrow = 1)
    }
    crossprod(X, X %*% beta - y) / length(y)
}

Let us test SGD on generated data.

# test on the generated data
# Run SGD on the simulated (X, y) from the GD section, starting far from
# the true coefficients (1, 3).
sgd.est <- sgd.lm(X, y, beta.init = c(-4, -5), alpha = 0.05, tol = 1e-05, max.iter = 10000)
## Converged. 
## Iterated 382 times. 
## Coef:  1.006272 2.995898
# Reshape the per-iteration coefficients to long format and plot the
# traces. NOTE(review): relies on dplyr (%>%, mutate), reshape2 (melt) and
# ggplot2 being attached — confirm.
betas <- as.data.frame(t(do.call(cbind, sgd.est$coef)))
colnames(betas) <- c("beta0", "beta1")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(0, 3.5))

We find that the trace plots of SGD are not as smooth as GD.

Now let us test on bodyfat data.

# Load the bodyfat data set; assumes Bodyfat.csv sits next to this
# document with columns Abdomen, Weight and bodyfat — TODO confirm schema.
bodyfat <- read.csv("./Bodyfat.csv")
# Note the importance of scaling
# Standardize the predictors before SGD, then prepend an intercept column.
bodyfat.X <- bodyfat %>% select(Abdomen, Weight) %>% scale() %>% as.matrix()
bodyfat.X <- cbind(1, bodyfat.X)
bodyfat.y <- bodyfat %>% select(bodyfat) %>% as.matrix()
# SGD with three coefficients: intercept, Abdomen, Weight.
sgd.bodyfat <- sgd.lm(bodyfat.X, bodyfat.y, beta.init = c(0, 0, 0), alpha = 0.05, 
    tol = 1e-05, max.iter = 10000)
## Converged. 
## Iterated 1362 times. 
## Coef:  19.78609 10.81002 -5.165585
# Long-format trace plot of the three coefficient paths.
betas <- as.data.frame(t(do.call(cbind, sgd.bodyfat$coef)))
colnames(betas) <- c("beta0", "beta1", "beta2")
betas <- betas %>% mutate(iter = 1:nrow(betas))
betas <- melt(betas, id.vars = "iter", variable.name = "coef")
ggplot(betas, aes(iter, value)) + geom_line(aes(colour = coef)) + ylim(c(-10, 25))