School of Economics and Management
Beihang University
http://yanfei.site
## Load the handwritten-digit data. Each image is stored as one column of
## 256 pixel intensities (a 16x16 picture).

## The image matrix for training sample. 256x1707
azip <- read.table("azip.dat")

## The true digits given in the training sample. length = 1707
## unlist() flattens the data frame whether the labels are stored as a single
## row or as a single column; as.numeric() then yields a plain numeric vector.
dzip <- as.numeric(unlist(read.table("dzip.dat"), use.names = FALSE))

## The testing image matrix. 256x2007
testzip <- read.table("testzip.dat")

## The true digits for the testing sample. length = 2007
## (kept as the data frame returned by read.table)
dtest <- read.table("dtest.dat")

## Display the image
## The 256 pixels are reshaped to 16 columns and the column order is
## reversed before plotting.
i <- 120
image(matrix(azip[, i], ncol = 16)[, 16:1], col = gray(255:0 / 255))
The naive method is to compute the distance from each test image to the mean training image of each digit, and to classify the test image as the digit whose mean image is closest.
## The mean of training sample of a single digit
digits <- 0:9  # The possible digits in the US postal code

## One column per digit, one row per pixel. Columns are indexed by the
## position of the digit in `digits`, so column k holds the mean image of
## digits[k] (for 0:9 that is digit k - 1).
img.mean <- matrix(0, nrow(azip), length(digits))
for (k in seq_along(digits)) {
  idx <- (digits[k] == dzip)  # the location indicator for the kth digit
  imgi <- azip[, idx, drop = FALSE]
  imgi.mean <- rowMeans(imgi)  # pixel-wise average over that digit's images
  img.mean[, k] <- imgi.mean
}

## Plot the mean image
par(mfrow = c(2, 5))
for (k in seq_len(ncol(img.mean))) {
  image(matrix(img.mean[, k], ncol = 16)[, 16:1], col = gray(255:0 / 255))
}
## Sketch a distance function to compute the Euclidean
## distance between two matrices in row wise.
##
## X: numeric matrix (or data frame), one observation per row.
## Y: numeric matrix (or data frame) with the same number of columns as X.
## Returns an nrow(X) x nrow(Y) matrix whose (i, j) entry is the Euclidean
## distance between row i of X and row j of Y.
rdist <- function(X, Y) {
  ## tcrossprod() needs matrices, so coerce data frames up front.
  X <- as.matrix(X)
  Y <- as.matrix(Y)
  stopifnot(ncol(X) == ncol(Y))

  n.X <- nrow(X)
  n.Y <- nrow(Y)

  ## ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x'y, evaluated for all pairs at once.
  sum.X <- matrix(rowSums(X^2), n.X, n.Y)
  sum.Y <- matrix(rowSums(Y^2), n.X, n.Y, byrow = TRUE)
  dist0 <- sum.X + sum.Y - 2 * tcrossprod(X, Y)

  ## Rounding error can leave tiny negative values for (nearly) identical
  ## rows; clamp them so that sqrt() returns 0 instead of NaN.
  dist0[dist0 < 0] <- 0

  sqrt(dist0)
}

## For an unknown testing digit image, compare the distance to
## the means
test.sample <- 1:5

## Let's first plot those testing image
par(mfcol = c(ceiling(length(test.sample) / 5), 5))  # five columns
for (i in test.sample) {
  image(matrix(testzip[, i], ncol = 16)[, 16:1], col = gray(255:0 / 255))
}
## Calculate the distance from testing sample to the mean in
## the training sample.
## rdist() works row-wise, so both image sets are transposed to put one
## image in each row.
test.imgs <- t(testzip[, test.sample])
mean.imgs <- t(img.mean)
img.dist <- rdist(test.imgs, mean.imgs)

## The classification results by the naive method
## For every test image take the index of the closest mean image; the
## index is shifted by one to turn it into the digit label.
nearest <- apply(img.dist, 1, which.min)
nearest - 1
## V1 V2 V3 V4 V5
##  9  2  3  2  6
## Compute the singular value decomposition of the training images
## of a single digit
digit <- 9

## Extract the columns (images) belonging to that digit
img.mat <- azip[, digit == dzip, drop = FALSE]
img.matSVD <- svd(img.mat)

## Plot the first ten left singular vectors, each reshaped into an image.
par(mfrow = c(2, 5))
for (k in seq_len(10)) {
  basis.img <- matrix(img.matSVD$u[, k], 16)
  image(basis.img[, 16:1], col = gray(255:0 / 255),
        main = paste0("singular image ", k))
}
## Do the least square method with different basis and find
## the minimal residuals.

## Pick the testing digit (a column of testzip) and display it
test.idx <- 2
test.img <- matrix(testzip[, test.idx], nrow = 16)
image(test.img[, 16:1], col = gray(255:0 / 255), main = "Testing digit")
## For every digit, regress the test image on the leading left singular
## vectors of that digit's training images and record the norm of the
## residual. One row per digit, labelled 0..9.
resid.norm <- matrix(NA_real_, 10, 1, dimnames = list(0:9, "resid"))

## The test image does not change across digits, so extract it once.
test.img.vec <- testzip[, test.idx]

for (i in 0:9) {
  img.mat <- azip[, i == dzip, drop = FALSE]
  img.matSVD <- svd(img.mat)

  ## Number of singular vectors used as regressors.
  # basis.max <- ncol(img.matSVD$u)
  basis.max <- 4

  ## drop = FALSE keeps a matrix even when basis.max is 1.
  basis <- img.matSVD$u[, seq_len(basis.max), drop = FALSE]

  ## Least squares without intercept on the selected basis.
  fit <- lm(test.img.vec ~ 0 + basis)

  ## residuals() rather than `$resid`, which only resolved through partial
  ## matching of the `residuals` component.
  resid.norm[i + 1, ] <- norm(matrix(residuals(fit)), "F")
}
resid.norm
##       resid
## 0 11.815495
## 1 12.536173
## 2 11.388341
## 3 12.706473
## 4 12.141602
## 5 12.926655
## 6  9.455279
## 7 12.577303
## 8 12.061615
## 9 12.551339